diff --git a/examples/benchmarks/ALSTM/README.md b/examples/benchmarks/ALSTM/README.md
new file mode 100644
index 000000000..cd9dd3493
--- /dev/null
+++ b/examples/benchmarks/ALSTM/README.md
@@ -0,0 +1,10 @@
+# ALSTM
+
+- ALSTM contains a temporal attentive aggregation layer based on normal LSTM.
+
+- The code used in Qlib is a pyTorch implementation of Code: https://github.com/fulifeng/Adv-ALSTM
+
+- Paper: A dual-stage attention-based recurrent neural network for time series prediction.
+
+  https://www.ijcai.org/Proceedings/2017/0366.pdf
+
diff --git a/examples/benchmarks/TFT/README.md b/examples/benchmarks/TFT/README.md
index a64ca0129..5a6a9f153 100644
--- a/examples/benchmarks/TFT/README.md
+++ b/examples/benchmarks/TFT/README.md
@@ -5,8 +5,10 @@
 **GitHub**: https://github.com/google-research/google-research/tree/master/tft
 
 ## Run the Workflow
-Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. Please be **aware** that this script can only support Python 3.5 - 3.8.
+Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. 
 
 ### Notes
-1. The model must run in GPU, or an error will be raised.
-2. New datasets should be registered in ``data_formatters``, for detail please visit the source.
+1. Please be **aware** that this script can only support `Python 3.5 - 3.8`.
+2. If the CUDA version on your machine is not 10.0, please remember to run the following commands `conda install anaconda cudatoolkit=10.0` and `conda install cudnn` on your machine.
+3. The model must run in GPU, or an error will be raised.
+4. New datasets should be registered in ``data_formatters``, for detail please visit the source.
diff --git a/examples/run_all_model.py b/examples/run_all_model.py
index b448a1857..2f6c4299e 100644
--- a/examples/run_all_model.py
+++ b/examples/run_all_model.py
@@ -10,6 +10,7 @@ import shutil
 import tempfile
 import statistics
 from pathlib import Path
+from operator import xor
 from subprocess import Popen, PIPE
 from threading import Thread
 from pprint import pprint
@@ -174,11 +175,22 @@ def cal_mean_std(results) -> dict:
 
 
 # function to get all the folders benchmark folder
-def get_all_folders() -> dict:
+def get_all_folders(models, exclude) -> dict:
     folders = dict()
+    if isinstance(models, str):
+        model_list = models.split(",")
+        models = [m.lower().strip("[ ]") for m in model_list]
+    elif isinstance(models, list):
+        models = [m.lower() for m in models]
+    elif models is None:
+        models = [f.name.lower() for f in os.scandir("benchmarks")]
+    else:
+        raise ValueError("Input models type is not supported. Please provide str or list without space.")
     for f in os.scandir("benchmarks"):
-        path = Path("benchmarks") / f.name
-        folders[f.name] = str(path.resolve())
+        add = xor(bool(f.name.lower() in models), bool(exclude))
+        if add:
+            path = Path("benchmarks") / f.name
+            folders[f.name] = str(path.resolve())
     return folders
 
 
@@ -225,13 +237,44 @@ def gen_and_save_md_table(metrics):
 
 
 # function to run the all the models
-def run(times=1):
+def run(times=1, models=None, exclude=False):
     """
     Please be aware that this function can only work under Linux. MacOS and Windows will be supported in the future.
     Any PR to enhance this method is highly welcomed.
+
+    Parameters:
+    -----------
+    times : int
+        determines how many times the model should be running.
+    models : str or list
+        determines the specific model or list of models to run or exclude.
+    exclude : boolean
+        determines whether the model being used is excluded or included.
+
+    Usage:
+    -------
+    Here are some use cases of the function in the bash:
+
+    .. code-block:: bash
+
+        # Case 1 - run all models multiple times
+        python run_all_model.py 3
+
+        # Case 2 - run specific models multiple times
+        python run_all_model.py 3 dnn
+
+        # Case 3 - run other models except those are given as arguments for multiple times
+        python run_all_model.py 3 [dnn,tft,lstm] True
+
+        # Case 4 - run specific models for one time
+        python run_all_model.py --models=[dnn,lightgbm]
+
+        # Case 5 - run other models except those are given as aruments for one time
+        python run_all_model.py --models=[dnn,tft,sfm] --exclude=True
+
     """
     # get all folders
-    folders = get_all_folders()
+    folders = get_all_folders(models, exclude)
     # set up
     compatible = True
     if sys.version_info < (3, 3):
diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py
index 3bb4edf08..b5bad31ec 100644
--- a/examples/workflow_by_code_gats.py
+++ b/examples/workflow_by_code_gats.py
@@ -7,19 +7,16 @@ from pathlib import Path
 import qlib
 import pandas as pd
 from qlib.config import REG_CN
-from qlib.contrib.model.pytorch_gats import GAT
-from qlib.contrib.data.handler import ALPHA360_Denoise
+
 from qlib.contrib.strategy.strategy import TopkDropoutStrategy
 from qlib.contrib.evaluate import (
     backtest as normal_backtest,
     risk_analysis,
 )
 from qlib.utils import exists_qlib_data
-
-# from qlib.model.learner import train_model
 from qlib.utils import init_instance_by_config
 
-import pickle
+
 
 if __name__ == "__main__":
 
diff --git a/examples/workflow_by_code_sfm.py b/examples/workflow_by_code_sfm.py
index ccc2d412c..e9a72883a 100644
--- a/examples/workflow_by_code_sfm.py
+++ b/examples/workflow_by_code_sfm.py
@@ -71,21 +71,22 @@ if __name__ == "__main__":
             "module_path": "qlib.contrib.model.pytorch_sfm",
             "kwargs": {
                 "d_feat": 6,
-                "hidden_size": 32,
-                "output_dim": 16,
+                "hidden_size": 64,
+                "output_dim": 32,
                 "freq_dim": 25,
                 "dropout_W": 0.5,
                 "dropout_U": 0.5,
-                "n_epochs": 200,
+                "n_epochs": 15,
                 "lr": 1e-3,
-                "batch_size": 200,
+                "metric": "", 
+                "batch_size": 1600,
                 "early_stop": 20,
                 "eval_steps": 5,
                 "loss": "mse",
                 "lr_decay": 0.96,
                 "lr_decay_steps": 100,
                 "optimizer": "adam",
-                "GPU": 1,
+                "GPU": 3,
                 "seed": 710,
             },
         },
diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py
index 8cce92907..07ef2267a 100644
--- a/qlib/contrib/data/handler.py
+++ b/qlib/contrib/data/handler.py
@@ -10,6 +10,28 @@ from inspect import getfullargspec
 import copy
 
 
+def check_transform_proc(proc_l, fit_start_time, fit_end_time):
+    new_l = []
+    for p in proc_l:
+        if not isinstance(p, Processor):
+            klass, pkwargs = get_cls_kwargs(p, processor_module)
+            args = getfullargspec(klass).args
+            if "fit_start_time" in args and "fit_end_time" in args:
+                assert (
+                    fit_start_time is not None and fit_end_time is not None
+                ), "Make sure `fit_start_time` and `fit_end_time` are not None."
+                pkwargs.update(
+                    {
+                        "fit_start_time": fit_start_time,
+                        "fit_end_time": fit_end_time,
+                    }
+                )
+            new_l.append({"class": klass.__name__, "kwargs": pkwargs})
+        else:
+            new_l.append(p)
+    return new_l
+
+
 class ALPHA360_Denoise(DataHandlerLP):
     def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
         data_loader = {
@@ -83,8 +105,31 @@ class ALPHA360_Denoise(DataHandlerLP):
         return fields, names
 
 
+_DEFAULT_LEARN_PROCESSORS = [
+    {"class": "DropnaLabel"},
+    {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
+]
+_DEFAULT_INFER_PROCESSORS = [
+    {"class": "ProcessInf", "kwargs": {}},
+    {"class": "ZScoreNorm", "kwargs": {}},
+    {"class": "Fillna", "kwargs": {}},
+]
+
+
 class ALPHA360(DataHandlerLP):
-    def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
+    def __init__(
+        self,
+        instruments="csi500",
+        start_time=None,
+        end_time=None,
+        infer_processors=_DEFAULT_INFER_PROCESSORS,
+        learn_processors=_DEFAULT_LEARN_PROCESSORS,
+        fit_start_time=None,
+        fit_end_time=None,
+    ):
+        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
+        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
+
         data_loader = {
             "class": "QlibDataLoader",
             "kwargs": {
@@ -95,16 +140,6 @@ class ALPHA360(DataHandlerLP):
             },
         }
 
-        learn_processors = [
-            {"class": "DropnaLabel", "kwargs": {"fields_group": "label"}},
-            {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
-        ]
-        infer_processors = [
-            {"class": "ProcessInf", "kwargs": {}},
-            {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
-            {"class": "Fillna", "kwargs": {}},
-        ]
-
         super().__init__(
             instruments,
             start_time,
@@ -168,33 +203,12 @@ class Alpha158(DataHandlerLP):
         start_time=None,
         end_time=None,
         infer_processors=[],
-        learn_processors=["DropnaLabel", {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}],
+        learn_processors=_DEFAULT_LEARN_PROCESSORS,
         fit_start_time=None,
         fit_end_time=None,
     ):
-        def check_transform_proc(proc_l):
-            new_l = []
-            for p in proc_l:
-                if not isinstance(p, Processor):
-                    klass, pkwargs = get_cls_kwargs(p, processor_module)
-                    args = getfullargspec(klass).args
-                    if "fit_start_time" in args and "fit_end_time" in args:
-                        assert (
-                            fit_start_time is not None and fit_end_time is not None
-                        ), "Make sure `fit_start_time` and `fit_end_time` are not None."
-                        pkwargs.update(
-                            {
-                                "fit_start_time": fit_start_time,
-                                "fit_end_time": fit_end_time,
-                            }
-                        )
-                    new_l.append({"class": klass.__name__, "kwargs": pkwargs})
-                else:
-                    new_l.append(p)
-            return new_l
-
-        infer_processors = check_transform_proc(infer_processors)
-        learn_processors = check_transform_proc(learn_processors)
+        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
+        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
 
         data_loader = {
             "class": "QlibDataLoader",
diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py
index bba006c35..eb97fc75b 100644
--- a/qlib/contrib/model/catboost_model.py
+++ b/qlib/contrib/model/catboost_model.py
@@ -34,14 +34,14 @@ class CatBoostModel(Model):
     def fit(
         self,
         dataset: DatasetH,
-        num_boost_round=1000,
-        early_stopping_rounds=50,
-        verbose_eval=20,
-        evals_result=dict(),
+        num_boost_round = 1000,
+        early_stopping_rounds = 50,
+        verbose_eval = 20,
+        evals_result = dict(),
         **kwargs
     ):
         df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid"], col_set = ["feature", "label"], data_key = DataHandlerLP.DK_L
         )
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
@@ -52,8 +52,8 @@ class CatBoostModel(Model):
         else:
             raise ValueError("CatBoost doesn't support multi-label training")
 
-        train_pool = Pool(data=x_train, label=y_train_1d)
-        valid_pool = Pool(data=x_valid, label=y_valid_1d)
+        train_pool = Pool(data = x_train, label = y_train_1d)
+        valid_pool = Pool(data = x_valid, label = y_valid_1d)
 
         # Initialize the catboost model
         self._params["iterations"] = num_boost_round
@@ -63,7 +63,7 @@ class CatBoostModel(Model):
         self.model = CatBoost(self._params, **kwargs)
 
         # train the model
-        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)
+        self.model.fit(train_pool, eval_set = valid_pool, use_best_model = True, **kwargs)
 
         evals_result = self.model.get_evals_result()
         evals_result["train"] = list(evals_result["learn"].values())[0]
@@ -72,8 +72,8 @@ class CatBoostModel(Model):
     def predict(self, dataset):
         if self.model is None:
             raise ValueError("model is not fitted yet!")
-        x_test = dataset.prepare("test", col_set="feature")
-        return pd.Series(self.model.predict(x_test.values), index=x_test.index)
+        x_test = dataset.prepare("test", col_set = "feature")
+        return pd.Series(self.model.predict(x_test.values), index = x_test.index)
 
 
 if __name__ == "__main__":
diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py
index 07af4eda4..7cdfb571a 100755
--- a/qlib/contrib/model/pytorch_gats.py
+++ b/qlib/contrib/model/pytorch_gats.py
@@ -28,14 +28,12 @@ class GAT(Model):
 
     Parameters
     ----------
-    input_dim : int
-        input dimension
-    output_dim : int
-        output dimension
-    layers : tuple
-        layer sizes
     lr : float
         learning rate
+    d_feat : int
+        input dimensions for each time step
+    metric : str
+        the evaluate metric used in early stop
     optimizer : str
         optimizer name
     GPU : str
@@ -119,11 +117,7 @@ class GAT(Model):
                 seed,
             )
         )
-
-        if loss not in {"mse", "binary"}:
-            raise NotImplementedError("loss {} is not supported!".format(loss))
-        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
-
+        
         self.GAT_model = GATModel(
             d_feat=self.d_feat,
             hidden_size=self.hidden_size,
@@ -213,7 +207,6 @@ class GAT(Model):
         losses = []
 
         indices = np.arange(len(x_values))
-        np.random.shuffle(indices)
 
         for i in range(len(indices))[:: self.batch_size]:
 
@@ -377,7 +370,6 @@ class GATModel(nn.Module):
         self.fc_out = nn.Linear(hidden_size, 1)
         self.leaky_relu = nn.LeakyReLU()
         self.softmax = nn.Softmax(dim=1)
-
         self.d_feat = d_feat
 
     def cal_convariance(self, x, y):  # the 2nd dimension of x and y are the same
@@ -396,12 +388,7 @@ class GATModel(nn.Module):
         out, _ = self.rnn(x)
         hidden = out[:, -1, :]
         hidden = self.bn1(hidden)
-
         gamma = self.cal_convariance(hidden, hidden)
-        # gamma = hidden.mm(torch.t(hidden))
-        # gamma = self.leaky_relu(gamma)
-        # gamma = self.softmax(gamma)
-        # gamma = gamma * (torch.ones(x.shape[0], x.shape[0]).to(device) - torch.diag(torch.ones(x.shape[0])).to(device))
         output = gamma.mm(hidden)
         output = self.fc(output)
         output = self.bn2(output)
diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py
index 4cc7f9852..2dd8464e2 100755
--- a/qlib/contrib/model/pytorch_gru.py
+++ b/qlib/contrib/model/pytorch_gru.py
@@ -28,14 +28,10 @@ class GRU(Model):
 
     Parameters
     ----------
-    input_dim : int
-        input dimension
-    output_dim : int
-        output dimension
-    layers : tuple
-        layer sizes
-    lr : float
-        learning rate
+    d_feat : int
+        input dimension for each time step
+    metric: str
+        the evaluate metric used in early stop
     optimizer : str
         optimizer name
     GPU : str
@@ -112,10 +108,6 @@ class GRU(Model):
             )
         )
 
-        if loss not in {"mse", "binary"}:
-            raise NotImplementedError("loss {} is not supported!".format(loss))
-        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
-
         self.gru_model = GRUModel(
             d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
         )
@@ -251,7 +243,6 @@ class GRU(Model):
         # train
         self.logger.info("training...")
         self._fitted = True
-        # return
 
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py
index 8b8454380..be43d3698 100755
--- a/qlib/contrib/model/pytorch_lstm.py
+++ b/qlib/contrib/model/pytorch_lstm.py
@@ -28,14 +28,10 @@ class LSTM(Model):
 
     Parameters
     ----------
-    input_dim : int
-        input dimension
-    output_dim : int
-        output dimension
-    layers : tuple
-        layer sizes
-    lr : float
-        learning rate
+    d_feat : int
+        input dimension for each time step
+    metric: str
+        the evaluate metric used in early stop
     optimizer : str
         optimizer name
     GPU : str
@@ -112,10 +108,6 @@ class LSTM(Model):
             )
         )
 
-        if loss not in {"mse", "binary"}:
-            raise NotImplementedError("loss {} is not supported!".format(loss))
-        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
-
         self.lstm_model = LSTMModel(
             d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
         )
@@ -251,7 +243,6 @@ class LSTM(Model):
         # train
         self.logger.info("training...")
         self._fitted = True
-        # return
 
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py
index d8baa9cb2..4ec61430e 100644
--- a/qlib/contrib/model/pytorch_sfm.py
+++ b/qlib/contrib/model/pytorch_sfm.py
@@ -31,7 +31,6 @@ from ...model.base import Model
 from ...data.dataset import DatasetH
 from ...data.dataset.handler import DataHandlerLP
 
-
 class SFM_Model(nn.Module):
     def __init__(self, d_feat=6, output_dim=1, freq_dim=10, hidden_size=64, dropout_W=0.0, dropout_U=0.0, device="cpu"):
         super().__init__()
@@ -76,13 +75,13 @@ class SFM_Model(nn.Module):
         self.states = []
 
     def forward(self, input):
-        input = input.reshape(len(input), self.input_dim, -1)  # [N, F, T]
-        input = input.permute(0, 2, 1)  # [N, T, F]
+        input = input.reshape(len(input), self.input_dim, -1) # [N, F, T]
+        input = input.permute(0, 2, 1) # [N, T, F]
         time_step = input.shape[1]
-
+        
         for ts in range(time_step):
-            x = input[:, ts, :]
-            if len(self.states) == 0:  # hasn't initialized yet
+            x = input[:, ts,:]
+            if len(self.states)==0: #hasn't initialized yet
                 self.init_states(x)
             self.get_constants(x)
             p_tm1 = self.states[0]
@@ -99,65 +98,64 @@ class SFM_Model(nn.Module):
             x_fre = torch.matmul(x * B_W[0], self.W_fre) + self.b_fre
             x_c = torch.matmul(x * B_W[0], self.W_c) + self.b_c
             x_o = torch.matmul(x * B_W[0], self.W_o) + self.b_o
-
-            i = self.inner_activation(
-                x_i + torch.matmul(h_tm1 * B_U[0], self.U_i)
-            )  # not sure whether I am doing in the right unsquuze
+            
+            i = self.inner_activation(x_i + torch.matmul(h_tm1 * B_U[0], self.U_i)) # not sure whether I am doing in the right unsquuze
+            
 
             ste = self.inner_activation(x_ste + torch.matmul(h_tm1 * B_U[0], self.U_ste))
             fre = self.inner_activation(x_fre + torch.matmul(h_tm1 * B_U[0], self.U_fre))
 
             ste = torch.reshape(ste, (-1, self.hidden_dim, 1))
             fre = torch.reshape(fre, (-1, 1, self.freq_dim))
-
+            
             f = ste * fre
-
+            
             c = i * self.activation(x_c + torch.matmul(h_tm1 * B_U[0], self.U_c))
 
             time = time_tm1 + 1
 
             omega = torch.tensor(2 * np.pi) * time * frequency
 
-            re = torch.cos(omega)
+            re = torch.cos(omega) 
             im = torch.sin(omega)
-
+            
             c = torch.reshape(c, (-1, self.hidden_dim, 1))
 
             S_re = f * S_re_tm1 + c * re
             S_im = f * S_im_tm1 + c * im
-
+            
             A = torch.square(S_re) + torch.square(S_im)
 
             A = torch.reshape(A, (-1, self.freq_dim)).float()
             A_a = torch.matmul(A * B_U[0], self.U_a)
             A_a = torch.reshape(A_a, (-1, self.hidden_dim))
             a = self.activation(A_a + self.b_a)
-
+            
             o = self.inner_activation(x_o + torch.matmul(h_tm1 * B_U[0], self.U_o))
 
             h = o * a
             p = torch.matmul(h, self.W_p) + self.b_p
 
             self.states = [p, h, S_re, S_im, time, None, None, None]
-        self.states = []
+        self.states = []    
         return self.fc_out(p).squeeze()
 
     def init_states(self, x):
         reducer_f = torch.zeros((self.hidden_dim, self.freq_dim)).to(self.device)
         reducer_p = torch.zeros((self.hidden_dim, self.output_dim)).to(self.device)
-
+        
         init_state_h = torch.zeros(self.hidden_dim).to(self.device)
         init_state_p = torch.matmul(init_state_h, reducer_p)
-
+        
         init_state = torch.zeros_like(init_state_h).to(self.device)
         init_freq = torch.matmul(init_state_h, reducer_f)
 
         init_state = torch.reshape(init_state, (-1, self.hidden_dim, 1))
         init_freq = torch.reshape(init_freq, (-1, 1, self.freq_dim))
-
+        
         init_state_S_re = init_state * init_freq
         init_state_S_im = init_state * init_freq
-
+        
         init_state_time = torch.tensor(0).to(self.device)
 
         self.states = [init_state_p, init_state_h, init_state_S_re, init_state_S_im, init_state_time, None, None, None]
@@ -203,6 +201,7 @@ class SFM(Model):
         dropout_U=0.0,
         n_epochs=200,
         lr=0.001,
+        metric = "",
         batch_size=2000,
         early_stop=20,
         eval_steps=5,
@@ -227,14 +226,15 @@ class SFM(Model):
         self.dropout_U = dropout_U
         self.n_epochs = n_epochs
         self.lr = lr
+        self.metric = metric
         self.batch_size = batch_size
         self.early_stop = early_stop
         self.eval_steps = eval_steps
         self.lr_decay = lr_decay
         self.lr_decay_steps = lr_decay_steps
         self.optimizer = optimizer.lower()
-        self.loss_type = loss
-        self.device = "cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu"
+        self.loss = loss
+        self.device = "cuda:%d"%(GPU) if torch.cuda.is_available() else "cpu"
         self.use_gpu = torch.cuda.is_available()
         self.seed = seed
 
@@ -243,11 +243,12 @@ class SFM(Model):
             "\nd_feat : {}"
             "\nhidden_size : {}"
             "\noutput_size : {}"
-            "\nfrequency_dimension : {}"
+            "\nfrequency_dimension : {}" 
             "\ndropout_W: {}"
             "\ndropout_U: {}"
             "\nn_epochs : {}"
             "\nlr : {}"
+            "\nmetric : {}"
             "\nbatch_size : {}"
             "\nearly_stop : {}"
             "\neval_steps : {}"
@@ -266,6 +267,7 @@ class SFM(Model):
                 dropout_U,
                 n_epochs,
                 lr,
+                metric,
                 batch_size,
                 early_stop,
                 eval_steps,
@@ -284,14 +286,14 @@ class SFM(Model):
         self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
 
         self.sfm_model = SFM_Model(
-            d_feat=self.d_feat,
+            d_feat=self.d_feat, 
             output_dim=self.output_dim,
-            hidden_size=self.hidden_size,
-            freq_dim=self.freq_dim,
-            dropout_W=self.dropout_W,
-            dropout_U=self.dropout_U,
-            device=self.device,
-        )
+            hidden_size=self.hidden_size, 
+            freq_dim=self.freq_dim, 
+            dropout_W=self.dropout_W, 
+            dropout_U=self.dropout_U, 
+            device=self.device
+            )
         if optimizer.lower() == "adam":
             self.train_optimizer = optim.Adam(self.sfm_model.parameters(), lr=self.lr)
         elif optimizer.lower() == "gd":
@@ -299,24 +301,73 @@ class SFM(Model):
         else:
             raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
 
-        # Reduce learning rate when loss has stopped decrease
-        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
-            self.train_optimizer,
-            mode="min",
-            factor=0.5,
-            patience=10,
-            verbose=True,
-            threshold=0.0001,
-            threshold_mode="rel",
-            cooldown=0,
-            min_lr=0.00001,
-            eps=1e-08,
-        )
-
         self._fitted = False
         self.sfm_model.to(self.device)
 
-    def fit(self, dataset: DatasetH, evals_result=dict(), verbose=True, save_path=None, **kwargs):
+    def test_epoch(self, data_x, data_y):
+
+        # prepare training data
+        x_values = data_x.values
+        y_values = np.squeeze(data_y.values)
+
+        self.sfm_model.eval()
+
+        scores = []
+        losses = []
+
+        indices = np.arange(len(x_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            pred = self.sfm_model(feature)
+            loss = self.loss_fn(pred, label)
+            losses.append(loss.item())
+
+            score = self.metric_fn(pred, label)
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def train_epoch(self, x_train, y_train):
+
+        x_train_values = x_train.values
+        y_train_values = np.squeeze(y_train.values) * 100
+
+        self.sfm_model.train()
+
+        indices = np.arange(len(x_train_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            pred = self.sfm_model(feature)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.sfm_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        evals_result=dict(),
+        verbose=True,
+        save_path=None,
+    ):
 
         df_train, df_valid = dataset.prepare(
             ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
@@ -324,10 +375,10 @@ class SFM(Model):
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
 
-        save_path = create_save_path(save_path)
         stop_steps = 0
         train_loss = 0
-        best_loss = np.inf
+        best_score = -np.inf
+        best_epoch = 0
         evals_result["train"] = []
         evals_result["valid"] = []
 
@@ -335,90 +386,56 @@ class SFM(Model):
         self.logger.info("training...")
         self._fitted = True
 
-        # prepare training data
-        x_train_values = torch.from_numpy(x_train.values).float()
-        y_train_values = torch.from_numpy(np.squeeze(y_train.values)).float()
-        train_num = y_train_values.shape[0]
-
-        # prepare validation data
-        x_val_auto = torch.from_numpy(x_valid.values).float()
-        y_val_auto = torch.from_numpy(np.squeeze(y_valid.values)).float()
-
-        x_val_auto = x_val_auto.to(self.device)
-        y_val_auto = y_val_auto.to(self.device)
-
         for step in range(self.n_epochs):
-            if stop_steps >= self.early_stop:
-                if verbose:
-                    self.logger.info("\tearly stop")
-                break
-            loss = AverageMeter()
-            self.sfm_model.train()
-            self.train_optimizer.zero_grad()
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(x_train, y_train)
+            self.logger.info("evaluating...")
+            train_loss, train_score = self.test_epoch(x_train, y_train)
+            val_loss, val_score = self.test_epoch(x_valid, y_valid)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
 
-            choice = np.random.choice(train_num, self.batch_size)
-            x_batch_auto = x_train_values[choice]
-            y_batch_auto = y_train_values[choice]
-
-            x_batch_auto = x_batch_auto.to(self.device)
-            y_batch_auto = y_batch_auto.to(self.device)
-
-            # forward
-            preds = self.sfm_model(x_batch_auto)
-            cur_loss = self.get_loss(preds, y_batch_auto, self.loss_type)
-            cur_loss.backward()
-            self.train_optimizer.step()
-            loss.update(cur_loss.item())
-
-            # validation
-            train_loss += loss.val
-            if step and step % self.eval_steps == 0:
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.sfm_model.state_dict())
+            else:
                 stop_steps += 1
-                train_loss /= self.eval_steps
-
-                with torch.no_grad():
-                    self.sfm_model.eval()
-                    loss_val = AverageMeter()
-
-                    # forward
-                    preds = self.sfm_model(x_val_auto)
-                    cur_loss_val = self.get_loss(preds, y_val_auto, self.loss_type)
-                    loss_val.update(cur_loss_val.item())
-
-                if verbose:
-                    self.logger.info(
-                        "[Epoch {}]: train_loss {:.6f}, valid_loss {:.6f}".format(step, train_loss, loss_val.val)
-                    )
-                evals_result["train"].append(train_loss)
-                evals_result["valid"].append(loss_val.val)
-                if loss_val.val < best_loss:
-                    if verbose:
-                        self.logger.info(
-                            "\tvalid loss update from {:.6f} to {:.6f}, save checkpoint.".format(
-                                best_loss, loss_val.val
-                            )
-                        )
-                    best_loss = loss_val.val
-                    stop_steps = 0
-                    torch.save(self.sfm_model.state_dict(), save_path)
-                train_loss = 0
-                # update learning rate
-                self.scheduler.step(cur_loss_val)
-
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
         if self.device != "cpu":
             torch.cuda.empty_cache()
 
-    def get_loss(self, pred, target, loss_type):
-        if loss_type == "mse":
-            sqr_loss = (pred - target) ** 2
-            loss = sqr_loss.mean()
-            return loss
-        elif loss_type == "binary":
-            loss = nn.BCELoss()
-            return loss(pred, target)
-        else:
-            raise NotImplementedError("loss {} is not supported!".format(loss_type))
+    def mse(self, pred, label):
+        loss = (pred - label) ** 2
+        return torch.mean(loss)
+   
+    def loss_fn(self, pred, label):
+        mask = ~torch.isnan(label)
 
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+    
+    def metric_fn(self, pred, label):
+
+        mask = torch.isfinite(label)
+        if self.metric == "IC":
+            return self.cal_ic(pred[mask], label[mask])
+
+        if self.metric == "" or self.metric == "loss":  # use loss
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def cal_ic(self, pred, label):
+        return torch.mean(pred * label)
     def predict(self, dataset):
         if not self._fitted:
             raise ValueError("model is not fitted yet!")
@@ -430,7 +447,7 @@ class SFM(Model):
         sample_num = x_values.shape[0]
         preds = []
 
-        for begin in range(sample_num)[:: self.batch_size]:
+        for begin in range(sample_num)[::self.batch_size]:
             if sample_num - begin < self.batch_size:
                 end = sample_num
             else:
@@ -440,37 +457,16 @@ class SFM(Model):
 
             if self.device != "cpu":
                 x_batch = x_batch.to(self.device)
-
+            
             with torch.no_grad():
-                if self.device != "cpu":
-                    pred = self.sfm_model(x_batch).detach().cpu().numpy()
-                else:
-                    pred = self.sfm_model(x_batch).detach().cpu().numpy()
+                pred = self.sfm_model(x_batch).detach().cpu().numpy()
+
             preds.append(pred)
-
+        
         return pd.Series(np.concatenate(preds), index=index)
 
-    def save(self, filename, **kwargs):
-        with save_multiple_parts_file(filename) as model_dir:
-            model_path = os.path.join(model_dir, os.path.split(model_dir)[-1])
-            # Save model
-            torch.save(self.sfm_model.state_dict(), model_path)
-
-    def load(self, buffer, **kwargs):
-        with unpack_archive_with_buffer(buffer) as model_dir:
-            # Get model name
-            _model_name = os.path.splitext(list(filter(lambda x: x.startswith("model.bin"), os.listdir(model_dir)))[0])[
-                0
-            ]
-            _model_path = os.path.join(model_dir, _model_name)
-            # Load model
-            self.sfm_model.load_state_dict(torch.load(_model_path))
-        self._fitted = True
-
-
 class AverageMeter(object):
     """Computes and stores the average and current value"""
-
     def __init__(self):
         self.reset()
 
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index 039fd2c80..203e71b9a 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -30,15 +30,15 @@ class XGBModel(Model):
     def fit(
         self,
         dataset: DatasetH,
-        num_boost_round=1000,
-        early_stopping_rounds=50,
-        verbose_eval=20,
-        evals_result=dict(),
+        num_boost_round = 1000,
+        early_stopping_rounds = 50,
+        verbose_eval = 20,
+        evals_result = dict(),
         **kwargs
     ):
 
         df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid"], col_set = ["feature", "label"], data_key = DataHandlerLP.DK_L
         )
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
@@ -49,16 +49,16 @@ class XGBModel(Model):
         else:
             raise ValueError("XGBoost doesn't support multi-label training")
 
-        dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
-        dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
+        dtrain = xgb.DMatrix(x_train.values, label = y_train_1d)
+        dvalid = xgb.DMatrix(x_valid.values, label = y_valid_1d)
         self.model = xgb.train(
             self._params,
-            dtrain=dtrain,
-            num_boost_round=num_boost_round,
-            evals=[(dtrain, "train"), (dvalid, "valid")],
-            early_stopping_rounds=early_stopping_rounds,
-            verbose_eval=verbose_eval,
-            evals_result=evals_result,
+            dtrain = dtrain,
+            num_boost_round = num_boost_round,
+            evals = [(dtrain, "train"), (dvalid, "valid")],
+            early_stopping_rounds = early_stopping_rounds,
+            verbose_eval = verbose_eval,
+            evals_result = evals_result,
             **kwargs
         )
         evals_result["train"] = list(evals_result["train"].values())[0]
@@ -67,5 +67,5 @@ class XGBModel(Model):
     def predict(self, dataset):
         if self.model is None:
             raise ValueError("model is not fitted yet!")
-        x_test = dataset.prepare("test", col_set="feature")
-        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)
+        x_test = dataset.prepare("test", col_set = "feature")
+        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index = x_test.index)
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
index e4003a1f5..e2d251aa7 100755
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -166,7 +166,9 @@ class MinMaxNorm(Processor):
         return df
 
 
-class ZscoreNorm(Processor):
+class ZScoreNorm(Processor):
+    """ZScore Normalization"""
+
     def __init__(self, fit_start_time, fit_end_time, fields_group=None):
         self.fit_start_time = fit_start_time
         self.fit_end_time = fit_end_time
@@ -193,6 +195,42 @@ class ZscoreNorm(Processor):
         return df
 
 
+class RobustZScoreNorm(Processor):
+    """Robust ZScore Normalization
+
+    Use robust statistics for Z-Score normalization:
+        mean(x) = median(x)
+        std(x) = MAD(x) * 1.4826
+
+    Reference:
+        https://en.wikipedia.org/wiki/Median_absolute_deviation.
+    """
+
+    def __init__(self, fit_start_time, fit_end_time, fields_group=None, clip_outlier=True):
+        self.fit_start_time = fit_start_time
+        self.fit_end_time = fit_end_time
+        self.fields_group = fields_group
+        self.clip_outlier = clip_outlier
+
+    def fit(self, df):
+        df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
+        self.cols = get_group_columns(df, self.fields_group)
+        X = df[self.cols].values
+        self.mean_train = np.nanmedian(X, axis=0)
+        self.std_train = np.nanmedian(np.abs(X - self.mean_train), axis=0)
+        self.std_train += EPS
+        self.std_train *= 1.4826
+
+    def __call__(self, df):
+        X = df[self.cols]
+        X -= self.mean_train
+        X /= self.std_train
+        df[self.cols] = X
+        if self.clip_outlier:
+            df.clip(-3, 3, inplace=True)
+        return df
+
+
 class CSZScoreNorm(Processor):
     """Cross Sectional ZScore Normalization"""
 
diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py
index 2e087877b..ecec8d3d7 100644
--- a/qlib/workflow/cli.py
+++ b/qlib/workflow/cli.py
@@ -27,9 +27,9 @@ def sys_config(config, config_path):
     Parameters
     ----------
     config : dict
-        configuration of the workflow
+        configuration of the workflow.
     config_path : str
-        configuration of the path
+        configuration of the path.
     """
     sys_config = config.get("sys", {})