diff --git a/examples/benchmarks/GATs/workflow_config_gats.yaml b/examples/benchmarks/GATs/workflow_config_gats.yaml
index 33aa0fe8d..7212e0ee2 100644
--- a/examples/benchmarks/GATs/workflow_config_gats.yaml
+++ b/examples/benchmarks/GATs/workflow_config_gats.yaml
@@ -8,6 +8,20 @@ data_handler_config: &data_handler_config
     fit_start_time: 2008-01-01
     fit_end_time: 2014-12-31
     instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 port_analysis_config: &port_analysis_config
     strategy:
         class: TopkDropoutStrategy
@@ -26,8 +40,8 @@ port_analysis_config: &port_analysis_config
         min_cost: 5
 task:
     model:
-        class: GAT
-        module_path: qlib.contrib.model.pytorch_gats
+        class: GAT_Classic  # NOTE(review): pytorch_gats.py in this patch defines `GATs`, not `GAT_Classic` -- confirm this class exists
+        module_path: qlib.contrib.model.pytorch_gats_classic  # NOTE(review): module is not added by the hunks visible here -- confirm it exists
         kwargs:
             d_feat: 6
             hidden_size: 64
@@ -38,8 +52,7 @@ task:
             early_stop: 20
             metric: loss
             loss: mse
-            base_model: LSTM
-            with_pretrain: True
+            base_model: GRU
             seed: 0
             GPU: 0
     dataset:
@@ -47,7 +60,7 @@ task:
         module_path: qlib.data.dataset
         kwargs:
             handler:
-                class: ALPHA360_Denoise
+                class: ALPHA360
                 module_path: qlib.contrib.data.handler
                 kwargs: *data_handler_config
             segments:
@@ -58,11 +71,6 @@ task:
         - class: SignalRecord
           module_path: qlib.workflow.record_temp
           kwargs: {}
-        - class: SigAnaRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: 
-            ana_long_short: False
-            ann_scaler: 252
         - class: PortAnaRecord
           module_path: qlib.workflow.record_temp
           kwargs: 
diff --git a/examples/benchmarks/HATS/README.md b/examples/benchmarks/HATS/README.md
deleted file mode 100644
index b70dbff25..000000000
--- a/examples/benchmarks/HATS/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-## Requirement
-
-* pandas==1.1.2
-* numpy==1.17.4
-* scikit_learn==0.23.2
-* torch==1.7.0
-
-## HATS
-
-* HATS is a a hierarchical attention network for stock prediction which uses relational data for stock market prediction. HATS selectively aggregates information
-on different relation types and adds the information to the representations of each company. HATS is used as a relational modeling module with initialized node representations.Furthermore, HATS
-can  predict not only individual stock prices but also market index movements, which is similar to the graph classification task.
-
-* HATS uses pretrained model of GRU and LSTM. The code of GRU and LSTM used in Qlib is a pyTorch implemention of GRU and LSTM.
-* Paper address:HATS: A Hierarchical Graph Attention Network for Stock Movement Prediction https://arxiv.org/pdf/1908.07999.pdf
\ No newline at end of file
diff --git a/examples/benchmarks/HATS/requirements.txt b/examples/benchmarks/HATS/requirements.txt
deleted file mode 100644
index 16de0a438..000000000
--- a/examples/benchmarks/HATS/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-pandas==1.1.2
-numpy==1.17.4
-scikit_learn==0.23.2
-torch==1.7.0
diff --git a/examples/benchmarks/HATS/worflow_config_hats.yaml b/examples/benchmarks/HATS/worflow_config_hats.yaml
deleted file mode 100644
index b08df14e0..000000000
--- a/examples/benchmarks/HATS/worflow_config_hats.yaml
+++ /dev/null
@@ -1,77 +0,0 @@
-provider_uri: "~/.qlib/qlib_data/cn_data"
-region: cn
-market: &market csi300
-benchmark: &benchmark SH000300
-data_handler_config: &data_handler_config
-    start_time: 2008-01-01
-    end_time: 2020-08-01
-    fit_start_time: 2008-01-01
-    fit_end_time: 2014-12-31
-    instruments: *market
-    infer_processors:
-        - class: RobustZScoreNorm
-          kwargs:
-              fields_group: feature
-              clip_outlier: true
-        - class: Fillna
-          kwargs:
-              fields_group: feature
-    learn_processors:
-        - class: DropnaLabel
-        - class: CSRankNorm
-          kwargs:
-              fields_group: label
-    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
-port_analysis_config: &port_analysis_config
-    strategy:
-        class: TopkDropoutStrategy
-        module_path: qlib.contrib.strategy.strategy
-        kwargs:
-            topk: 50
-            n_drop: 5
-    backtest:
-        verbose: False
-        limit_threshold: 0.095
-        account: 100000000
-        benchmark: *benchmark
-        deal_price: close
-        open_cost: 0.0005
-        close_cost: 0.0015
-        min_cost: 5
-task:
-    model:
-        class: HATS
-        module_path: qlib.contrib.model.pytorch_hats
-        kwargs:
-            d_feat: 6
-            hidden_size: 64
-            num_layers: 2
-            dropout: 0.6
-            n_epochs: 200
-            lr: 1e-3
-            early_stop: 20
-            metric: loss
-            loss: mse
-            base_model: GRU
-            seed: 0
-            GPU: 0
-    dataset:
-        class: DatasetH
-        module_path: qlib.data.dataset
-        kwargs:
-            handler:
-                class: ALPHA360
-                module_path: qlib.contrib.data.handler
-                kwargs: *data_handler_config
-            segments:
-                train: [2008-01-01, 2014-12-31]
-                valid: [2015-01-01, 2016-12-31]
-                test: [2017-01-01, 2020-08-01]
-    record: 
-        - class: SignalRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: {}
-        - class: PortAnaRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: 
-            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/LSTM/model_lstm_csi300.pkl b/examples/benchmarks/LSTM/model_lstm_csi300.pkl
index ff7fee450..84d6419da 100644
Binary files a/examples/benchmarks/LSTM/model_lstm_csi300.pkl and b/examples/benchmarks/LSTM/model_lstm_csi300.pkl differ
diff --git a/examples/benchmarks/TabNet/README.md b/examples/benchmarks/TabNet/README.md
deleted file mode 100644
index 3a233df46..000000000
--- a/examples/benchmarks/TabNet/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# TabNet
-* TabNet is a novel high-performance and interpretable canonical deep tabular data learning architectur. TabNet uses sequential attention to choose which features to reason from at each decision step, enabling interpretability and more effcient learning as the learning capacity is used for the most salient features.
-* The code used in Qlib is a pyTorch implementation of Tabnet (Arik, S. O., & Pfister, T. (2019). [https://github.com/dreamquark-ai/tabnet](https://github.com/dreamquark-ai/tabnet)
-* Paper: TabNet: Attentive Interpretable Tabular Learning. [https://arxiv.org/pdf/1908.07442.pdf](https://arxiv.org/pdf/1908.07442.pdf).
\ No newline at end of file
diff --git a/examples/benchmarks/TabNet/requirements.txt b/examples/benchmarks/TabNet/requirements.txt
deleted file mode 100644
index 244b74b19..000000000
--- a/examples/benchmarks/TabNet/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-pandas==1.1.2
-numpy==1.17.4
-scikit_learn==0.23.2
-torch==1.7.0
-pytorch-tabnet==2.0.1
\ No newline at end of file
diff --git a/examples/benchmarks/TabNet/workflow_config_tabnet.yaml b/examples/benchmarks/TabNet/workflow_config_tabnet.yaml
deleted file mode 100644
index 5f6aa8b6d..000000000
--- a/examples/benchmarks/TabNet/workflow_config_tabnet.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-provider_uri: "~/.qlib/qlib_data/cn_data"
-region: cn
-market: &market csi300
-benchmark: &benchmark SH000300
-data_handler_config: &data_handler_config
-    start_time: 2008-01-01
-    end_time: 2020-08-01
-    fit_start_time: 2008-01-01
-    fit_end_time: 2014-12-31
-    instruments: *market
-port_analysis_config: &port_analysis_config
-    strategy:
-        class: TopkDropoutStrategy
-        module_path: qlib.contrib.strategy.strategy
-        kwargs:
-            topk: 50
-            n_drop: 5
-    backtest:
-        verbose: False
-        limit_threshold: 0.095
-        account: 100000000
-        benchmark: *benchmark
-        deal_price: close
-        open_cost: 0.0005
-        close_cost: 0.0015
-        min_cost: 5
-task:
-    model:
-        class: TabNetModel
-        module_path: qlib.contrib.model.tabnet
-        kwargs:
-            n_d: 8
-            n_a: 8
-            n_steps: 3
-            gamma: 1.3
-            n_independent: 2
-            n_shared: 2
-            seed: 0
-            momentum: 0.02
-            lambda_sparse: 1e-3
-            optimizer_params: {lr: 2e-3}
-    dataset:
-        class: DatasetH
-        module_path: qlib.data.dataset
-        kwargs:
-            handler:
-                class: ALPHA360_Denoise
-                module_path: qlib.contrib.data.handler
-                kwargs: *data_handler_config
-            segments:
-                train: [2008-01-01, 2014-12-31]
-                valid: [2015-01-01, 2016-12-31]
-                test: [2017-01-01, 2020-08-01]
-    record: 
-        - class: SignalRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: {}
-        - class: SigAnaRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: 
-            ana_long_short: False
-            ann_scaler: 252
-        - class: PortAnaRecord
-          module_path: qlib.workflow.record_temp
-          kwargs: 
-            config: *port_analysis_config
\ No newline at end of file
diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py
index bba006c35..43a141418 100644
--- a/qlib/contrib/model/catboost_model.py
+++ b/qlib/contrib/model/catboost_model.py
@@ -41,14 +41,18 @@ class CatBoostModel(Model):
         **kwargs
     ):
         df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
 
         # CatBoost needs 1D array as its label
         if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
-            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(
+                y_valid.values
+            )
         else:
             raise ValueError("CatBoost doesn't support multi-label training")
 
diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py
index 1b23d2401..227772499 100644
--- a/qlib/contrib/model/pytorch_alstm.py
+++ b/qlib/contrib/model/pytorch_alstm.py
@@ -11,7 +11,12 @@ import pandas as pd
 import copy
 from sklearn.metrics import roc_auc_score, mean_squared_error
 import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...utils import (
+    unpack_archive_with_buffer,
+    save_multiple_parts_file,
+    create_save_path,
+    drop_nan_by_y_index,
+)
 from ...log import get_module_logger, TimeInspector
 
 import torch
@@ -109,14 +114,19 @@ class ALSTM(Model):
         )
 
         self.ALSTM_model = ALSTMModel(
-            d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
+            d_feat=self.d_feat,
+            hidden_size=self.hidden_size,
+            num_layers=self.num_layers,
+            dropout=self.dropout,
         )
         if optimizer.lower() == "adam":
             self.train_optimizer = optim.Adam(self.ALSTM_model.parameters(), lr=self.lr)
         elif optimizer.lower() == "gd":
             self.train_optimizer = optim.SGD(self.ALSTM_model.parameters(), lr=self.lr)
         else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+            raise NotImplementedError(
+                "optimizer {} is not supported!".format(optimizer)
+            )
 
         self._fitted = False
         if self.use_gpu:
@@ -141,7 +151,7 @@ class ALSTM(Model):
 
         mask = torch.isfinite(label)
 
-        if self.metric == "" or self.metric == "loss":  # use loss
+        if self.metric == "" or self.metric == "loss":
             return -self.loss_fn(pred[mask], label[mask])
 
         raise ValueError("unknown metric `%s`" % self.metric)
@@ -161,8 +171,12 @@ class ALSTM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
-            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_train_values[indices[i : i + self.batch_size]]
+            ).float()
+            label = torch.from_numpy(
+                y_train_values[indices[i : i + self.batch_size]]
+            ).float()
 
             if self.use_gpu:
                 feature = feature.cuda()
@@ -194,7 +208,9 @@ class ALSTM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_values[indices[i : i + self.batch_size]]
+            ).float()
             label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
 
             if self.use_gpu:
@@ -219,7 +235,9 @@ class ALSTM(Model):
     ):
 
         df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid", "test"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
 
         x_train, y_train = df_train["feature"], df_train["label"]
@@ -302,7 +320,9 @@ class ALSTM(Model):
 
 
 class ALSTMModel(nn.Module):
-    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"):
+    def __init__(
+        self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"
+    ):
         super().__init__()
         self.hid_size = hidden_size
         self.input_size = d_feat
@@ -317,7 +337,9 @@ class ALSTMModel(nn.Module):
         except:
             raise ValueError("unknown rnn_type `%s`" % self.rnn_type)
         self.net = nn.Sequential()
-        self.net.add_module("fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size))
+        self.net.add_module(
+            "fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size)
+        )
         self.net.add_module("act", nn.Tanh())
         self.rnn = klass(
             input_size=self.hid_size,
@@ -328,17 +350,27 @@ class ALSTMModel(nn.Module):
         )
         self.fc_out = nn.Linear(in_features=self.hid_size * 2, out_features=1)
         self.att_net = nn.Sequential()
-        self.att_net.add_module("att_fc_in", nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2)))
+        self.att_net.add_module(
+            "att_fc_in",
+            nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2)),
+        )
         self.att_net.add_module("att_dropout", torch.nn.Dropout(self.dropout))
         self.att_net.add_module("att_act", nn.Tanh())
-        self.att_net.add_module("att_fc_out", nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False))
+        self.att_net.add_module(
+            "att_fc_out",
+            nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False),
+        )
         self.att_net.add_module("att_softmax", nn.Softmax(dim=1))
 
     def forward(self, inputs):
         # inputs: [batch_size, input_size*input_day]
         inputs = inputs.view(len(inputs), self.input_size, -1)
-        inputs = inputs.permute(0, 2, 1)  # [batch, input_size, seq_len] -> [batch, seq_len, input_size]
-        rnn_out, _ = self.rnn(self.net(inputs))  # [batch, seq_len, num_directions * hidden_size]
+        inputs = inputs.permute(
+            0, 2, 1
+        )  # [batch, input_size, seq_len] -> [batch, seq_len, input_size]
+        rnn_out, _ = self.rnn(
+            self.net(inputs)
+        )  # [batch, seq_len, num_directions * hidden_size]
         attention_score = self.att_net(rnn_out)  # [batch, seq_len, 1]
         out_att = torch.mul(rnn_out, attention_score)
         out_att = torch.sum(out_att, dim=1)
diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py
old mode 100755
new mode 100644
index 77a02a9b2..226204fe7
--- a/qlib/contrib/model/pytorch_gats.py
+++ b/qlib/contrib/model/pytorch_gats.py
@@ -19,10 +19,12 @@ import torch.optim as optim
 from ...model.base import Model
 from ...data.dataset import DatasetH
 from ...data.dataset.handler import DataHandlerLP
+from ...contrib.model.pytorch_lstm import LSTMModel
+from ...contrib.model.pytorch_gru import GRUModel
 
 
-class GAT(Model):
-    """GAT Model
+class GATs(Model):
+    """GATs Model
 
     Parameters
     ----------
@@ -57,8 +59,8 @@ class GAT(Model):
         **kwargs
     ):
         # Set logger.
-        self.logger = get_module_logger("GAT")
-        self.logger.info("GAT pytorch version...")
+        self.logger = get_module_logger("GATs")
+        self.logger.info("GATs pytorch version...")
 
         # set hyper-parameters.
         self.d_feat = d_feat
@@ -78,7 +80,7 @@ class GAT(Model):
         self.seed = seed
 
         self.logger.info(
-            "GAT parameters setting:"
+            "GATs parameters setting:"
             "\nd_feat : {}"
             "\nhidden_size : {}"
             "\nnum_layers : {}"
@@ -124,7 +126,9 @@ class GAT(Model):
         elif optimizer.lower() == "gd":
             self.train_optimizer = optim.SGD(self.GAT_model.parameters(), lr=self.lr)
         else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+            raise NotImplementedError(
+                "optimizer {} is not supported!".format(optimizer)
+            )
 
         self._fitted = False
         if self.use_gpu:
@@ -149,18 +153,18 @@ class GAT(Model):
 
         mask = torch.isfinite(label)
 
-        if self.metric == "" or self.metric == "loss":  # use loss
+        if self.metric == "" or self.metric == "loss":
             return -self.loss_fn(pred[mask], label[mask])
 
         raise ValueError("unknown metric `%s`" % self.metric)
 
     def get_daily_inter(self, df, shuffle=False):
-        # organize the train data into daily inter as daily batches
+        # organize the train data into daily batches
         daily_count = df.groupby(level=0).size().values
         daily_index = np.roll(np.cumsum(daily_count), 1)
         daily_index[0] = 0
         if shuffle:
-            # shuffle the daily inter data
+            # shuffle data
             daily_shuffle = list(zip(daily_index, daily_count))
             np.random.shuffle(daily_shuffle)
             daily_index, daily_count = zip(*daily_shuffle)
@@ -172,7 +176,7 @@ class GAT(Model):
         y_train_values = np.squeeze(y_train.values)
         self.GAT_model.train()
 
-        # organize the train data into daily inter as daily batches
+        # organize the train data into daily batches
         daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)
 
         for idx, count in zip(daily_index, daily_count):
@@ -203,7 +207,7 @@ class GAT(Model):
         scores = []
         losses = []
 
-        # organize the test data into daily inter as daily batches
+        # organize the test data into daily batches
         daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)
 
         for idx, count in zip(daily_index, daily_count):
@@ -233,7 +237,9 @@ class GAT(Model):
     ):
 
         df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid", "test"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
 
         x_train, y_train = df_train["feature"], df_train["label"]
@@ -251,17 +257,23 @@ class GAT(Model):
         if self.with_pretrain:
             self.logger.info("Loading pretrained model...")
             if self.base_model == "LSTM":
-                from ...contrib.model.pytorch_lstm import LSTMModel
-
                 pretrained_model = LSTMModel()
-                pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
-            elif self.base_model == "GRU":
-                from ...contrib.model.pytorch_gru import GRUModel
+                pretrained_model.load_state_dict(
+                    torch.load("benchmarks/LSTM/model_lstm_csi300.pkl")
+                )
 
+            elif self.base_model == "GRU":
                 pretrained_model = GRUModel()
-                pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
+                pretrained_model.load_state_dict(
+                    torch.load("benchmarks/GRU/model_gru_csi300.pkl")
+                )
+
             model_dict = self.GAT_model.state_dict()
-            pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
+            pretrained_dict = {
+                k: v
+                for k, v in pretrained_model.state_dict().items()
+                if k in model_dict
+            }
             model_dict.update(pretrained_dict)
             self.GAT_model.load_state_dict(model_dict)
             self.logger.info("Loading pretrained model Done...")
@@ -269,7 +281,6 @@ class GAT(Model):
         # train
         self.logger.info("training...")
         self._fitted = True
-        # return
 
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
@@ -310,7 +321,7 @@ class GAT(Model):
         x_values = x_test.values
         preds = []
 
-        # organize the data into daily inter as daily batches
+        # organize the data into daily batches
         daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)
 
         for idx, count in zip(daily_index, daily_count):
@@ -332,7 +343,9 @@ class GAT(Model):
 
 
 class GATModel(nn.Module):
-    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"):
+    def __init__(
+        self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"
+    ):
         super().__init__()
 
         if base_model == "GRU":
@@ -355,22 +368,29 @@ class GATModel(nn.Module):
             raise ValueError("unknown base model name `%s`" % base_model)
 
         self.hidden_size = hidden_size
-        self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
-        self.fc = nn.Linear(hidden_size, hidden_size)
-        self.bn2 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
+        self.d_feat = d_feat
+        self.transformation = nn.Linear(self.hidden_size, self.hidden_size)
+        self.a = nn.Parameter(torch.randn(self.hidden_size * 2, 1))
+        self.a.requires_grad = True
+        self.fc = nn.Linear(self.hidden_size, self.hidden_size)
         self.fc_out = nn.Linear(hidden_size, 1)
         self.leaky_relu = nn.LeakyReLU()
         self.softmax = nn.Softmax(dim=1)
-        self.d_feat = d_feat
 
-    def cal_convariance(self, x, y):  # the 2nd dimension of x and y are the same
-        e_x = torch.mean(x, dim=1).reshape(-1, 1)
-        e_y = torch.mean(y, dim=1).reshape(-1, 1)
-        e_x_e_y = e_x.mm(torch.t(e_y))
-        x_extend = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
-        y_extend = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1)
-        e_xy = torch.mean(x_extend * y_extend, dim=2)
-        return e_xy - e_x_e_y
+    def cal_attention(self, x, y):
+        """Return softmax-normalised pairwise attention weights, shape [N, N], N = x.shape[0]."""
+        # `y` is unused (kept for interface compatibility): every pair is built
+        # from `x`, so the former `self.transformation(y)` was dead work.
+        x = self.transformation(x)
+        sample_num = x.shape[0]
+        dim = x.shape[1]
+        e_x = x.expand(sample_num, sample_num, dim)  # e_x[i, j] = x[j]
+        e_y = torch.transpose(e_x, 0, 1)  # e_y[i, j] = x[i]
+        attention_in = torch.cat((e_x, e_y), 2).view(-1, dim * 2)
+        a_t = torch.t(self.a)  # local: avoid storing a per-call tensor as module state
+        attention_out = a_t.mm(torch.t(attention_in)).view(sample_num, sample_num)
+        attention_out = self.leaky_relu(attention_out)
+        return self.softmax(attention_out)
 
     def forward(self, x):
         # x: [N, F*T]
@@ -378,10 +398,8 @@ class GATModel(nn.Module):
         x = x.permute(0, 2, 1)  # [N, T, F]
         out, _ = self.rnn(x)
         hidden = out[:, -1, :]
-        hidden = self.bn1(hidden)
-        gamma = self.cal_convariance(hidden, hidden)
-        output = gamma.mm(hidden)
-        output = self.fc(output)
-        output = self.bn2(output)
-        output = self.leaky_relu(output)
-        return self.fc_out(output).squeeze()
+        att_weight = self.cal_attention(hidden, hidden)
+        hidden = att_weight.mm(hidden) + hidden
+        hidden = self.fc(hidden)
+        hidden = self.leaky_relu(hidden)
+        return self.fc_out(hidden).squeeze()
diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py
index 02664b6ac..935716bcc 100755
--- a/qlib/contrib/model/pytorch_gru.py
+++ b/qlib/contrib/model/pytorch_gru.py
@@ -11,7 +11,12 @@ import pandas as pd
 import copy
 from sklearn.metrics import roc_auc_score, mean_squared_error
 import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...utils import (
+    unpack_archive_with_buffer,
+    save_multiple_parts_file,
+    create_save_path,
+    drop_nan_by_y_index,
+)
 from ...log import get_module_logger, TimeInspector
 
 import torch
@@ -109,14 +114,19 @@ class GRU(Model):
         )
 
         self.gru_model = GRUModel(
-            d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
+            d_feat=self.d_feat,
+            hidden_size=self.hidden_size,
+            num_layers=self.num_layers,
+            dropout=self.dropout,
         )
         if optimizer.lower() == "adam":
             self.train_optimizer = optim.Adam(self.gru_model.parameters(), lr=self.lr)
         elif optimizer.lower() == "gd":
             self.train_optimizer = optim.SGD(self.gru_model.parameters(), lr=self.lr)
         else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+            raise NotImplementedError(
+                "optimizer {} is not supported!".format(optimizer)
+            )
 
         self._fitted = False
         if self.use_gpu:
@@ -141,7 +151,7 @@ class GRU(Model):
 
         mask = torch.isfinite(label)
 
-        if self.metric == "" or self.metric == "loss":  # use loss
+        if self.metric == "" or self.metric == "loss":
             return -self.loss_fn(pred[mask], label[mask])
 
         raise ValueError("unknown metric `%s`" % self.metric)
@@ -161,8 +171,12 @@ class GRU(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
-            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_train_values[indices[i : i + self.batch_size]]
+            ).float()
+            label = torch.from_numpy(
+                y_train_values[indices[i : i + self.batch_size]]
+            ).float()
 
             if self.use_gpu:
                 feature = feature.cuda()
@@ -194,7 +208,9 @@ class GRU(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_values[indices[i : i + self.batch_size]]
+            ).float()
             label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
 
             if self.use_gpu:
@@ -219,7 +235,9 @@ class GRU(Model):
     ):
 
         df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid", "test"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
 
         x_train, y_train = df_train["feature"], df_train["label"]
diff --git a/qlib/contrib/model/pytorch_hats.py b/qlib/contrib/model/pytorch_hats.py
deleted file mode 100644
index 7affea73c..000000000
--- a/qlib/contrib/model/pytorch_hats.py
+++ /dev/null
@@ -1,491 +0,0 @@
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import division
-from __future__ import print_function
-
-import os
-import numpy as np
-import pandas as pd
-import copy
-from ...utils import create_save_path
-from ...log import get_module_logger
-
-import torch
-import torch.nn as nn
-import torch.optim as optim
-
-from ...model.base import Model
-from ...data.dataset import DatasetH
-from ...data.dataset.handler import DataHandlerLP
-
-
-class HATS(Model):
-    """HATS Model
-
-    Parameters
-    ----------
-    d_feat : int
-        input dimension for each time step
-    metric: str
-        the evaluate metric used in early stop
-    optimizer : str
-        optimizer name
-    GPU : str
-        the GPU ID(s) used for training
-    """
-
-    def __init__(
-        self,
-        d_feat=6,
-        hidden_size=64,
-        num_layers=2,
-        dropout=0.5,
-        n_epochs=200,
-        lr=0.01,
-        metric="",
-        early_stop=20,
-        loss="mse",
-        base_model="GRU",
-        with_pretrain=True,
-        optimizer="adam",
-        GPU="0",
-        seed=0,
-        **kwargs
-    ):
-        # Set logger.
-        self.logger = get_module_logger("HATS")
-        self.logger.info("HATS pytorch version...")
-
-        # set hyper-parameters.
-        self.d_feat = d_feat
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        self.dropout = dropout
-        self.n_epochs = n_epochs
-        self.lr = lr
-        self.metric = metric
-        self.early_stop = early_stop
-        self.optimizer = optimizer.lower()
-        self.loss = loss
-        self.base_model = base_model
-        self.with_pretrain = with_pretrain
-        self.visible_GPU = GPU
-        self.use_gpu = torch.cuda.is_available()
-        self.seed = seed
-
-        self.logger.info(
-            "HATS parameters setting:"
-            "\nd_feat : {}"
-            "\nhidden_size : {}"
-            "\nnum_layers : {}"
-            "\ndropout : {}"
-            "\nn_epochs : {}"
-            "\nlr : {}"
-            "\nmetric : {}"
-            "\nearly_stop : {}"
-            "\noptimizer : {}"
-            "\nloss_type : {}"
-            "\nbase_model : {}"
-            "\nwith_pretrain : {}"
-            "\nvisible_GPU : {}"
-            "\nuse_GPU : {}"
-            "\nseed : {}".format(
-                d_feat,
-                hidden_size,
-                num_layers,
-                dropout,
-                n_epochs,
-                lr,
-                metric,
-                early_stop,
-                optimizer.lower(),
-                loss,
-                base_model,
-                with_pretrain,
-                GPU,
-                self.use_gpu,
-                seed,
-            )
-        )
-
-        self.HATS_model = HATSModel(
-            d_feat=self.d_feat,
-            hidden_size=self.hidden_size,
-            num_layers=self.num_layers,
-            dropout=self.dropout,
-            base_model=self.base_model,
-        )
-        if optimizer.lower() == "adam":
-            self.train_optimizer = optim.Adam(self.HATS_model.parameters(), lr=self.lr)
-        elif optimizer.lower() == "gd":
-            self.train_optimizer = optim.SGD(self.HATS_model.parameters(), lr=self.lr)
-        else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
-
-        self._fitted = False
-        if self.use_gpu:
-            self.HATS_model.cuda()
-            # set the visible GPU
-            if self.visible_GPU:
-                os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
-
-    def mse(self, pred, label):
-        loss = (pred - label) ** 2
-        return torch.mean(loss)
-
-    def loss_fn(self, pred, label):
-        mask = ~torch.isnan(label)
-
-        if self.loss == "mse":
-            return self.mse(pred[mask], label[mask])
-
-        raise ValueError("unknown loss `%s`" % self.loss)
-
-    def metric_fn(self, pred, label):
-        mask = torch.isfinite(label)
-
-        if self.metric == "" or self.metric == "loss":  # use loss
-            return -self.loss_fn(pred[mask], label[mask])
-
-        raise ValueError("unknown metric `%s`" % self.metric)
-
-    def get_daily_inter(self, df, shuffle=False):
-        # organize the train data into daily inter as daily batches
-        daily_count = df.groupby(level=0).size().values
-        daily_index = np.roll(np.cumsum(daily_count), 1)
-        daily_index[0] = 0
-        if shuffle:
-            # shuffle the daily inter data
-            daily_shuffle = list(zip(daily_index, daily_count))
-            np.random.shuffle(daily_shuffle)
-            daily_index, daily_count = zip(*daily_shuffle)
-        return daily_index, daily_count
-
-    def train_epoch(self, x_train, y_train):
-
-        x_train_values = x_train.values
-        y_train_values = np.squeeze(y_train.values)
-
-        self.HATS_model.train()
-
-        # organize the train data into daily inter as daily batches
-        daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)
-
-        for idx, count in zip(daily_index, daily_count):
-            batch = slice(idx, idx + count)
-            feature = torch.from_numpy(x_train_values[batch]).float()
-            label = torch.from_numpy(y_train_values[batch]).float()
-
-            if self.use_gpu:
-                feature = feature.cuda()
-                label = label.cuda()
-
-            pred = self.HATS_model(feature)
-            loss = self.loss_fn(pred, label)
-
-            self.train_optimizer.zero_grad()
-            loss.backward()
-            torch.nn.utils.clip_grad_value_(self.HATS_model.parameters(), 3.0)
-            self.train_optimizer.step()
-
-    def test_epoch(self, data_x, data_y):
-
-        # prepare testing data
-        x_values = data_x.values
-        y_values = np.squeeze(data_y.values)
-
-        self.HATS_model.eval()
-
-        scores = []
-        losses = []
-
-        # organize the test data into daily inter as daily batches
-        daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)
-
-        for idx, count in zip(daily_index, daily_count):
-            batch = slice(idx, idx + count)
-            feature = torch.from_numpy(x_values[batch]).float()
-            label = torch.from_numpy(y_values[batch]).float()
-
-            if self.use_gpu:
-                feature = feature.cuda()
-                label = label.cuda()
-
-            pred = self.HATS_model(feature)
-            loss = self.loss_fn(pred, label)
-            losses.append(loss.item())
-
-            score = self.metric_fn(pred, label)
-            scores.append(score.item())
-
-        return np.mean(losses), np.mean(scores)
-
-    def fit(
-        self,
-        dataset: DatasetH,
-        evals_result=dict(),
-        verbose=True,
-        save_path=None,
-    ):
-
-        df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
-        )
-
-        x_train, y_train = df_train["feature"], df_train["label"]
-        x_valid, y_valid = df_valid["feature"], df_valid["label"]
-
-        if save_path == None:
-            save_path = create_save_path(save_path)
-        stop_steps = 0
-        best_score = -np.inf
-        best_epoch = 0
-        evals_result["train"] = []
-        evals_result["valid"] = []
-
-        # load pretrained base_model
-        if self.with_pretrain:
-            self.logger.info("Loading pretrained model...")
-            if self.base_model == "LSTM":
-                from ...contrib.model.pytorch_lstm import LSTMModel
-
-                pretrained_model = LSTMModel()
-                pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
-            elif self.base_model == "GRU":
-                from ...contrib.model.pytorch_gru import GRUModel
-
-                pretrained_model = GRUModel()
-                pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
-            model_dict = self.HATS_model.state_dict()
-            pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
-            model_dict.update(pretrained_dict)
-            self.HATS_model.load_state_dict(model_dict)
-            self.logger.info("Loading pretrained model Done...")
-
-        # train
-        self.logger.info("training...")
-        self._fitted = True
-
-        for step in range(self.n_epochs):
-            self.logger.info("Epoch%d:", step)
-            self.logger.info("training...")
-            self.train_epoch(x_train, y_train)
-            self.logger.info("evaluating...")
-            train_loss, train_score = self.test_epoch(x_train, y_train)
-            val_loss, val_score = self.test_epoch(x_valid, y_valid)
-            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
-            evals_result["train"].append(train_score)
-            evals_result["valid"].append(val_score)
-
-            if val_score > best_score:
-                best_score = val_score
-                stop_steps = 0
-                best_epoch = step
-                best_param = copy.deepcopy(self.HATS_model.state_dict())
-            else:
-                stop_steps += 1
-                if stop_steps >= self.early_stop:
-                    self.logger.info("early stop")
-                    break
-
-        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
-        self.HATS_model.load_state_dict(best_param)
-        torch.save(best_param, save_path)
-
-        if self.use_gpu:
-            torch.cuda.empty_cache()
-
-    def predict(self, dataset):
-        if not self._fitted:
-            raise ValueError("model is not fitted yet!")
-
-        x_test = dataset.prepare("test", col_set="feature")
-        index = x_test.index
-        self.HATS_model.eval()
-        x_values = x_test.values
-        sample_num = x_values.shape[0]
-        preds = []
-
-        # organize the data into daily inter as daily batches
-        daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)
-
-        for idx, count in zip(daily_index, daily_count):
-            batch = slice(idx, idx + count)
-            x_batch = torch.from_numpy(x_values[batch]).float()
-
-            if self.use_gpu:
-                x_batch = x_batch.cuda()
-
-            with torch.no_grad():
-                if self.use_gpu:
-                    pred = self.HATS_model(x_batch).detach().cpu().numpy()
-                else:
-                    pred = self.HATS_model(x_batch).detach().numpy()
-
-            preds.append(pred)
-
-        return pd.Series(np.concatenate(preds), index=index)
-
-
-class HATSModel(nn.Module):
-    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"):
-        super().__init__()
-
-        if base_model == "GRU":
-            self.model = nn.GRU(
-                input_size=d_feat,
-                hidden_size=hidden_size,
-                num_layers=num_layers,
-                batch_first=True,
-                dropout=dropout,
-            )
-        elif base_model == "LSTM":
-            self.model = nn.LSTM(
-                input_size=d_feat,
-                hidden_size=hidden_size,
-                num_layers=num_layers,
-                batch_first=True,
-                dropout=dropout,
-            )
-        else:
-            raise ValueError("unknown base model name `%s`" % base_model)
-
-        self.hidden_size = hidden_size
-        self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
-        self.fc = nn.Linear(hidden_size, hidden_size)
-        self.bn2 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
-        self.fc_out = nn.Linear(hidden_size, 1)
-        self.leaky_relu = nn.LeakyReLU()
-        self.softmax = nn.Softmax(dim=1)
-        self.d_feat = d_feat
-
-        num_head_att = [1] * num_layers
-        hidden_dim = [hidden_size] * num_layers
-        dims = [d_feat] + [d * nh for (d, nh) in zip(hidden_dim, num_head_att[:-1])] + [num_head_att[-1]]
-        in_dims = dims[:-1]
-        out_dims = [d // nh for (d, nh) in zip(dims[1:], num_head_att)]
-        self.attn = nn.ModuleList(
-            [GraphAttention(i, o, nh, dropout) for (i, o, nh) in zip(in_dims, out_dims, num_head_att)]
-        )
-        self.bns = nn.ModuleList([nn.BatchNorm1d(dim) for dim in dims[1:-1]])
-        self.dropout = nn.Dropout(dropout)
-        self.elu = nn.ELU()
-
-    def forward(self, x):
-        x = x.reshape(len(x), self.d_feat, -1)  # [N, F, T]
-        x = x.permute(0, 2, 1)  # [N, T, F]
-        out, _ = self.model(x)
-        hidden = out[:, -1, :]
-        hidden = self.bn1(hidden)
-        attention = GraphAttention.cal_attention(hidden, hidden)
-        output = attention.mm(hidden)
-        output = self.fc(output)
-        output = self.bn2(output)
-        output = self.leaky_relu(output)
-        return self.fc_out(output).squeeze()
-
-
-class GraphAttention(nn.Module):
-    def __init__(self, input_dim, output_dim, num_heads, dropout=0.5):
-
-        super().__init__()
-
-        """
-        Parameters
-        ----------
-        input_dim : int
-            Dimension of input node features.
-        output_dim : int
-            Dimension of output node features.
-        num_heads : list of ints
-            Number of attention heads in each hidden layer and output layer. Must be non empty. Note that len(num_heads) = len(hidden_dims)+1.
-        dropout : float
-            Dropout rate. Default: 0.5.
-        """
-
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.num_heads = num_heads
-
-        self.fcs = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_heads)])
-        self.a = nn.ModuleList([nn.Linear(2 * output_dim, 1) for _ in range(num_heads)])
-
-        self.dropout = nn.Dropout(dropout)
-        self.softmax = nn.Softmax(dim=0)
-        self.leakyrelu = nn.LeakyReLU()
-
-    def forward(self, features, nodes, mappings, rows):
-
-        """
-        Parameters
-        ----------
-        features : torch.Tensor
-            An (n' x input_dim) tensor of input node features.
-        nodes : list of numpy array
-            nodes[i] is an array of the nodes in the ith layer of the
-            computation graph.
-        mappings : list of dictionary
-            mappings[i] is a dictionary mappings node v (labelled 0 to |V|-1)
-            in nodes[i] to its position in nodes[i]. For example,
-            if nodes[i] = [2,5], then mappings[i][2] = 0 and
-            mappings[i][5] = 1.
-        rows : numpy array
-            rows[i] is an array of neighbors of node i.
-        Returns
-        -------
-        out : torch.Tensor
-            An (len(node_layers[-1]) x output_dim) tensor of output node features.
-        """
-
-        nprime = features.shape[0]
-        rows = [np.array([mappings[v] for v in row], dtype=np.int64) for row in rows]
-        sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows])))
-        mapped_nodes = [mappings[v] for v in nodes]
-        indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t()
-
-        out = []
-        for k in range(self.num_heads):
-            h = self.fcs[k](features)
-
-            nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0)
-            self_h = torch.cat(
-                tuple([h[mappings[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0
-            )
-            cat_h = torch.cat((self_h, nbr_h), dim=1)
-
-            e = self.leakyrelu(self.a[k](cat_h))
-
-            alpha = [self.softmax(e[lo:hi]) for (lo, hi) in zip(sum_degs, sum_degs[1:])]
-            alpha = torch.cat(tuple(alpha), dim=0)
-            alpha = alpha.squeeze(1)
-            alpha = self.dropout(alpha)
-
-            adj = torch.sparse.FloatTensor(indices, alpha, torch.Size([nprime, nprime]))
-            out.append(torch.sparse.mm(adj, h)[mapped_nodes])
-
-        return out
-
-    @staticmethod
-    def cal_attention(x, y):
-        att_x = torch.mean(x, dim=1).reshape(-1, 1)
-        att_y = torch.mean(y, dim=1).reshape(-1, 1)
-        att = att_x.mm(torch.t(att_y))
-        return (
-            torch.mean(
-                x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
-                * y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1),
-                dim=2,
-            )
-            - att
-        )
diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py
index f8951509a..1d1c0c986 100755
--- a/qlib/contrib/model/pytorch_lstm.py
+++ b/qlib/contrib/model/pytorch_lstm.py
@@ -11,7 +11,12 @@ import pandas as pd
 import copy
 from sklearn.metrics import roc_auc_score, mean_squared_error
 import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...utils import (
+    unpack_archive_with_buffer,
+    save_multiple_parts_file,
+    create_save_path,
+    drop_nan_by_y_index,
+)
 from ...log import get_module_logger, TimeInspector
 
 import torch
@@ -109,14 +114,19 @@ class LSTM(Model):
         )
 
         self.lstm_model = LSTMModel(
-            d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
+            d_feat=self.d_feat,
+            hidden_size=self.hidden_size,
+            num_layers=self.num_layers,
+            dropout=self.dropout,
         )
         if optimizer.lower() == "adam":
             self.train_optimizer = optim.Adam(self.lstm_model.parameters(), lr=self.lr)
         elif optimizer.lower() == "gd":
             self.train_optimizer = optim.SGD(self.lstm_model.parameters(), lr=self.lr)
         else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+            raise NotImplementedError(
+                "optimizer {} is not supported!".format(optimizer)
+            )
 
         self._fitted = False
         if self.use_gpu:
@@ -141,7 +151,7 @@ class LSTM(Model):
 
         mask = torch.isfinite(label)
 
-        if self.metric == "" or self.metric == "loss":  # use loss
+        if self.metric == "" or self.metric == "loss":
             return -self.loss_fn(pred[mask], label[mask])
 
         raise ValueError("unknown metric `%s`" % self.metric)
@@ -161,8 +171,12 @@ class LSTM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
-            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_train_values[indices[i : i + self.batch_size]]
+            ).float()
+            label = torch.from_numpy(
+                y_train_values[indices[i : i + self.batch_size]]
+            ).float()
 
             if self.use_gpu:
                 feature = feature.cuda()
@@ -194,7 +208,9 @@ class LSTM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
+            feature = torch.from_numpy(
+                x_values[indices[i : i + self.batch_size]]
+            ).float()
             label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
 
             if self.use_gpu:
@@ -219,7 +235,9 @@ class LSTM(Model):
     ):
 
         df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid", "test"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
 
         x_train, y_train = df_train["feature"], df_train["label"]
diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py
index 1d27f3927..bebc408a8 100644
--- a/qlib/contrib/model/pytorch_sfm.py
+++ b/qlib/contrib/model/pytorch_sfm.py
@@ -19,7 +19,12 @@ import pandas as pd
 import copy
 from sklearn.metrics import roc_auc_score, mean_squared_error
 import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...utils import (
+    unpack_archive_with_buffer,
+    save_multiple_parts_file,
+    create_save_path,
+    drop_nan_by_y_index,
+)
 from ...log import get_module_logger, TimeInspector
 
 import torch
@@ -33,7 +38,16 @@ from ...data.dataset.handler import DataHandlerLP
 
 
 class SFM_Model(nn.Module):
-    def __init__(self, d_feat=6, output_dim=1, freq_dim=10, hidden_size=64, dropout_W=0.0, dropout_U=0.0, device="cpu"):
+    def __init__(
+        self,
+        d_feat=6,
+        output_dim=1,
+        freq_dim=10,
+        hidden_size=64,
+        dropout_W=0.0,
+        dropout_U=0.0,
+        device="cpu",
+    ):
         super().__init__()
 
         self.input_dim = d_feat
@@ -42,30 +56,52 @@ class SFM_Model(nn.Module):
         self.hidden_dim = hidden_size
         self.device = device
 
-        self.W_i = nn.Parameter(init.xavier_uniform_(torch.empty((self.input_dim, self.hidden_dim))))
-        self.U_i = nn.Parameter(init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim)))
+        self.W_i = nn.Parameter(
+            init.xavier_uniform_(torch.empty((self.input_dim, self.hidden_dim)))
+        )
+        self.U_i = nn.Parameter(
+            init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim))
+        )
         self.b_i = nn.Parameter(torch.zeros(self.hidden_dim))
 
-        self.W_ste = nn.Parameter(init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim)))
-        self.U_ste = nn.Parameter(init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim)))
+        self.W_ste = nn.Parameter(
+            init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim))
+        )
+        self.U_ste = nn.Parameter(
+            init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim))
+        )
         self.b_ste = nn.Parameter(torch.ones(self.hidden_dim))
 
-        self.W_fre = nn.Parameter(init.xavier_uniform_(torch.empty(self.input_dim, self.freq_dim)))
-        self.U_fre = nn.Parameter(init.orthogonal_(torch.empty(self.hidden_dim, self.freq_dim)))
+        self.W_fre = nn.Parameter(
+            init.xavier_uniform_(torch.empty(self.input_dim, self.freq_dim))
+        )
+        self.U_fre = nn.Parameter(
+            init.orthogonal_(torch.empty(self.hidden_dim, self.freq_dim))
+        )
         self.b_fre = nn.Parameter(torch.ones(self.freq_dim))
 
-        self.W_c = nn.Parameter(init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim)))
-        self.U_c = nn.Parameter(init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim)))
+        self.W_c = nn.Parameter(
+            init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim))
+        )
+        self.U_c = nn.Parameter(
+            init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim))
+        )
         self.b_c = nn.Parameter(torch.zeros(self.hidden_dim))
 
-        self.W_o = nn.Parameter(init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim)))
-        self.U_o = nn.Parameter(init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim)))
+        self.W_o = nn.Parameter(
+            init.xavier_uniform_(torch.empty(self.input_dim, self.hidden_dim))
+        )
+        self.U_o = nn.Parameter(
+            init.orthogonal_(torch.empty(self.hidden_dim, self.hidden_dim))
+        )
         self.b_o = nn.Parameter(torch.zeros(self.hidden_dim))
 
         self.U_a = nn.Parameter(init.orthogonal_(torch.empty(self.freq_dim, 1)))
         self.b_a = nn.Parameter(torch.zeros(self.hidden_dim))
 
-        self.W_p = nn.Parameter(init.xavier_uniform_(torch.empty(self.hidden_dim, self.output_dim)))
+        self.W_p = nn.Parameter(
+            init.xavier_uniform_(torch.empty(self.hidden_dim, self.output_dim))
+        )
         self.b_p = nn.Parameter(torch.zeros(self.output_dim))
 
         self.activation = nn.Tanh()
@@ -101,8 +137,12 @@ class SFM_Model(nn.Module):
             x_o = torch.matmul(x * B_W[0], self.W_o) + self.b_o
 
             i = self.inner_activation(x_i + torch.matmul(h_tm1 * B_U[0], self.U_i))
-            ste = self.inner_activation(x_ste + torch.matmul(h_tm1 * B_U[0], self.U_ste))
-            fre = self.inner_activation(x_fre + torch.matmul(h_tm1 * B_U[0], self.U_fre))
+            ste = self.inner_activation(
+                x_ste + torch.matmul(h_tm1 * B_U[0], self.U_ste)
+            )
+            fre = self.inner_activation(
+                x_fre + torch.matmul(h_tm1 * B_U[0], self.U_fre)
+            )
 
             ste = torch.reshape(ste, (-1, self.hidden_dim, 1))
             fre = torch.reshape(fre, (-1, 1, self.freq_dim))
@@ -157,7 +197,16 @@ class SFM_Model(nn.Module):
 
         init_state_time = torch.tensor(0).to(self.device)
 
-        self.states = [init_state_p, init_state_h, init_state_S_re, init_state_S_im, init_state_time, None, None, None]
+        self.states = [
+            init_state_p,
+            init_state_h,
+            init_state_S_re,
+            init_state_S_im,
+            init_state_time,
+            None,
+            None,
+            None,
+        ]
 
     def get_constants(self, x):
         constants = []
@@ -282,7 +331,9 @@ class SFM(Model):
         elif optimizer.lower() == "gd":
             self.train_optimizer = optim.SGD(self.sfm_model.parameters(), lr=self.lr)
         else:
-            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+            raise NotImplementedError(
+                "optimizer {} is not supported!".format(optimizer)
+            )
 
         self._fitted = False
         self.sfm_model.to(self.device)
@@ -305,8 +356,16 @@ class SFM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
-            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            feature = (
+                torch.from_numpy(x_values[indices[i : i + self.batch_size]])
+                .float()
+                .to(self.device)
+            )
+            label = (
+                torch.from_numpy(y_values[indices[i : i + self.batch_size]])
+                .float()
+                .to(self.device)
+            )
 
             pred = self.sfm_model(feature)
             loss = self.loss_fn(pred, label)
@@ -332,8 +391,16 @@ class SFM(Model):
             if len(indices) - i < self.batch_size:
                 break
 
-            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
-            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            feature = (
+                torch.from_numpy(x_train_values[indices[i : i + self.batch_size]])
+                .float()
+                .to(self.device)
+            )
+            label = (
+                torch.from_numpy(y_train_values[indices[i : i + self.batch_size]])
+                .float()
+                .to(self.device)
+            )
 
             pred = self.sfm_model(feature)
             loss = self.loss_fn(pred, label)
@@ -352,7 +419,9 @@ class SFM(Model):
     ):
 
         df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
@@ -409,7 +478,7 @@ class SFM(Model):
 
         mask = torch.isfinite(label)
 
-        if self.metric == "" or self.metric == "loss":  # use loss
+        if self.metric == "" or self.metric == "loss":
             return -self.loss_fn(pred[mask], label[mask])
 
         raise ValueError("unknown metric `%s`" % self.metric)
diff --git a/qlib/contrib/model/tabnet.py b/qlib/contrib/model/tabnet.py
deleted file mode 100644
index bc13d1f62..000000000
--- a/qlib/contrib/model/tabnet.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import numpy as np
-import pandas as pd
-from pytorch_tabnet.tab_model import TabNetRegressor
-
-from ...model.base import Model
-from ...data.dataset import DatasetH
-from ...data.dataset.handler import DataHandlerLP
-
-
-class TabNetModel(Model):
-    """TabNetModel Model"""
-
-    def __init__(
-        self,
-        n_d,
-        n_a,
-        n_steps,
-        gamma,
-        n_independent,
-        n_shared,
-        seed,
-        momentum,
-        lambda_sparse,
-        optimizer_params,
-        **kwargs
-    ):
-        self.model = None
-
-        self.n_d = n_d
-        self.n_a = n_a
-        self.n_steps = n_steps
-        self.gamma = gamma
-        self.n_independent = n_independent
-        self.n_shared = n_shared
-        self.seed = seed
-        self.momentum = momentum
-        self.lambda_sparse = lambda_sparse
-        self.optimizer_params = optimizer_params
-
-    def fit(
-        self,
-        dataset: DatasetH,
-        n_d=8,
-        n_a=8,
-        n_steps=3,
-        gamma=1.3,
-        n_independent=2,
-        n_shared=2,
-        seed=0,
-        momentum=0.02,
-        lambda_sparse=1e-3,
-        optimizer_params={"lr": 2e-3},
-        **kwargs
-    ):
-
-        df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
-        )
-        x_train, y_train = df_train["feature"].values, df_train["label"].values * 100
-        x_valid, y_valid = df_valid["feature"].values, df_valid["label"].values * 100
-
-        self.model = TabNetRegressor(
-            n_d=self.n_d,
-            n_a=self.n_a,
-            n_steps=self.n_steps,
-            gamma=self.gamma,
-            n_independent=self.n_independent,
-            n_shared=self.n_shared,
-            seed=self.seed,
-            momentum=self.momentum,
-            lambda_sparse=self.lambda_sparse,
-            optimizer_params=self.optimizer_params,
-            **kwargs
-        )
-        self.model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
-
-    def predict(self, dataset):
-        if self.model is None:
-            raise ValueError("model is not fitted yet!")
-        x_test = dataset.prepare("test", col_set="feature")
-        test_pred = self.model.predict(x_test.values)
-        return pd.Series(test_pred.reshape([-1]), index=x_test.index)
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index 039fd2c80..32d631189 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -38,14 +38,18 @@ class XGBModel(Model):
     ):
 
         df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+            ["train", "valid"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
         )
         x_train, y_train = df_train["feature"], df_train["label"]
         x_valid, y_valid = df_valid["feature"], df_valid["label"]
 
         # Lightgbm need 1D array as its label
         if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
-            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(
+                y_valid.values
+            )
         else:
             raise ValueError("XGBoost doesn't support multi-label training")
 
@@ -68,4 +72,6 @@ class XGBModel(Model):
         if self.model is None:
             raise ValueError("model is not fitted yet!")
         x_test = dataset.prepare("test", col_set="feature")
-        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)
+        return pd.Series(
+            self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index
+        )