diff --git a/examples/workflow_by_code_hats.py b/examples/workflow_by_code_hats.py
index 3ea81ba49..4476a1f32 100644
--- a/examples/workflow_by_code_hats.py
+++ b/examples/workflow_by_code_hats.py
@@ -3,24 +3,17 @@
 
 import sys
 from pathlib import Path
-
 import qlib
 import pandas as pd
 from qlib.config import REG_CN
-from qlib.contrib.model.pytorch_hats import HATS
-from qlib.contrib.data.handler import ALPHA360_Denoise
 from qlib.contrib.strategy.strategy import TopkDropoutStrategy
 from qlib.contrib.evaluate import (
     backtest as normal_backtest,
     risk_analysis,
 )
 from qlib.utils import exists_qlib_data
-
-# from qlib.model.learner import train_model
 from qlib.utils import init_instance_by_config
 
-import pickle
-
 if __name__ == "__main__":
 
     # use default data
@@ -30,7 +23,7 @@ if __name__ == "__main__":
         sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
         from get_data import GetData
 
-        GetData().qlib_data_cn(target_dir=provider_uri)
+        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
 
     qlib.init(provider_uri=provider_uri, region=REG_CN)
 
@@ -74,7 +67,7 @@ if __name__ == "__main__":
                 "loss": "mse",
                 "base_model": "LSTM",
                 "seed": 0,
-                "GPU": 0,
+                "GPU": "1",
             },
         },
         "dataset": {
@@ -97,7 +90,7 @@ if __name__ == "__main__":
         # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
     }
 
-    # model = train_model(task)
+
     model = init_instance_by_config(task["model"])
     dataset = init_instance_by_config(task["dataset"])
     model.fit(dataset, save_path="benchmarks/HATS/model_hat.pkl")
diff --git a/qlib/contrib/model/pytorch_hats.py b/qlib/contrib/model/pytorch_hats.py
index 593cef635..cdfae0284 100644
--- a/qlib/contrib/model/pytorch_hats.py
+++ b/qlib/contrib/model/pytorch_hats.py
@@ -18,10 +18,8 @@ import os
 import numpy as np
 import pandas as pd
 import copy
-from sklearn.metrics import roc_auc_score, mean_squared_error
-import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
-from ...log import get_module_logger, TimeInspector
+from ...utils import create_save_path
+from ...log import get_module_logger
 
 import torch
 import torch.nn as nn
@@ -37,14 +35,10 @@ class HATS(Model):
 
     Parameters
     ----------
-    input_dim : int
-        input dimension
-    output_dim : int
-        output dimension
-    layers : tuple
-        layer sizes
-    lr : float
-        learning rate
+    d_feat : int
+        input dimension for each time step
+    metric: str
+        the evaluate metric used in early stop
     optimizer : str
         optimizer name
     GPU : str
@@ -87,7 +81,7 @@ class HATS(Model):
         self.optimizer = optimizer.lower()
         self.loss = loss
         self.base_model = base_model
-        self.with_pretrain = with_pretrain  #### True if train HATS with pretrained base model
+        self.with_pretrain = with_pretrain
         self.visible_GPU = GPU
         self.use_gpu = torch.cuda.is_available()
         self.seed = seed
@@ -106,7 +100,7 @@ class HATS(Model):
             "\noptimizer : {}"
             "\nloss_type : {}"
             "\nbase_model : {}"
-            "\nwith_pretrain : {}"  ##### debug
+            "\nwith_pretrain : {}"
             "\nvisible_GPU : {}"
             "\nuse_GPU : {}"
             "\nseed : {}".format(
@@ -122,17 +116,13 @@ class HATS(Model):
                 optimizer.lower(),
                 loss,
                 base_model,
-                with_pretrain,  ### debug
+                with_pretrain,
                 GPU,
                 self.use_gpu,
                 seed,
             )
         )
 
-        if loss not in {"mse", "binary"}:
-            raise NotImplementedError("loss {} is not supported!".format(loss))
-        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
-
         self.HATS_model = HATSModel(
             d_feat=self.d_feat,
             hidden_size=self.hidden_size,
@@ -167,7 +157,6 @@ class HATS(Model):
         raise ValueError("unknown loss `%s`" % self.loss)
 
     def metric_fn(self, pred, label):
-
         mask = torch.isfinite(label)
         if self.metric == "IC":
             return self.cal_ic(pred[mask], label[mask])
@@ -212,7 +201,7 @@ class HATS(Model):
 
     def test_epoch(self, data_x, data_y):
 
-        # prepare training data
+        # prepare testing data
         x_values = data_x.values
         y_values = np.squeeze(data_y.values)
 
@@ -222,7 +211,6 @@ class HATS(Model):
         losses = []
 
         indices = np.arange(len(x_values))
-        np.random.shuffle(indices)
 
         for i in range(len(indices))[:: self.batch_size]:
 
@@ -263,7 +251,6 @@ class HATS(Model):
         if save_path == None:
             save_path = create_save_path(save_path)
         stop_steps = 0
-        train_loss = 0
         best_score = -np.inf
         best_epoch = 0
         evals_result["train"] = []
@@ -271,31 +258,24 @@ class HATS(Model):
 
         # load pretrained base_model
         if self.with_pretrain:
-            self.logger.info("loading pretrained model...")
+            self.logger.info("Loading pretrained model...")
             if self.base_model == "LSTM":
                 from ...contrib.model.pytorch_lstm import LSTMModel
-
                 pretrained_model = LSTMModel()
                 pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
             elif self.base_model == "GRU":
                 from ...contrib.model.pytorch_gru import GRUModel
-
                 pretrained_model = GRUModel()
                 pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
             model_dict = self.HATS_model.state_dict()
-
-            # filter unnecessary parameters
             pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
-            # overwrite entries in the existing state dict
             model_dict.update(pretrained_dict)
-            # load the new state dict
             self.HATS_model.load_state_dict(model_dict)
-            self.logger.info("loading pretrained model Done...")
+            self.logger.info("Loading pretrained model Done...")
 
         # train
         self.logger.info("training...")
         self._fitted = True
-        # return
 
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
@@ -447,20 +427,20 @@ class GraphAttention(nn.Module):
         self.softmax = nn.Softmax(dim=0)
         self.leakyrelu = nn.LeakyReLU()
 
-    def forward(self, features, nodes, mapping, rows):
+    def forward(self, features, nodes, mappings, rows):
 
         """
         Parameters
         ----------
         features : torch.Tensor
             An (n' x input_dim) tensor of input node features.
-        node_layers : list of numpy array
-            node_layers[i] is an array of the nodes in the ith layer of the
+        nodes : list of numpy array
+            nodes[i] is an array of the nodes in the ith layer of the
             computation graph.
         mappings : list of dictionary
-            mappings[i] is a dictionary mapping node v (labelled 0 to |V|-1)
-            in node_layers[i] to its position in node_layers[i]. For example,
-            if node_layers[i] = [2,5], then mappings[i][2] = 0 and
+            mappings[i] is a dictionary mappings node v (labelled 0 to |V|-1)
+            in nodes[i] to its position in nodes[i]. For example,
+            if nodes[i] = [2,5], then mappings[i][2] = 0 and
             mappings[i][5] = 1.
         rows : numpy array
             rows[i] is an array of neighbors of node i.
@@ -471,9 +451,9 @@ class GraphAttention(nn.Module):
         """
 
         nprime = features.shape[0]
-        rows = [np.array([mapping[v] for v in row], dtype=np.int64) for row in rows]
+        rows = [np.array([mappings[v] for v in row], dtype=np.int64) for row in rows]
         sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows])))
-        mapped_nodes = [mapping[v] for v in nodes]
+        mapped_nodes = [mappings[v] for v in nodes]
         indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t()
 
         out = []
@@ -481,7 +461,7 @@ class GraphAttention(nn.Module):
             h = self.fcs[k](features)
 
             nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0)
-            self_h = torch.cat(tuple([h[mapping[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0)
+            self_h = torch.cat(tuple([h[mappings[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0)
             cat_h = torch.cat((self_h, nbr_h), dim=1)
 
             e = self.leakyrelu(self.a[k](cat_h))
@@ -496,13 +476,11 @@ class GraphAttention(nn.Module):
 
         return out
 
+    @staticmethod
     def cal_attention(x, y):
-
         att_x = torch.mean(x, dim=1).reshape(-1, 1)
         att_y = torch.mean(y, dim=1).reshape(-1, 1)
         att = att_x.mm(torch.t(att_y))
-        x_att = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
-        y_att = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1)
         return (
             torch.mean(
                 x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)