diff --git a/README.md b/README.md
index 4383dea26..cd0c8542f 100644
--- a/README.md
+++ b/README.md
@@ -196,10 +196,12 @@ Here is a list of models built on `Qlib`.
 - [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py)
 - [GRU based on pytorch](qlib/contrib/model/pytorch_gru.py)
 - [LSTM based on pytorcn](qlib/contrib/model/pytorch_lstm.py)
+- [ALSTM based on pytorcn](qlib/contrib/model/pytorch_alstm.py)
 - [GATs based on pytorch](qlib/contrib/model/pytorch_gats.py)
 - [TabNet based on pytorch](qlib/contrib/model/tabnet.py)
 - [SFM based on pytorch](qlib/contrib/model/pytorch_sfm.py)
-<!-- - [TFT based on tensorflow](examples/benchmarks/TFT/tft.py) -->
+- [HATs based on pytorch](qlib/contrib/model/pytorch_hats.py)
+- [TFT based on tensorflow](examples/benchmarks/TFT/tft.py)
 
 Your PR of new Quant models is highly welcomed.
 
diff --git a/examples/benchmarks/ALSTM/requirements.txt b/examples/benchmarks/ALSTM/requirements.txt
new file mode 100644
index 000000000..1fc2779c0
--- /dev/null
+++ b/examples/benchmarks/ALSTM/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.17.4
+pandas==1.1.2
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm.yaml
new file mode 100644
index 000000000..bb35b6da5
--- /dev/null
+++ b/examples/benchmarks/ALSTM/workflow_config_alstm.yaml
@@ -0,0 +1,69 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: ALSTM
+        module_path: qlib.contrib.model.pytorch_alstm
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.0
+            n_epochs: 200
+            lr: 1e-3
+            early_stop: 20
+            batch_size: 800
+            metric: IC
+            loss: mse
+            seed: 0
+            GPU: 0
+            rnn_type: GRU
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            ana_long_short: False
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/CatBoost/README.md b/examples/benchmarks/CatBoost/README.md
new file mode 100644
index 000000000..5e4f3966f
--- /dev/null
+++ b/examples/benchmarks/CatBoost/README.md
@@ -0,0 +1,3 @@
+# CatBoost
+* Code: [https://github.com/catboost/catboost](https://github.com/catboost/catboost)
+* Paper: CatBoost: unbiased boosting with categorical features. [https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf](https://proceedings.neurips.cc/paper/2018/file/14491b756b3a51daac41c24863285549-Paper.pdf).
\ No newline at end of file
diff --git a/examples/benchmarks/DNN/workflow_config_dnn.yaml b/examples/benchmarks/DNN/workflow_config_dnn.yaml
index e853726ca..0f9ae7254 100644
--- a/examples/benchmarks/DNN/workflow_config_dnn.yaml
+++ b/examples/benchmarks/DNN/workflow_config_dnn.yaml
@@ -30,7 +30,7 @@ task:
         module_path: qlib.contrib.model.pytorch_nn
         kwargs:
             loss: mse
-            input_dim: 360
+            input_dim: 158
             output_dim: 1
             lr: 0.002
             lr_decay: 0.96
diff --git a/examples/benchmarks/GATs/worflow_config_gats.yaml b/examples/benchmarks/GATs/workflow_config_gats.yaml
similarity index 95%
rename from examples/benchmarks/GATs/worflow_config_gats.yaml
rename to examples/benchmarks/GATs/workflow_config_gats.yaml
index 84eeff4db..37bced99d 100644
--- a/examples/benchmarks/GATs/worflow_config_gats.yaml
+++ b/examples/benchmarks/GATs/workflow_config_gats.yaml
@@ -37,9 +37,10 @@ task:
             lr: 1e-3
             early_stop: 20
             batch_size: 800
-            metric: IC
+            metric: loss
             loss: mse
-            base_model: GRU
+            base_model: LSTM
+            with_pretrain: True
             seed: 0
             GPU: 0
     dataset:
diff --git a/examples/benchmarks/GRU/model_gru_csi300.pkl b/examples/benchmarks/GRU/model_gru_csi300.pkl
new file mode 100644
index 000000000..46347ce8c
Binary files /dev/null and b/examples/benchmarks/GRU/model_gru_csi300.pkl differ
diff --git a/examples/benchmarks/HATS/requirements.txt b/examples/benchmarks/HATS/requirements.txt
new file mode 100644
index 000000000..16de0a438
--- /dev/null
+++ b/examples/benchmarks/HATS/requirements.txt
@@ -0,0 +1,4 @@
+pandas==1.1.2
+numpy==1.17.4
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/HATS/worflow_config_hats.yaml b/examples/benchmarks/HATS/worflow_config_hats.yaml
new file mode 100644
index 000000000..a7ab0d2d7
--- /dev/null
+++ b/examples/benchmarks/HATS/worflow_config_hats.yaml
@@ -0,0 +1,64 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: HATS
+        module_path: qlib.contrib.model.pytorch_gats
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.6
+            n_epochs: 200
+            lr: 1e-3
+            early_stop: 20
+            batch_size: 800
+            metric: IC
+            loss: mse
+            base_model: GRU
+            seed: 0
+            GPU: 0
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/LSTM/model_lstm_csi300.pkl b/examples/benchmarks/LSTM/model_lstm_csi300.pkl
new file mode 100644
index 000000000..ff7fee450
Binary files /dev/null and b/examples/benchmarks/LSTM/model_lstm_csi300.pkl differ
diff --git a/examples/benchmarks/LightGBM/README.md b/examples/benchmarks/LightGBM/README.md
new file mode 100644
index 000000000..13f408d5f
--- /dev/null
+++ b/examples/benchmarks/LightGBM/README.md
@@ -0,0 +1,4 @@
+# LightGBM
+* Code: [https://github.com/microsoft/LightGBM](https://github.com/microsoft/LightGBM)
+* Paper: LightGBM: A Highly Efficient Gradient Boosting
+Decision Tree. [https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf](https://proceedings.neurips.cc/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf).
\ No newline at end of file
diff --git a/examples/benchmarks/SFM/README.md b/examples/benchmarks/SFM/README.md
new file mode 100644
index 000000000..06ca50485
--- /dev/null
+++ b/examples/benchmarks/SFM/README.md
@@ -0,0 +1,4 @@
+# State-Frequency-Memory
+- State Frequency Memory (SFM) is a novel recurrent network that uses Discrete Fourier Transform (DFT) to decompose the hidden states of memory cells and capture the multi-frequency trading patterns from past market data to make stock price predictions. 
+- The code used in Qlib is a pyTorch implementation of SFM (Zhang, L., Aggarwal, C., & Qi, G. J. (2017,)).
+- Paper: Stock Price Prediction via Discovering Multi-Frequency Trading Patterns. https://www.cs.ucf.edu/~gqi/publications/kdd2017_stock.pdf.
\ No newline at end of file
diff --git a/examples/benchmarks/TabNet/README.md b/examples/benchmarks/TabNet/README.md
new file mode 100644
index 000000000..3a233df46
--- /dev/null
+++ b/examples/benchmarks/TabNet/README.md
@@ -0,0 +1,4 @@
+# TabNet
+* TabNet is a novel high-performance and interpretable canonical deep tabular data learning architectur. TabNet uses sequential attention to choose which features to reason from at each decision step, enabling interpretability and more effcient learning as the learning capacity is used for the most salient features.
+* The code used in Qlib is a pyTorch implementation of Tabnet (Arik, S. O., & Pfister, T. (2019). [https://github.com/dreamquark-ai/tabnet](https://github.com/dreamquark-ai/tabnet)
+* Paper: TabNet: Attentive Interpretable Tabular Learning. [https://arxiv.org/pdf/1908.07442.pdf](https://arxiv.org/pdf/1908.07442.pdf).
\ No newline at end of file
diff --git a/examples/benchmarks/XGBoost/README.md b/examples/benchmarks/XGBoost/README.md
new file mode 100644
index 000000000..33e04b23b
--- /dev/null
+++ b/examples/benchmarks/XGBoost/README.md
@@ -0,0 +1,3 @@
+# XGBoost
+* Code: [https://github.com/dmlc/xgboost](https://github.com/dmlc/xgboost)
+* Paper: XGBoost: A Scalable Tree Boosting System. [https://dl.acm.org/doi/pdf/10.1145/2939672.2939785](https://dl.acm.org/doi/pdf/10.1145/2939672.2939785).
\ No newline at end of file
diff --git a/examples/workflow_by_code_alstm.py b/examples/workflow_by_code_alstm.py
new file mode 100644
index 000000000..eabce3b07
--- /dev/null
+++ b/examples/workflow_by_code_alstm.py
@@ -0,0 +1,145 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+
+import qlib
+import pandas as pd
+from qlib.config import REG_CN
+from qlib.contrib.model.pytorch_alstm import ALSTM
+from qlib.contrib.data.handler import ALPHA360_Denoise
+from qlib.contrib.strategy.strategy import TopkDropoutStrategy
+from qlib.contrib.evaluate import (
+    backtest as normal_backtest,
+    risk_analysis,
+)
+from qlib.utils import exists_qlib_data
+
+# from qlib.model.learner import train_model
+from qlib.utils import init_instance_by_config
+
+import pickle
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    MARKET = "csi300"
+    BENCHMARK = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    DATA_HANDLER_CONFIG = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": MARKET,
+    }
+
+    TRAINER_CONFIG = {
+        "train_start_time": "2008-01-01",
+        "train_end_time": "2014-12-31",
+        "validate_start_time": "2015-01-01",
+        "validate_end_time": "2016-12-31",
+        "test_start_time": "2017-01-01",
+        "test_end_time": "2020-08-01",
+    }
+
+    task = {
+        "model": {
+            "class": "ALSTM",
+            "module_path": "qlib.contrib.model.pytorch_alstm",
+            "kwargs": {
+                "d_feat": 6,
+                "hidden_size": 64,
+                "num_layers": 2,
+                "dropout": 0.0,
+                "n_epochs": 200,
+                "lr": 1e-3,
+                "early_stop": 20,
+                "batch_size": 800,
+                "metric": "IC",
+                "loss": "mse",
+                "seed": 0,
+                "GPU": 0,
+                "rnn_type": "GRU",
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "ALPHA360_Denoise",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": DATA_HANDLER_CONFIG,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        }
+        # You shoud record the data in specific sequence
+        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
+    }
+
+    # model = train_model(task)
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+    model.fit(dataset)
+
+    pred_score = model.predict(dataset)
+
+    # save pred_score to file
+    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
+    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
+    pred_score.to_pickle(pred_score_path)
+
+    ###################################
+    # backtest
+    ###################################
+    STRATEGY_CONFIG = {
+        "topk": 50,
+        "n_drop": 5,
+    }
+    BACKTEST_CONFIG = {
+        "verbose": False,
+        "limit_threshold": 0.095,
+        "account": 100000000,
+        "benchmark": BENCHMARK,
+        "deal_price": "close",
+        "open_cost": 0.0005,
+        "close_cost": 0.0015,
+        "min_cost": 5,
+    }
+
+    # use default strategy
+    # custom Strategy, refer to: TODO: Strategy API url
+    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
+    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
+
+    ###################################
+    # analyze
+    # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
+    ###################################
+    analysis = dict()
+    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
+    analysis["excess_return_with_cost"] = risk_analysis(
+        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
+    )
+    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
+    print(analysis_df)
diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py
index 6b15b77b4..3bb4edf08 100644
--- a/examples/workflow_by_code_gats.py
+++ b/examples/workflow_by_code_gats.py
@@ -70,9 +70,10 @@ if __name__ == "__main__":
                 "lr": 1e-3,
                 "early_stop": 20,
                 "batch_size": 800,
-                "metric": "IC",
+                "metric": "loss",
                 "loss": "mse",
-                "base_model": "GRU",
+                "base_model": "LSTM",
+                "with_pretrain": True,
                 "seed": 0,
                 "GPU": 0,
             },
diff --git a/examples/workflow_by_code_hats.py b/examples/workflow_by_code_hats.py
new file mode 100644
index 000000000..3ea81ba49
--- /dev/null
+++ b/examples/workflow_by_code_hats.py
@@ -0,0 +1,145 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+
+import qlib
+import pandas as pd
+from qlib.config import REG_CN
+from qlib.contrib.model.pytorch_hats import HATS
+from qlib.contrib.data.handler import ALPHA360_Denoise
+from qlib.contrib.strategy.strategy import TopkDropoutStrategy
+from qlib.contrib.evaluate import (
+    backtest as normal_backtest,
+    risk_analysis,
+)
+from qlib.utils import exists_qlib_data
+
+# from qlib.model.learner import train_model
+from qlib.utils import init_instance_by_config
+
+import pickle
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data_cn(target_dir=provider_uri)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    MARKET = "csi300"
+    BENCHMARK = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    DATA_HANDLER_CONFIG = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": MARKET,
+    }
+
+    TRAINER_CONFIG = {
+        "train_start_time": "2008-01-01",
+        "train_end_time": "2014-12-31",
+        "validate_start_time": "2015-01-01",
+        "validate_end_time": "2016-12-31",
+        "test_start_time": "2017-01-01",
+        "test_end_time": "2020-08-01",
+    }
+
+    task = {
+        "model": {
+            "class": "HATS",
+            "module_path": "qlib.contrib.model.pytorch_hats",
+            "kwargs": {
+                "d_feat": 6,
+                "hidden_size": 64,
+                "num_layers": 2,
+                "dropout": 0.6,
+                "n_epochs": 200,
+                "lr": 1e-3,
+                "early_stop": 20,
+                "batch_size": 800,
+                "metric": "IC",
+                "loss": "mse",
+                "base_model": "LSTM",
+                "seed": 0,
+                "GPU": 0,
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "ALPHA360_Denoise",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": DATA_HANDLER_CONFIG,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        }
+        # You shoud record the data in specific sequence
+        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
+    }
+
+    # model = train_model(task)
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+    model.fit(dataset, save_path="benchmarks/HATS/model_hat.pkl")
+
+    pred_score = model.predict(dataset)
+
+    # save pred_score to file
+    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
+    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
+    pred_score.to_pickle(pred_score_path)
+
+    ###################################
+    # backtest
+    ###################################
+    STRATEGY_CONFIG = {
+        "topk": 50,
+        "n_drop": 5,
+    }
+    BACKTEST_CONFIG = {
+        "verbose": False,
+        "limit_threshold": 0.095,
+        "account": 100000000,
+        "benchmark": BENCHMARK,
+        "deal_price": "close",
+        "open_cost": 0.0005,
+        "close_cost": 0.0015,
+        "min_cost": 5,
+    }
+
+    # use default strategy
+    # custom Strategy, refer to: TODO: Strategy API url
+    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
+    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
+
+    ###################################
+    # analyze
+    # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
+    ###################################
+    analysis = dict()
+    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
+    analysis["excess_return_with_cost"] = risk_analysis(
+        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
+    )
+    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
+    print(analysis_df)
diff --git a/qlib/contrib/evaluate.py b/qlib/contrib/evaluate.py
index a9b08719a..2b85f1a9b 100644
--- a/qlib/contrib/evaluate.py
+++ b/qlib/contrib/evaluate.py
@@ -190,7 +190,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
     Parameters
     ----------
 
-    # backtest workflow related or commmon arguments
+    - **backtest workflow related or commmon arguments**
+
     pred : pandas.DataFrame
         predict should has <datetime, instrument> index and one `score` column
     account : float
@@ -202,7 +203,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
     verbose : bool
         whether to print log
 
-    # strategy related arguments
+    - **strategy related arguments**
+
     strategy : Strategy()
         strategy used in backtest
     topk : int (Default value: 50)
@@ -225,7 +227,8 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
     str_type: 'amount', 'weight' or 'dropout'
         strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy
 
-    # exchange related arguments
+    - **exchange related arguments**
+
     exchange: Exchange()
         pass the exchange for speeding up.
     subscribe_fields: list
diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py
index e487a6d1e..bba006c35 100644
--- a/qlib/contrib/model/catboost_model.py
+++ b/qlib/contrib/model/catboost_model.py
@@ -1,3 +1,15 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import pandas as pd
 from catboost import Pool, CatBoost
diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py
new file mode 100644
index 000000000..bdf1e3ea0
--- /dev/null
+++ b/qlib/contrib/model/pytorch_alstm.py
@@ -0,0 +1,394 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+import pandas as pd
+import copy
+from sklearn.metrics import roc_auc_score, mean_squared_error
+import logging
+from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...log import get_module_logger, TimeInspector
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class ALSTM(Model):
+    """ALSTM Model
+
+    Parameters
+    ----------
+    input_dim : int
+        input dimension
+    output_dim : int
+        output dimension
+    layers : tuple
+        layer sizes
+    lr : float
+        learning rate
+    optimizer : str
+        optimizer name
+    GPU : str
+        the GPU ID(s) used for training
+    """
+
+    def __init__(
+        self,
+        d_feat=6,
+        hidden_size=64,
+        num_layers=2,
+        dropout=0.0,
+        n_epochs=200,
+        lr=0.001,
+        metric="IC",
+        batch_size=2000,
+        early_stop=20,
+        loss="mse",
+        optimizer="adam",
+        GPU="0",
+        seed=0,
+        rnn_type="GRU",
+        **kwargs
+    ):
+        # Set logger.
+        self.logger = get_module_logger("ALSTM")
+        self.logger.info("ALSTM pytorch version...")
+
+        # set hyper-parameters.
+        self.d_feat = d_feat
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.metric = metric
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.optimizer = optimizer.lower()
+        self.loss = loss
+        self.visible_GPU = GPU
+        self.use_gpu = torch.cuda.is_available()
+        self.seed = seed
+        self.rnn_type = rnn_type
+
+        self.logger.info(
+            "ALSTM parameters setting:"
+            "\nd_feat : {}"
+            "\nhidden_size : {}"
+            "\nnum_layers : {}"
+            "\ndropout : {}"
+            "\nn_epochs : {}"
+            "\nlr : {}"
+            "\nmetric : {}"
+            "\nbatch_size : {}"
+            "\nearly_stop : {}"
+            "\noptimizer : {}"
+            "\nloss_type : {}"
+            "\nvisible_GPU : {}"
+            "\nuse_GPU : {}"
+            "\nseed : {}"
+            "\nrnn_type : {}".format(
+                d_feat,
+                hidden_size,
+                num_layers,
+                dropout,
+                n_epochs,
+                lr,
+                metric,
+                batch_size,
+                early_stop,
+                optimizer.lower(),
+                loss,
+                GPU,
+                self.use_gpu,
+                seed,
+                self.rnn_type,
+            )
+        )
+
+        if loss not in {"mse", "binary"}:
+            raise NotImplementedError("loss {} is not supported!".format(loss))
+        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
+
+        self.alstm_model = ALSTMModel(
+            d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
+        )
+        # def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, input_day=20, rnn_type="GRU"):
+
+        if optimizer.lower() == "adam":
+            self.train_optimizer = optim.Adam(self.alstm_model.parameters(), lr=self.lr)
+        elif optimizer.lower() == "gd":
+            self.train_optimizer = optim.SGD(self.alstm_model.parameters(), lr=self.lr)
+        else:
+            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+        self._fitted = False
+        if self.use_gpu:
+            self.alstm_model.cuda()
+            # set the visible GPU
+            if self.visible_GPU:
+                os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
+
+    def mse(self, pred, label):
+        loss = (pred - label) ** 2
+        return torch.mean(loss)
+
+    def loss_fn(self, pred, label):
+        mask = ~torch.isnan(label)
+
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+
+    def metric_fn(self, pred, label):
+
+        mask = torch.isfinite(label)
+        if self.metric == "IC":
+            return self.cal_ic(pred[mask], label[mask])
+
+        if self.metric == "" or self.metric == "loss":  # use loss
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def cal_ic(self, pred, label):
+        return torch.mean(pred * label)
+
+    def train_epoch(self, x_train, y_train):
+
+        x_train_values = x_train.values
+        y_train_values = np.squeeze(y_train.values) * 100
+
+        self.alstm_model.train()
+
+        indices = np.arange(len(x_train_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
+            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+
+            if self.use_gpu:
+                feature = feature.cuda()
+                label = label.cuda()
+
+            pred = self.alstm_model(feature)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.alstm_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_x, data_y):
+
+        # prepare training data
+        x_values = data_x.values
+        y_values = np.squeeze(data_y.values)
+
+        self.alstm_model.eval()
+
+        scores = []
+        losses = []
+
+        indices = np.arange(len(x_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
+            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
+
+            if self.use_gpu:
+                feature = feature.cuda()
+                label = label.cuda()
+
+            pred = self.alstm_model(feature)
+            loss = self.loss_fn(pred, label)
+            losses.append(loss.item())
+
+            score = self.metric_fn(pred, label)
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        evals_result=dict(),
+        verbose=True,
+        save_path=None,
+    ):
+
+        df_train, df_valid, df_test = dataset.prepare(
+            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        if save_path == None:
+            save_path = create_save_path(save_path)
+        stop_steps = 0
+        train_loss = 0
+        best_score = -np.inf
+        best_epoch = 0
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # train
+        self.logger.info("training...")
+        self._fitted = True
+        # return
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(x_train, y_train)
+            self.logger.info("evaluating...")
+            train_loss, train_score = self.test_epoch(x_train, y_train)
+            val_loss, val_score = self.test_epoch(x_valid, y_valid)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.alstm_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.alstm_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset):
+        if not self._fitted:
+            raise ValueError("model is not fitted yet!")
+
+        x_test = dataset.prepare("test", col_set="feature")
+        index = x_test.index
+        self.alstm_model.eval()
+        x_values = x_test.values
+        sample_num = x_values.shape[0]
+        preds = []
+
+        for begin in range(sample_num)[:: self.batch_size]:
+
+            if sample_num - begin < self.batch_size:
+                end = sample_num
+            else:
+                end = begin + self.batch_size
+
+            x_batch = torch.from_numpy(x_values[begin:end]).float()
+
+            if self.use_gpu:
+                x_batch = x_batch.cuda()
+
+            with torch.no_grad():
+                if self.use_gpu:
+                    pred = self.alstm_model(x_batch).detach().cpu().numpy()
+                else:
+                    pred = self.alstm_model(x_batch).detach().numpy()
+
+            preds.append(pred)
+
+        return pd.Series(np.concatenate(preds), index=index)
+
+
+class GRUModel(nn.Module):
+    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
+        super().__init__()
+
+        self.rnn = nn.GRU(
+            input_size=d_feat,
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+            batch_first=True,
+            dropout=dropout,
+        )
+        self.fc_out = nn.Linear(hidden_size, 1)
+
+        self.d_feat = d_feat
+
+    def forward(self, x):
+        # x: [N, F*T]
+        x = x.reshape(len(x), self.d_feat, -1)  # [N, F, T]
+        x = x.permute(0, 2, 1)  # [N, T, F]
+        out, _ = self.rnn(x)
+        return self.fc_out(out[:, -1, :]).squeeze()
+
+
+class ALSTMModel(nn.Module):
+    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, rnn_type="GRU"):
+        super().__init__()
+        self.hid_size = hidden_size
+        self.input_size = d_feat
+        self.dropout = dropout
+        self.rnn_type = rnn_type
+        self.rnn_layer = num_layers
+        self._build_model()
+
+    def _build_model(self):
+        try:
+            klass = getattr(nn, self.rnn_type.upper())
+        except:
+            raise ValueError("unknown rnn_type `%s`" % self.rnn_type)
+        self.net = nn.Sequential()
+        self.net.add_module("fc_in", nn.Linear(in_features=self.input_size, out_features=self.hid_size))
+        self.net.add_module("act", nn.Tanh())
+        self.rnn = klass(
+            input_size=self.hid_size,
+            hidden_size=self.hid_size,
+            num_layers=self.rnn_layer,
+            batch_first=True,
+            dropout=self.dropout,
+        )
+        self.fc_out = nn.Linear(in_features=self.hid_size * 2, out_features=1)
+        # self.fc_out = nn.Linear(in_features=self.hid_size, out_features=1)
+        self.att_net = nn.Sequential()
+        self.att_net.add_module("att_fc_in", nn.Linear(in_features=self.hid_size, out_features=int(self.hid_size / 2)))
+        self.att_net.add_module("att_dropout", torch.nn.Dropout(self.dropout))
+        self.att_net.add_module("att_act", nn.Tanh())
+        self.att_net.add_module("att_fc_out", nn.Linear(in_features=int(self.hid_size / 2), out_features=1, bias=False))
+        self.att_net.add_module("att_softmax", nn.Softmax(dim=1))
+
+    def forward(self, inputs):
+        # inputs: [batch_size, input_size*input_day]
+        inputs = inputs.view(len(inputs), self.input_size, -1)
+        inputs = inputs.permute(0, 2, 1)  # [batch, input_size, seq_len] -> [batch, seq_len, input_size]
+        rnn_out, _ = self.rnn(self.net(inputs))  # [batch, seq_len, num_directions * hidden_size]
+        attention_score = self.att_net(rnn_out)  # [batch, seq_len, 1]
+        out_att = torch.mul(rnn_out, attention_score)
+        out_att = torch.sum(out_att, dim=1)
+        out = self.fc_out(
+            torch.cat((rnn_out[:, -1, :], out_att), dim=1)
+        )  # [batch, seq_len, num_directions * hidden_size] -> [batch, 1]
+        # out = self.fc_out(rnn_out[:, -1, :] + out_att)
+        return out[..., 0]
diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py
index 22ed6812d..07af4eda4 100755
--- a/qlib/contrib/model/pytorch_gats.py
+++ b/qlib/contrib/model/pytorch_gats.py
@@ -55,6 +55,7 @@ class GAT(Model):
         early_stop=20,
         loss="mse",
         base_model="GRU",
+        with_pretrain=True,
         optimizer="adam",
         GPU="0",
         seed=0,
@@ -77,6 +78,7 @@ class GAT(Model):
         self.optimizer = optimizer.lower()
         self.loss = loss
         self.base_model = base_model
+        self.with_pretrain = with_pretrain
         self.visible_GPU = GPU
         self.use_gpu = torch.cuda.is_available()
         self.seed = seed
@@ -95,6 +97,7 @@ class GAT(Model):
             "\noptimizer : {}"
             "\nloss_type : {}"
             "\nbase_model : {}"
+            "\nwith_pretrain : {}"
             "\nvisible_GPU : {}"
             "\nuse_GPU : {}"
             "\nseed : {}".format(
@@ -110,6 +113,7 @@ class GAT(Model):
                 optimizer.lower(),
                 loss,
                 base_model,
+                with_pretrain,
                 GPU,
                 self.use_gpu,
                 seed,
@@ -256,6 +260,25 @@ class GAT(Model):
         evals_result["train"] = []
         evals_result["valid"] = []
 
+        # load pretrained base_model
+        if self.with_pretrain:
+            self.logger.info("Loading pretrained model...")
+            if self.base_model == "LSTM":
+                from ...contrib.model.pytorch_lstm import LSTMModel
+
+                pretrained_model = LSTMModel()
+                pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
+            elif self.base_model == "GRU":
+                from ...contrib.model.pytorch_gru import GRUModel
+
+                pretrained_model = GRUModel()
+                pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
+            model_dict = self.GAT_model.state_dict()
+            pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
+            model_dict.update(pretrained_dict)
+            self.GAT_model.load_state_dict(model_dict)
+            self.logger.info("Loading pretrained model Done...")
+
         # train
         self.logger.info("training...")
         self._fitted = True
diff --git a/qlib/contrib/model/pytorch_hats.py b/qlib/contrib/model/pytorch_hats.py
new file mode 100644
index 000000000..7b4307e25
--- /dev/null
+++ b/qlib/contrib/model/pytorch_hats.py
@@ -0,0 +1,504 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+import pandas as pd
+import copy
+from sklearn.metrics import roc_auc_score, mean_squared_error
+import logging
+from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
+from ...log import get_module_logger, TimeInspector
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class HATS(Model):
+    """HATS Model
+
+    Parameters
+    ----------
+    input_dim : int
+        input dimension
+    output_dim : int
+        output dimension
+    layers : tuple
+        layer sizes
+    lr : float
+        learning rate
+    optimizer : str
+        optimizer name
+    GPU : str
+        the GPU ID(s) used for training
+    """
+
+    def __init__(
+        self,
+        d_feat=6,
+        hidden_size=64,
+        num_layers=2,
+        dropout=0.5,
+        n_epochs=200,
+        lr=0.01,
+        metric="IC",
+        batch_size=800,
+        early_stop=20,
+        loss="mse",
+        base_model="GRU",
+        with_pretrain=True,
+        optimizer="adam",
+        GPU="0",
+        seed=0,
+        **kwargs
+    ):
+        # Set logger.
+        self.logger = get_module_logger("HATS")
+        self.logger.info("HATS pytorch version...")
+
+        # set hyper-parameters.
+        self.d_feat = d_feat
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.metric = metric
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.optimizer = optimizer.lower()
+        self.loss = loss
+        self.base_model = base_model
+        self.with_pretrain = with_pretrain  #### True if train HATS with pretrained base model
+        self.visible_GPU = GPU
+        self.use_gpu = torch.cuda.is_available()
+        self.seed = seed
+
+        self.logger.info(
+            "HATS parameters setting:"
+            "\nd_feat : {}"
+            "\nhidden_size : {}"
+            "\nnum_layers : {}"
+            "\ndropout : {}"
+            "\nn_epochs : {}"
+            "\nlr : {}"
+            "\nmetric : {}"
+            "\nbatch_size : {}"
+            "\nearly_stop : {}"
+            "\noptimizer : {}"
+            "\nloss_type : {}"
+            "\nbase_model : {}"
+            "\nwith_pretrain : {}"  ##### debug
+            "\nvisible_GPU : {}"
+            "\nuse_GPU : {}"
+            "\nseed : {}".format(
+                d_feat,
+                hidden_size,
+                num_layers,
+                dropout,
+                n_epochs,
+                lr,
+                metric,
+                batch_size,
+                early_stop,
+                optimizer.lower(),
+                loss,
+                base_model,
+                with_pretrain,  ### debug
+                GPU,
+                self.use_gpu,
+                seed,
+            )
+        )
+
+        if loss not in {"mse", "binary"}:
+            raise NotImplementedError("loss {} is not supported!".format(loss))
+        self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
+
+        self.HATS_model = HATSModel(
+            d_feat=self.d_feat,
+            hidden_size=self.hidden_size,
+            num_layers=self.num_layers,
+            dropout=self.dropout,
+            base_model=self.base_model,
+        )
+        if optimizer.lower() == "adam":
+            self.train_optimizer = optim.Adam(self.HATS_model.parameters(), lr=self.lr)
+        elif optimizer.lower() == "gd":
+            self.train_optimizer = optim.SGD(self.HATS_model.parameters(), lr=self.lr)
+        else:
+            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+        self._fitted = False
+        if self.use_gpu:
+            self.HATS_model.cuda()
+            # set the visible GPU
+            if self.visible_GPU:
+                os.environ["CUDA_VISIBLE_DEVICES"] = self.visible_GPU
+
+    def mse(self, pred, label):
+        loss = (pred - label) ** 2
+        return torch.mean(loss)
+
+    def loss_fn(self, pred, label):
+        mask = ~torch.isnan(label)
+
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+
+    def metric_fn(self, pred, label):
+
+        mask = torch.isfinite(label)
+        if self.metric == "IC":
+            return self.cal_ic(pred[mask], label[mask])
+
+        if self.metric == "" or self.metric == "loss":  # use loss
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def cal_ic(self, pred, label):
+        return torch.mean(pred * label)
+
+    def train_epoch(self, x_train, y_train):
+
+        x_train_values = x_train.values
+        y_train_values = np.squeeze(y_train.values) * 100
+
+        self.HATS_model.train()
+
+        indices = np.arange(len(x_train_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
+            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+
+            if self.use_gpu:
+                feature = feature.cuda()
+                label = label.cuda()
+
+            pred = self.HATS_model(feature)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.HATS_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_x, data_y):
+
+        # prepare training data
+        x_values = data_x.values
+        y_values = np.squeeze(data_y.values)
+
+        self.HATS_model.eval()
+
+        scores = []
+        losses = []
+
+        indices = np.arange(len(x_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
+            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
+
+            if self.use_gpu:
+                feature = feature.cuda()
+                label = label.cuda()
+
+            pred = self.HATS_model(feature)
+            loss = self.loss_fn(pred, label)
+            losses.append(loss.item())
+
+            score = self.metric_fn(pred, label)
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        evals_result=dict(),
+        verbose=True,
+        save_path=None,
+    ):
+
+        df_train, df_valid, df_test = dataset.prepare(
+            ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        if save_path == None:
+            save_path = create_save_path(save_path)
+        stop_steps = 0
+        train_loss = 0
+        best_score = -np.inf
+        best_epoch = 0
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # load pretrained base_model
+        if self.with_pretrain:
+            self.logger.info("loading pretrained model...")
+            if self.base_model == "LSTM":
+                from ...contrib.model.pytorch_lstm import LSTMModel
+
+                pretrained_model = LSTMModel()
+                pretrained_model.load_state_dict(torch.load("benchmarks/LSTM/model_lstm_csi300.pkl"))
+            elif self.base_model == "GRU":
+                from ...contrib.model.pytorch_gru import GRUModel
+
+                pretrained_model = GRUModel()
+                pretrained_model.load_state_dict(torch.load("benchmarks/GRU/model_gru_csi300.pkl"))
+            model_dict = self.HATS_model.state_dict()
+
+            # filter unnecessary parameters
+            pretrained_dict = {k: v for k, v in pretrained_model.state_dict().items() if k in model_dict}
+            # overwrite entries in the existing state dict
+            model_dict.update(pretrained_dict)
+            # load the new state dict
+            self.HATS_model.load_state_dict(model_dict)
+            self.logger.info("loading pretrained model Done...")
+
+        # train
+        self.logger.info("training...")
+        self._fitted = True
+        # return
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(x_train, y_train)
+            self.logger.info("evaluating...")
+            train_loss, train_score = self.test_epoch(x_train, y_train)
+            val_loss, val_score = self.test_epoch(x_valid, y_valid)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.HATS_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.HATS_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset):
+        if not self._fitted:
+            raise ValueError("model is not fitted yet!")
+
+        x_test = dataset.prepare("test", col_set="feature")
+        index = x_test.index
+        self.HATS_model.eval()
+        x_values = x_test.values
+        sample_num = x_values.shape[0]
+        preds = []
+
+        for begin in range(sample_num)[:: self.batch_size]:
+
+            if sample_num - begin < self.batch_size:
+                end = sample_num
+            else:
+                end = begin + self.batch_size
+
+            x_batch = torch.from_numpy(x_values[begin:end]).float()
+
+            if self.use_gpu:
+                x_batch = x_batch.cuda()
+
+            with torch.no_grad():
+                if self.use_gpu:
+                    pred = self.HATS_model(x_batch).detach().cpu().numpy()
+                else:
+                    pred = self.HATS_model(x_batch).detach().numpy()
+
+            preds.append(pred)
+
+        return pd.Series(np.concatenate(preds), index=index)
+
+
+class HATSModel(nn.Module):
+    def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"):
+        super().__init__()
+
+        if base_model == "GRU":
+            self.model = nn.GRU(
+                input_size=d_feat,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+                batch_first=True,
+                dropout=dropout,
+            )
+        elif base_model == "LSTM":
+            self.model = nn.LSTM(
+                input_size=d_feat,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+                batch_first=True,
+                dropout=dropout,
+            )
+        else:
+            raise ValueError("unknown base model name `%s`" % base_model)
+
+        self.hidden_size = hidden_size
+        self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.bn2 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False)
+        self.fc_out = nn.Linear(hidden_size, 1)
+        self.leaky_relu = nn.LeakyReLU()
+        self.softmax = nn.Softmax(dim=1)
+        self.d_feat = d_feat
+
+        num_head_att = [1] * num_layers
+        hidden_dim = [hidden_size] * num_layers
+        dims = [d_feat] + [d * nh for (d, nh) in zip(hidden_dim, num_head_att[:-1])] + [num_head_att[-1]]
+        in_dims = dims[:-1]
+        out_dims = [d // nh for (d, nh) in zip(dims[1:], num_head_att)]
+        self.attn = nn.ModuleList(
+            [GraphAttention(i, o, nh, dropout) for (i, o, nh) in zip(in_dims, out_dims, num_head_att)]
+        )
+        self.bns = nn.ModuleList([nn.BatchNorm1d(dim) for dim in dims[1:-1]])
+        self.dropout = nn.Dropout(dropout)
+        self.elu = nn.ELU()
+
+    def forward(self, x):
+        x = x.reshape(len(x), self.d_feat, -1)  # [N, F, T]
+        x = x.permute(0, 2, 1)  # [N, T, F]
+        out, _ = self.model(x)
+        hidden = out[:, -1, :]
+        hidden = self.bn1(hidden)
+        attention = GraphAttention.cal_attention(hidden, hidden)
+        output = attention.mm(hidden)
+        output = self.fc(output)
+        output = self.bn2(output)
+        output = self.leaky_relu(output)
+        return self.fc_out(output).squeeze()
+
+
+class GraphAttention(nn.Module):
+    def __init__(self, input_dim, output_dim, num_heads, dropout=0.5):
+
+        super().__init__()
+
+        """
+        Parameters
+        ----------
+        input_dim : int
+            Dimension of input node features.
+        output_dim : int
+            Dimension of output node features.
+        num_heads : list of ints
+            Number of attention heads in each hidden layer and output layer. Must be non empty. Note that len(num_heads) = len(hidden_dims)+1.
+        dropout : float
+            Dropout rate. Default: 0.5.
+        """
+
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.num_heads = num_heads
+
+        self.fcs = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_heads)])
+        self.a = nn.ModuleList([nn.Linear(2 * output_dim, 1) for _ in range(num_heads)])
+
+        self.dropout = nn.Dropout(dropout)
+        self.softmax = nn.Softmax(dim=0)
+        self.leakyrelu = nn.LeakyReLU()
+
+    def forward(self, features, nodes, mapping, rows):
+
+        """
+        Parameters
+        ----------
+        features : torch.Tensor
+            An (n' x input_dim) tensor of input node features.
+        node_layers : list of numpy array
+            node_layers[i] is an array of the nodes in the ith layer of the
+            computation graph.
+        mappings : list of dictionary
+            mappings[i] is a dictionary mapping node v (labelled 0 to |V|-1)
+            in node_layers[i] to its position in node_layers[i]. For example,
+            if node_layers[i] = [2,5], then mappings[i][2] = 0 and
+            mappings[i][5] = 1.
+        rows : numpy array
+            rows[i] is an array of neighbors of node i.
+        Returns
+        -------
+        out : torch.Tensor
+            An (len(node_layers[-1]) x output_dim) tensor of output node features.
+        """
+
+        nprime = features.shape[0]
+        rows = [np.array([mapping[v] for v in row], dtype=np.int64) for row in rows]
+        sum_degs = np.hstack(([0], np.cumsum([len(row) for row in rows])))
+        mapped_nodes = [mapping[v] for v in nodes]
+        indices = torch.LongTensor([[v, c] for (v, row) in zip(mapped_nodes, rows) for c in row]).t()
+
+        out = []
+        for k in range(self.num_heads):
+            h = self.fcs[k](features)
+
+            nbr_h = torch.cat(tuple([h[row] for row in rows]), dim=0)
+            self_h = torch.cat(tuple([h[mapping[nodes[i]]].repeat(len(row), 1) for (i, row) in enumerate(rows)]), dim=0)
+            cat_h = torch.cat((self_h, nbr_h), dim=1)
+
+            e = self.leakyrelu(self.a[k](cat_h))
+
+            alpha = [self.softmax(e[lo:hi]) for (lo, hi) in zip(sum_degs, sum_degs[1:])]
+            alpha = torch.cat(tuple(alpha), dim=0)
+            alpha = alpha.squeeze(1)
+            alpha = self.dropout(alpha)
+
+            adj = torch.sparse.FloatTensor(indices, alpha, torch.Size([nprime, nprime]))
+            out.append(torch.sparse.mm(adj, h)[mapped_nodes])
+
+        return out
+
+    def cal_attention(x, y):
+
+        att_x = torch.mean(x, dim=1).reshape(-1, 1)
+        att_y = torch.mean(y, dim=1).reshape(-1, 1)
+        att = att_x.mm(torch.t(att_y))
+        x_att = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
+        y_att = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1)
+        return (
+            torch.mean(
+                x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1)
+                * y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1),
+                dim=2,
+            )
+            - att
+        )
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index e0691ba16..b45e12e10 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -1,5 +1,14 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import numpy as np
 import pandas as pd
diff --git a/qlib/contrib/strategy/strategy.py b/qlib/contrib/strategy/strategy.py
index 6eac9bafe..f2e2a4554 100644
--- a/qlib/contrib/strategy/strategy.py
+++ b/qlib/contrib/strategy/strategy.py
@@ -26,7 +26,9 @@ class BaseStrategy:
 
     def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date):
         """
-        Parameters:
+        DO NOT directly change the state of current
+
+        Parameters
         -----------
         score_series : pd.Seires
             stock_id , score
@@ -39,14 +41,12 @@ class BaseStrategy:
             predict date
         trade_date : pd.Timestamp
             trade date
-
-        DO NOT directly change the state of current
         """
         pass
 
     def update(self, score_series, pred_date, trade_date):
         """User can use this method to update strategy state each trade date.
-        Parameters:
+        Parameters
         -----------
         score_series : pd.Series
             stock_id , score
@@ -98,8 +98,9 @@ class AdjustTimer:
     """AdjustTimer
     Responsible for timing of position adjusting
 
-    This is designed as multiple inheritance mechanism due to
+    This is designed as multiple inheritance mechanism due to:
     - the is_adjust may need access to the internel state of a strategy
+
     - it can be reguard as a enhancement to the existing strategy
     """
 
@@ -140,21 +141,24 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer):
 
     def generate_target_weight_position(self, score, current, trade_date):
         """
-        Parameters:
+        Generate target position from score for this date and the current position.The cash is not considered in the position
+
+        Parameters
         -----------
-        score : pred score for this trade date, pd.Series, index is stock_id, contain 'score' column
-        current : current position, use Position() class
+        score : pd.Series
+            pred score for this trade date, index is stock_id, contain 'score' column
+        current : Position()
+            current position
         trade_exchange : Exchange()
-        trade_date : trade date
-        generate target position from score for this date and the current position
-        The cash is not considered in the position
+        trade_date : pd.Timestamp
+            trade date
         """
         raise NotImplementedError()
 
     def generate_order_list(self, score_series, current, trade_exchange, pred_date, trade_date):
         """
-        Parameters:
-        ----------
+        Parameters
+        -----------
         score_series : pd.Seires
             stock_id , score
         current : Position()
@@ -186,16 +190,29 @@ class WeightStrategyBase(BaseStrategy, AdjustTimer):
 
 
 class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
-    def __init__(self, topk, n_drop, method="bottom", risk_degree=0.95, thresh=1, hold_thresh=1, **kwargs):
+    def __init__(
+        self,
+        topk,
+        n_drop,
+        method_sell="bottom",
+        method_buy="top",
+        risk_degree=0.95,
+        thresh=1,
+        hold_thresh=1,
+        only_tradable=False,
+        **kwargs,
+    ):
         """
-        Parameters:
+        Parameters
         -----------
         topk : int
             The number of stocks in the portfolio
         n_drop : int
             number of stocks to be replaced in each trading date
-        method : str
-            dropout method, random/bottom
+        method_sell : str
+            dropout method_sell, random/bottom
+        method_buy : str
+            dropout method_buy, random/top
         risk_degree : float
             position percentage of total value
         thresh : int
@@ -203,12 +220,19 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
         hold_thresh : int
             minimum holding days
             before sell stock , will check current.get_stock_count(order.stock_id) >= self.thresh
+        only_tradable : bool
+            will the strategy only consider the tradable stock when buying and selling.
+            if only_tradable:
+                strategy will make buy sell decision without checking the tradable state of the stock
+            else:
+                strategy will make decision with the tradable state of the stock info and avoid buy and sell them
         """
         super(TopkDropoutStrategy, self).__init__()
         ListAdjustTimer.__init__(self, kwargs.get("adjust_dates", None))
         self.topk = topk
         self.n_drop = n_drop
-        self.method = method
+        self.method_sell = method_sell
+        self.method_buy = method_buy
         self.risk_degree = risk_degree
         self.thresh = thresh
         # self.stock_count['code'] will be the days the stock has been hold
@@ -216,6 +240,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
         self.stock_count = {}
 
         self.hold_thresh = hold_thresh
+        self.only_tradable = only_tradable
 
     def get_risk_degree(self, date):
         """get_risk_degree
@@ -229,7 +254,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
         """
         Gnererate order list according to score_series at trade_date, will not change current.
 
-        Parameters:
+        Parameters
         -----------
         score_series : pd.Series
             stock_id , score
@@ -244,24 +269,85 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
         """
         if not self.is_adjust(trade_date):
             return []
+
+        if self.only_tradable:
+            # If The strategy only consider tradable stock when make decision
+            # It needs following actions to filter stocks
+            def get_first_n(l, n, reverse=False):
+                cur_n = 0
+                res = []
+                for si in reversed(l) if reverse else l:
+                    if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date):
+                        res.append(si)
+                        cur_n += 1
+                        if cur_n >= n:
+                            break
+                return res[::-1] if reverse else res
+
+            def get_last_n(l, n):
+                return get_first_n(l, n, reverse=True)
+
+            def filter_stock(l):
+                return [si for si in l if trade_exchange.is_stock_tradable(stock_id=si, trade_date=trade_date)]
+
+        else:
+            # Otherwise, the stock will make decision with out the stock tradable info
+            def get_first_n(l, n):
+                return list(l)[:n]
+
+            def get_last_n(l, n):
+                return list(l)[-n:]
+
+            def filter_stock(l):
+                return l
+
         current_temp = copy.deepcopy(current)
         # generate order list for this adjust date
         sell_order_list = []
         buy_order_list = []
         # load score
+        cash = current_temp.get_cash()
         current_stock_list = current_temp.get_stock_list()
+        # last position (sorted by score)
         last = score_series.reindex(current_stock_list).sort_values(ascending=False).index
-        today = (
-            score_series[~score_series.index.isin(last)]
-            .sort_values(ascending=False)
-            .index[: self.n_drop + self.topk - len(last)]
-        )
-        comb = score_series.reindex(last.union(today)).sort_values(ascending=False).index
-        if self.method == "bottom":
-            sell = last[last.isin(comb[-self.n_drop :])]
-        elif self.method == "random":
-            sell = pd.Index(np.random.choice(last, self.n_drop) if len(last) else [])
+        # The new stocks today want to buy **at most**
+        if self.method_buy == "top":
+            today = get_first_n(
+                score_series[~score_series.index.isin(last)].sort_values(ascending=False).index,
+                self.n_drop + self.topk - len(last),
+            )
+        elif self.method_buy == "random":
+            topk_candi = get_first_n(score_series.sort_values(ascending=False).index, self.topk)
+            candi = list(filter(lambda x: x not in last, topk_candi))
+            n = self.n_drop + self.topk - len(last)
+            try:
+                today = np.random.choice(candi, n, replace=False)
+            except ValueError:
+                today = candi
+        else:
+            raise NotImplementedError(f"This type of input is not supported")
+        # combine(new stocks + last stocks),  we will drop stocks from this list
+        # In case of dropping higher score stock and buying lower score stock.
+        comb = score_series.reindex(last.union(pd.Index(today))).sort_values(ascending=False).index
+
+        # Get the stock list we really want to sell (After filtering the case that we sell high and buy low)
+        if self.method_sell == "bottom":
+            sell = last[last.isin(get_last_n(comb, self.n_drop))]
+        elif self.method_sell == "random":
+            candi = filter_stock(last)
+            try:
+                sell = pd.Index(np.random.choice(candi, self.n_drop, replace=False) if len(last) else [])
+            except ValueError:  #  No enough candidates
+                sell = candi
+        else:
+            raise NotImplementedError(f"This type of input is not supported")
+
+        # Get the stock list we really want to buy
         buy = today[: len(sell) + self.topk - len(last)]
+
+        # buy singal: if a stock falls into topk, it appear in the buy_sinal
+        buy_signal = score_series.sort_values(ascending=False).iloc[: self.topk].index
+
         for code in current_stock_list:
             if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date):
                 continue
@@ -285,12 +371,14 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
                 if trade_exchange.check_order(sell_order):
                     sell_order_list.append(sell_order)
                     trade_val, trade_cost, trade_price = trade_exchange.deal_order(sell_order, position=current_temp)
+                    # update cash
+                    cash += trade_val - trade_cost
                     # sold
                     del self.stock_count[code]
                 else:
                     # no buy signal, but the stock is kept
                     self.stock_count[code] += 1
-            elif code in buy:
+            elif code in buy_signal:
                 # NOTE: This is different from the original version
                 # get new buy signal
                 # Only the stock fall in to topk will produce buy signal
@@ -300,7 +388,7 @@ class TopkDropoutStrategy(BaseStrategy, ListAdjustTimer):
         # buy new stock
         # note the current has been changed
         current_stock_list = current_temp.get_stock_list()
-        value = current_temp.get_cash() * self.risk_degree / len(buy) if len(buy) > 0 else 0
+        value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0
 
         # open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it
         # as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index c46528944..e972aba3c 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -14,9 +14,11 @@ class Dataset(Serializable):
 
     def __init__(self, *args, **kwargs):
         """
-        init is designed to finish following steps
+        init is designed to finish following steps:
+
         - setup data
             - The data related attributes' names should start with '_' so that it will not be saved on disk when serializing
+
         - initialize the state of the dataset(info to prepare the data)
             - The name of essential state for preparing data should not start with '_' so that it could be serialized on disk when serializing.
 
@@ -29,11 +31,15 @@ class Dataset(Serializable):
         """
         setup the data
 
-        We split the setup_data function for following situation
-        - 1) User have a Dataset object with learned status on disk
-        - 2) User load the Dataset object from the disk(Note the init function is skiped)
-        - 3) User call `setup_data` to load new data
-        - 4) User prepare data for model based on previous status
+        We split the setup_data function for following situation:
+
+        - User have a Dataset object with learned status on disk
+
+        - User load the Dataset object from the disk(Note the init function is skiped)
+
+        - User call `setup_data` to load new data
+
+        - User prepare data for model based on previous status
         """
         pass
 
@@ -41,8 +47,9 @@ class Dataset(Serializable):
         """
         The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
         The parameters should specify the scope for the prepared data
-        The method sould
+        The method should:
         - process the data
+
         - return the processed data
 
         Returns
@@ -55,11 +62,12 @@ class Dataset(Serializable):
 
 class DatasetH(Dataset):
     """
-    Dataset with Data(H)anler
+    Dataset with Data(H)andler
 
     User should try to put the data preprocessing functions into handler.
-    Only following data processing functions should be placed in Dataset
+    Only following data processing functions should be placed in Dataset:
     - The processing is related to specific model.
+
     - The processing is related to data split
     """
 
@@ -81,21 +89,26 @@ class DatasetH(Dataset):
         Parameters
         ----------
         handler : Union[dict, DataHandler]
-            handler could be
-            1) insntance of `DataHandler`
-            2) config of `DataHandler`.  Please refer to `DataHandler`
+            handler could be:
+
+            - insntance of `DataHandler`
+
+            - config of `DataHandler`.  Please refer to `DataHandler`
         segments : list
             Describe the options to segment the data.
-            Here are some examples
-            1) 'segments': {
-                    'train': ("2008-01-01", "2014-12-31"),
-                    'valid': ("2017-01-01", "2020-08-01",),
-                    'test': ("2015-01-01", "2016-12-31",),
-                }
-            2) 'segments': {
-                    'insample': ("2008-01-01", "2014-12-31"),
-                    'outsample': ("2017-01-01", "2020-08-01",),
-                }
+            Here are some examples:
+
+            .. code-block::
+
+                1) 'segments': {
+                        'train': ("2008-01-01", "2014-12-31"),
+                        'valid': ("2017-01-01", "2020-08-01",),
+                        'test': ("2015-01-01", "2016-12-31",),
+                    }
+                2) 'segments': {
+                        'insample': ("2008-01-01", "2014-12-31"),
+                        'outsample': ("2017-01-01", "2020-08-01",),
+                    }
         """
         self._handler = init_instance_by_config(handler, accept_types=DataHandler)
         self._segments = segments.copy()
@@ -114,9 +127,11 @@ class DatasetH(Dataset):
         ----------
         segments : Union[List[str], Tuple[str], str, slice]
             Describe the scope of the data to be prepared
-            Here are some examples
-            1) 'train'
-            2) ['train', 'valid']
+            Here are some examples:
+
+            - 'train'
+
+            - ['train', 'valid']
         col_set : str
             The col_set will be passed to self._handler when fetching data
         data_key: str
diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py
index e0a4d809a..4d3d88c38 100644
--- a/qlib/data/dataset/handler.py
+++ b/qlib/data/dataset/handler.py
@@ -41,7 +41,7 @@ class DataHandler(Serializable):
     Example of the data:
     The multi-index of the columns is optional.
 
-    .. code-block::
+    .. code-block:: python
 
                                 feature                                                            label
                                 $close     $volume  Ref($close, 1)  Mean($close, 3)  $high-$low  LABEL0
@@ -109,7 +109,8 @@ class DataHandler(Serializable):
         Parameters
         ----------
         enable_cache : bool
-            default value is false
+            default value is false:
+
             - if `enable_cache` == True:
 
                 the processed data will be saved on disk, and handler will load the cached data from the disk directly
@@ -378,8 +379,10 @@ class DataHandlerLP(DataHandler):
         init_type : str
             The type `IT_*` listed above
         enable_cache : bool
-            default value is false
-            if `enable_cache` == True:
+            default value is false:
+
+            - if `enable_cache` == True:
+
                 the processed data will be saved on disk, and handler will load the cached data from the disk directly
                 when we call `init` next time
         """
diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py
index e95dc4479..404313e80 100644
--- a/qlib/data/dataset/loader.py
+++ b/qlib/data/dataset/loader.py
@@ -39,14 +39,16 @@ class DataLoader(abc.ABC):
         pd.DataFrame:
             data load from the under layer source
 
-            Example of the data:
-            (The multi-index of the columns is optional.)
-                                    feature                                                             label
-                                    $close     $volume     Ref($close, 1)  Mean($close, 3)  $high-$low  LABEL0
-            datetime    instrument
-            2010-01-04  SH600000    81.807068  17145150.0       83.737389        83.016739    2.741058  0.0032
-                        SH600004    13.313329  11800983.0       13.313329        13.317701    0.183632  0.0042
-                        SH600005    37.796539  12231662.0       38.258602        37.919757    0.970325  0.0289
+            Example of the data (The multi-index of the columns is optional.):
+
+            .. code-block::
+
+                                        feature                                                             label
+                                        $close     $volume     Ref($close, 1)  Mean($close, 3)  $high-$low  LABEL0
+                datetime    instrument
+                2010-01-04  SH600000    81.807068  17145150.0       83.737389        83.016739    2.741058  0.0032
+                            SH600004    13.313329  11800983.0       13.313329        13.317701    0.183632  0.0042
+                            SH600005    37.796539  12231662.0       38.258602        37.919757    0.970325  0.0289
         """
         pass
 
@@ -55,7 +57,7 @@ class DLWParser(DataLoader):
     """
     (D)ata(L)oader (W)ith (P)arser for features and names
 
-    Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields
+    Extracting this class so that QlibDataLoader and other dataloaders(such as QdbDataLoader) can share the fields.
     """
 
     def __init__(self, config: Tuple[list, tuple, dict]):
@@ -65,14 +67,16 @@ class DLWParser(DataLoader):
         config : Tuple[list, tuple, dict]
             Config will be used to describe the fields and column names
 
-            <config> := {
-                "group_name1": <fields_info1>
-                "group_name2": <fields_info2>
-            }
-            or
-            <config> := <fields_info>
+            .. code-block:: YAML
 
-            <fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
+                <config> := {
+                    "group_name1": <fields_info1>
+                    "group_name2": <fields_info2>
+                }
+                or
+                <config> := <fields_info>
+
+                <fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
         """
         self.is_group = isinstance(config, dict)
 
diff --git a/scripts/README.md b/scripts/README.md
index 88ebdc680..99af4a457 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -43,6 +43,8 @@ python get_data.py qlib_data --help
 
 ### US data
 
+> Need to download data first: [Downlaod US Data](#Downlaod-US-Data)
+
 ```python
 import qlib
 from qlib.config import REG_US
@@ -52,6 +54,8 @@ qlib.init(provider_uri=provider_uri, region=REG_US)
 
 ### CN data
 
+> Need to download data first: [Download CN Data](#Download-CN-Data)
+
 ```python
 import qlib
 from qlib.config import REG_CN
diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py
index 2bca4f037..9f6dd88e2 100644
--- a/scripts/dump_bin.py
+++ b/scripts/dump_bin.py
@@ -140,7 +140,7 @@ class DumpDataBase:
 
     def _get_source_data(self, file_path: Path) -> pd.DataFrame:
         df = pd.read_csv(str(file_path.resolve()), low_memory=False)
-        df[self.date_field_name] = df[self.date_field_name].astype(np.datetime64)
+        df[self.date_field_name] = df[self.date_field_name].astype(str).astype(np.datetime64)
         # df.drop_duplicates([self.date_field_name], inplace=True)
         return df
 
@@ -339,10 +339,10 @@ class DumpDataFix(DumpDataAll):
     def dump(self):
         self._calendars_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt"))
         # noinspection PyAttributeOutsideInit
-        self._old_instruments = self._read_instruments(
-            self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)
-        ).to_dict(
-            orient="index"
+        self._old_instruments = (
+            self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME))
+            .set_index([self.symbol_field_name])
+            .to_dict(orient="index")
         )  # type: dict
         self._dump_instruments()
         self._dump_features()