From 21c0dae03c4161427904875fa3e5509b0e950b12 Mon Sep 17 00:00:00 2001
From: Jactus <dw1920@nyu.edu>
Date: Thu, 19 Nov 2020 16:51:09 +0800
Subject: [PATCH] Init benchmarks

---
 examples/benchmarks/CatBoost/requirements.txt |  3 +
 examples/benchmarks/DNN/requirements.txt      |  4 ++
 .../benchmarks/DNN/workflow_config_dnn.yaml   | 62 ++++++++++++++++++
 examples/benchmarks/GATs/requirements.txt     |  4 ++
 .../benchmarks/GATs/worflow_config_gats.yaml  | 63 ++++++++++++++++++
 examples/benchmarks/GBDT/requirements.txt     |  3 +
 .../benchmarks/GBDT/workflow_config_gbdt.yaml | 59 +++++++++++++++++
 examples/benchmarks/GRU/requirements.txt      |  4 ++
 .../benchmarks/GRU/workflow_config_gru.yaml   | 62 ++++++++++++++++++
 examples/benchmarks/LSTM/requirements.txt     |  4 ++
 .../benchmarks/LSTM/workflow_config_lstm.yaml | 62 ++++++++++++++++++
 examples/benchmarks/XGBoost/requirements.txt  |  3 +
 .../XGBoost/workflow_config_xgboost.yaml      | 62 ++++++++++++++++++
 examples/benchmarks/XGBoost/xgboost.py        | 64 +++++++++++++++++++
 14 files changed, 459 insertions(+)
 create mode 100644 examples/benchmarks/CatBoost/requirements.txt
 create mode 100644 examples/benchmarks/DNN/requirements.txt
 create mode 100644 examples/benchmarks/DNN/workflow_config_dnn.yaml
 create mode 100644 examples/benchmarks/GATs/requirements.txt
 create mode 100644 examples/benchmarks/GATs/worflow_config_gats.yaml
 create mode 100644 examples/benchmarks/GBDT/requirements.txt
 create mode 100644 examples/benchmarks/GBDT/workflow_config_gbdt.yaml
 create mode 100644 examples/benchmarks/GRU/requirements.txt
 create mode 100644 examples/benchmarks/GRU/workflow_config_gru.yaml
 create mode 100644 examples/benchmarks/LSTM/requirements.txt
 create mode 100644 examples/benchmarks/LSTM/workflow_config_lstm.yaml
 create mode 100644 examples/benchmarks/XGBoost/requirements.txt
 create mode 100644 examples/benchmarks/XGBoost/workflow_config_xgboost.yaml
 create mode 100755 examples/benchmarks/XGBoost/xgboost.py

diff --git a/examples/benchmarks/CatBoost/requirements.txt b/examples/benchmarks/CatBoost/requirements.txt
new file mode 100644
index 000000000..507a65944
--- /dev/null
+++ b/examples/benchmarks/CatBoost/requirements.txt
@@ -0,0 +1,3 @@
+pandas==1.1.2
+numpy==1.17.4
+catboost==0.24.3
diff --git a/examples/benchmarks/DNN/requirements.txt b/examples/benchmarks/DNN/requirements.txt
new file mode 100644
index 000000000..16de0a438
--- /dev/null
+++ b/examples/benchmarks/DNN/requirements.txt
@@ -0,0 +1,4 @@
+pandas==1.1.2
+numpy==1.17.4
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/DNN/workflow_config_dnn.yaml b/examples/benchmarks/DNN/workflow_config_dnn.yaml
new file mode 100644
index 000000000..0f50cbb25
--- /dev/null
+++ b/examples/benchmarks/DNN/workflow_config_dnn.yaml
@@ -0,0 +1,62 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: DNNModelPytorch
+        module_path: qlib.contrib.model.pytorch_nn
+        kwargs:
+            input_dim: 360
+            output_dim: 1
+            layers: [256, 512, 1024, 512, 256, 128, 64]
+            lr: 0.001
+            max_steps: 300
+            batch_size: 2000
+            early_stop_rounds: 50
+            eval_steps: 20
+            lr_decay: 0.96
+            lr_decay_steps: 100
+            optimizer: gd
+            loss: mse
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/GATs/requirements.txt b/examples/benchmarks/GATs/requirements.txt
new file mode 100644
index 000000000..16de0a438
--- /dev/null
+++ b/examples/benchmarks/GATs/requirements.txt
@@ -0,0 +1,4 @@
+pandas==1.1.2
+numpy==1.17.4
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/GATs/worflow_config_gats.yaml b/examples/benchmarks/GATs/worflow_config_gats.yaml
new file mode 100644
index 000000000..6c8db2e77
--- /dev/null
+++ b/examples/benchmarks/GATs/worflow_config_gats.yaml
@@ -0,0 +1,63 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: GAT
+        module_path: qlib.contrib.model.pytorch_gats
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.0
+            n_epochs: 200
+            lr: 1e-3
+            early_stop: 20
+            batch_size: 800
+            metric: IC
+            loss: mse
+            base_model: GRU
+            seed: 0
+            GPU: 0
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/GBDT/requirements.txt b/examples/benchmarks/GBDT/requirements.txt
new file mode 100644
index 000000000..507d2d453
--- /dev/null
+++ b/examples/benchmarks/GBDT/requirements.txt
@@ -0,0 +1,3 @@
+pandas==1.1.2
+numpy==1.17.4
+lightgbm==3.1.0
diff --git a/examples/benchmarks/GBDT/workflow_config_gbdt.yaml b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml
new file mode 100644
index 000000000..212558044
--- /dev/null
+++ b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml
@@ -0,0 +1,59 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: LGBModel
+        module_path: qlib.contrib.model.gbdt
+        kwargs:
+            loss: mse
+            colsample_bytree: 0.8879
+            learning_rate: 0.0421
+            subsample: 0.8789
+            lambda_l1: 205.6999
+            lambda_l2: 580.9768
+            max_depth: 8
+            num_leaves: 210
+            num_threads: 20
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/GRU/requirements.txt b/examples/benchmarks/GRU/requirements.txt
new file mode 100644
index 000000000..1fc2779c0
--- /dev/null
+++ b/examples/benchmarks/GRU/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.17.4
+pandas==1.1.2
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/GRU/workflow_config_gru.yaml b/examples/benchmarks/GRU/workflow_config_gru.yaml
new file mode 100644
index 000000000..49b6159dc
--- /dev/null
+++ b/examples/benchmarks/GRU/workflow_config_gru.yaml
@@ -0,0 +1,62 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: GRU
+        module_path: qlib.contrib.model.pytorch_gru
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.0
+            n_epochs: 200
+            lr: 1e-3
+            early_stop: 20
+            batch_size: 800
+            metric: IC
+            loss: mse
+            seed: 0
+            GPU: 0
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/LSTM/requirements.txt b/examples/benchmarks/LSTM/requirements.txt
new file mode 100644
index 000000000..1fc2779c0
--- /dev/null
+++ b/examples/benchmarks/LSTM/requirements.txt
@@ -0,0 +1,4 @@
+numpy==1.17.4
+pandas==1.1.2
+scikit_learn==0.23.2
+torch==1.7.0
diff --git a/examples/benchmarks/LSTM/workflow_config_lstm.yaml b/examples/benchmarks/LSTM/workflow_config_lstm.yaml
new file mode 100644
index 000000000..1e3b309d2
--- /dev/null
+++ b/examples/benchmarks/LSTM/workflow_config_lstm.yaml
@@ -0,0 +1,62 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: LSTM
+        module_path: qlib.contrib.model.pytorch_lstm
+        kwargs:
+            d_feat: 6
+            hidden_size: 64
+            num_layers: 2
+            dropout: 0.0
+            n_epochs: 200
+            lr: 1e-3
+            early_stop: 20
+            batch_size: 800
+            metric: IC
+            loss: mse
+            seed: 0
+            GPU: 0
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: ALPHA360_Denoise
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/XGBoost/requirements.txt b/examples/benchmarks/XGBoost/requirements.txt
new file mode 100644
index 000000000..077f343e5
--- /dev/null
+++ b/examples/benchmarks/XGBoost/requirements.txt
@@ -0,0 +1,3 @@
+numpy==1.17.4
+pandas==1.1.2
+xgboost==1.2.1
\ No newline at end of file
diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml
new file mode 100644
index 000000000..497ffa5b6
--- /dev/null
+++ b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml
@@ -0,0 +1,62 @@
+provider_uri: "~/.qlib/qlib_data/cn_data"
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy.strategy
+        kwargs:
+            topk: 50
+            n_drop: 5
+    backtest:
+        verbose: False
+        limit_threshold: 0.095
+        account: 100000000
+        benchmark: *benchmark
+        deal_price: close
+        open_cost: 0.0005
+        close_cost: 0.0015
+        min_cost: 5
+task:
+    model:
+        class: XGBModel
+        module_path: qlib.contrib.model.xgboost
+        kwargs:
+            objective: reg:squarederror  # current name of the deprecated alias reg:linear
+            n_estimators: 5000  # NOTE(review): scikit-learn wrapper name; xgb.train uses num_boost_round - confirm
+            colsample_bytree: 0.85
+            learning_rate: 0.0421
+            subsample: 0.8789
+            max_depth: 8
+            num_leaves: 210  # NOTE(review): LightGBM name; XGBoost's counterpart is max_leaves - confirm
+            num_threads: 20  # NOTE(review): LightGBM name; overlaps with nthread below - confirm
+            missing: -1
+            min_child_weight: 1
+            nthread: 4
+            tree_method: hist
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: {}
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
\ No newline at end of file
diff --git a/examples/benchmarks/XGBoost/xgboost.py b/examples/benchmarks/XGBoost/xgboost.py
new file mode 100755
index 000000000..f1208eb93
--- /dev/null
+++ b/examples/benchmarks/XGBoost/xgboost.py
@@ -0,0 +1,64 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class XGBModel(Model):
+    """XGBoost model; ``obj`` ("mse" or "binary") and ``kwargs`` form the params for ``xgb.train``."""
+
+    def __init__(self, obj="mse", **kwargs):
+        if obj not in {"mse", "binary"}:
+            raise NotImplementedError(f"unsupported obj: {obj!r}")
+        self._params = {"obj": obj, **kwargs}
+        self.model = None  # set by fit(); predict() refuses to run while it is None
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        num_boost_round=1000,
+        early_stopping_rounds=50,
+        verbose_eval=20,
+        evals_result=None,
+        **kwargs
+    ):
+        """Fit on the "train" segment, evaluating on "valid"; extra kwargs go to ``xgb.train``.
+
+        ``evals_result`` (optional dict) is filled in place; each split keeps only its first metric.
+        """
+        evals_result = {} if evals_result is None else evals_result  # avoid a shared mutable default
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+        # XGBoost needs a 1D label; axis=1 drops only the column axis and rejects wider labels.
+        if y_train.values.ndim != 2 or y_train.values.shape[1] != 1:
+            raise ValueError("XGBoost doesn't support multi-label training")
+        dtrain = xgb.DMatrix(x_train.values, label=np.squeeze(y_train.values, axis=1))
+        dvalid = xgb.DMatrix(x_valid.values, label=np.squeeze(y_valid.values, axis=1))
+        self.model = xgb.train(
+            self._params,
+            dtrain=dtrain,
+            num_boost_round=num_boost_round,
+            evals=[(dtrain, "train"), (dvalid, "valid")],
+            early_stopping_rounds=early_stopping_rounds,
+            verbose_eval=verbose_eval,
+            evals_result=evals_result,
+            **kwargs
+        )
+        evals_result["train"] = list(evals_result["train"].values())[0]
+        evals_result["valid"] = list(evals_result["valid"].values())[0]
+
+    def predict(self, dataset):
+        """Predict on the "test" segment; return a Series sharing the feature frame's index."""
+        if self.model is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare("test", col_set="feature")
+        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)