From cd4ab998fbba39cff507694bc6159fb68f11b1d5 Mon Sep 17 00:00:00 2001
From: Wendi Li <wendili.academic@qq.com>
Date: Sat, 3 Jun 2023 08:42:24 +0800
Subject: [PATCH] Update on Dynamic Benchmark (#1539)

* move config file to benchmark_dynamic & switch default sim task model to GBDT

* Update benchmark_dynamic results

* Change the default value of alpha of DDG-DA
---
 .../benchmarks_dynamic/DDG-DA/workflow.py     |  6 +-
 examples/benchmarks_dynamic/README.md         | 12 +--
 .../baseline/rolling_benchmark.py             |  5 +-
 .../workflow_config_lightgbm_Alpha158.yaml    | 72 +++++++++++++++++
 .../workflow_config_linear_Alpha158.yaml      | 79 +++++++++++++++++++
 5 files changed, 164 insertions(+), 10 deletions(-)
 create mode 100644 examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml
 create mode 100644 examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml

diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py
index f57080055..fef86726d 100644
--- a/examples/benchmarks_dynamic/DDG-DA/workflow.py
+++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py
@@ -34,14 +34,14 @@ class DDGDA:
 
     def __init__(
         self,
-        sim_task_model: Literal["linear", "gbdt"] = "linear",
+        sim_task_model: Literal["linear", "gbdt"] = "gbdt",
         forecast_model: Literal["linear", "gbdt"] = "linear",
         h_path: Optional[str] = None,
         test_end: Optional[str] = None,
         train_start: Optional[str] = None,
         meta_1st_train_end: Optional[str] = None,
         task_ext_conf: Optional[dict] = None,
-        alpha: float = 0.0,
+        alpha: float = 0.01,
         proxy_hd: str = "handler_proxy.pkl",
     ):
         """
@@ -215,7 +215,7 @@ class DDGDA:
         with R.start(experiment_name=self.meta_exp_name):
             R.log_params(**kwargs)
             mm = MetaModelDS(
-                step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=100, seed=43, alpha=self.alpha
+                step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha
             )
             mm.fit(md)
             R.save_objects(model=mm)
diff --git a/examples/benchmarks_dynamic/README.md b/examples/benchmarks_dynamic/README.md
index 261fcc035..6f78fa71a 100644
--- a/examples/benchmarks_dynamic/README.md
+++ b/examples/benchmarks_dynamic/README.md
@@ -8,15 +8,17 @@ The table below shows the performances of different solutions on different forec
 Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
 ```bash
 wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
+mkdir -p ~/.qlib/qlib_data/cn_data
 tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
+rm -f qlib_bin.tar.gz
 ```
 
 | Model Name       | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
-|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
-| RR[Linear]       |Alpha158 |0.089|0.577|0.102    |0.627      |0.093              |1.458              |-0.073        |
-| DDG-DA[Linear]   |Alpha158 |0.096|0.636|0.107    |0.677      |0.067              |0.996              |-0.091        |
-| RR[LightGBM]     |Alpha158 |0.082|0.589|0.091    |0.626      |0.077              |1.320              |-0.091        |
-| DDG-DA[LightGBM] |Alpha158 |0.085|0.658|0.094    |0.686      |0.115              |1.792              |-0.068        |
+|------------------|---------|------|------|---------|-----------|-------------------|-------------------|--------------|
+| RR[Linear]       |Alpha158 |0.0945|0.5989|0.1069   |0.6495     |0.0857             |1.3682             |-0.0986       |
+| DDG-DA[Linear]   |Alpha158 |0.0983|0.6157|0.1108   |0.6646     |0.0764             |1.1904             |-0.0769       |
+| RR[LightGBM]     |Alpha158 |0.0816|0.5887|0.0912   |0.6263     |0.0771             |1.3196             |-0.0909       |
+| DDG-DA[LightGBM] |Alpha158 |0.0878|0.6185|0.0975   |0.6524     |0.1261             |2.0096             |-0.0744       |
 
 - The label horizon of the `Alpha158` dataset is set to 20.
 - The rolling time intervals are set to 20 trading days.
diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
index d452957d4..b0c7aea4f 100644
--- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
+++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
@@ -67,11 +67,12 @@ class RollingBenchmark:
     def basic_task(self):
         """For fast training rolling"""
         if self.model_type == "gbdt":
-            conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
+            conf_path = DIRNAME / "workflow_config_lightgbm_Alpha158.yaml"
             # dump the processed data on to disk for later loading to speed up the processing
             h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon)
         elif self.model_type == "linear":
-            conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
+            # We use ridge regression to stabilize the performance
+            conf_path = DIRNAME / "workflow_config_linear_Alpha158.yaml"
             h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon)
         else:
             raise AssertionError("Model type is not supported!")
diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml
new file mode 100644
index 000000000..2d441dea9
--- /dev/null
+++ b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml
@@ -0,0 +1,72 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            model: <MODEL> 
+            dataset: <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: LGBModel
+        module_path: qlib.contrib.model.gbdt
+        kwargs:
+            loss: mse
+            colsample_bytree: 0.8879
+            learning_rate: 0.2
+            subsample: 0.8789
+            lambda_l1: 205.6999
+            lambda_l2: 580.9768
+            max_depth: 8
+            num_leaves: 210
+            num_threads: 20
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            model: <MODEL>
+            dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            ana_long_short: False
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config
diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml
new file mode 100644
index 000000000..78ec4e612
--- /dev/null
+++ b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml
@@ -0,0 +1,79 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL> 
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: LinearModel
+        module_path: qlib.contrib.model.linear
+        kwargs:
+            estimator: ridge
+            alpha: 0.05
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record: 
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            model: <MODEL>
+            dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            ana_long_short: True
+            ann_scaler: 252
+        - class: PortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs: 
+            config: *port_analysis_config