From ad6c2e759f5e75fc02f228cc663875afd704a50a Mon Sep 17 00:00:00 2001
From: lwwang1995 <lewen.wang@microsoft.com>
Date: Wed, 11 Nov 2020 11:11:36 +0800
Subject: [PATCH] Add Xgboost Model

---
 examples/workflow_by_code_xgboost.py | 144 +++++++++++++++++++++++++++
 qlib/contrib/model/xgboost.py        |  64 ++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100755 examples/workflow_by_code_xgboost.py
 create mode 100755 qlib/contrib/model/xgboost.py

diff --git a/examples/workflow_by_code_xgboost.py b/examples/workflow_by_code_xgboost.py
new file mode 100755
index 000000000..0eb5f4e93
--- /dev/null
+++ b/examples/workflow_by_code_xgboost.py
@@ -0,0 +1,144 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+
+import qlib
+import pandas as pd
+from qlib.config import REG_CN
+from qlib.contrib.model.xgboost import XGBModel
+from qlib.contrib.data.handler import Alpha158
+from qlib.contrib.strategy.strategy import TopkDropoutStrategy
+from qlib.contrib.evaluate import (
+    backtest as normal_backtest,
+    risk_analysis,
+)
+from qlib.utils import exists_qlib_data
+
+# from qlib.model.learner import train_model
+from qlib.utils import init_instance_by_config
+
+if __name__ == "__main__":
+
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data_cn(target_dir=provider_uri)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    MARKET = "csi300"
+    BENCHMARK = "SH000300"
+
+
+    ###################################
+    # train model
+    ###################################
+    DATA_HANDLER_CONFIG = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time":"2008-01-01",
+        "fit_end_time":"2014-12-31",
+        "instruments": MARKET,
+    }
+
+    TRAINER_CONFIG = {
+        "train_start_time": "2008-01-01",
+        "train_end_time": "2014-12-31",
+        "validate_start_time": "2015-01-01",
+        "validate_end_time": "2016-12-31",
+        "test_start_time": "2017-01-01",
+        "test_end_time": "2020-08-01",
+    }
+
+    task = {
+        "model": {
+            "class": "XGBModel",
+            "module_path": "qlib.contrib.model.xgboost",
+            "kwargs": {
+                "objective": 'reg:linear',
+                "n_estimators":5000,
+                "colsample_bytree": 0.85,
+                "learning_rate": 0.0421,
+                "subsample": 0.8789,
+                "max_depth": 8,
+                "num_leaves": 210,
+                "num_threads": 20,
+                "missing":-1,
+                "min_child_weight":1,
+                "nthread":4,
+                "tree_method":'hist',
+            }
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                'handler': {
+                    "class": "Alpha158",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": DATA_HANDLER_CONFIG
+                },
+                'segments': {
+                    'train': ("2008-01-01", "2014-12-31"),
+                    'valid': ("2015-01-01", "2016-12-31",),
+                    'test': ("2017-01-01", "2020-08-01",),
+                }
+            }
+        }
+        # You shoud record the data in specific sequence
+        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
+    }
+
+    # model = train_model(task)
+    model = init_instance_by_config(task['model'])
+    dataset = init_instance_by_config(task['dataset'])
+
+    model.fit(dataset)
+    pred_score = model.predict(dataset)
+
+    # save pred_score to file
+    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
+    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
+    pred_score.to_pickle(pred_score_path)
+
+    ###################################
+    # backtest
+    ###################################
+    STRATEGY_CONFIG = {
+        "topk": 50,
+        "n_drop": 5,
+    }
+    BACKTEST_CONFIG = {
+        "verbose": False,
+        "limit_threshold": 0.095,
+        "account": 100000000,
+        "benchmark": BENCHMARK,
+        "deal_price": "close",
+        "open_cost": 0.0005,
+        "close_cost": 0.0015,
+        "min_cost": 5,
+    }
+
+    # use default strategy
+    # custom Strategy, refer to: TODO: Strategy API url
+    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
+    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
+
+    ###################################
+    # analyze
+    # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
+    ###################################
+    analysis = dict()
+    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
+    analysis["excess_return_with_cost"] = risk_analysis(
+        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
+    )
+    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
+    print(analysis_df)
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
new file mode 100755
index 000000000..95954198e
--- /dev/null
+++ b/qlib/contrib/model/xgboost.py
@@ -0,0 +1,64 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class XGBModel(Model):
+    """XGBModel Model"""
+
+    def __init__(self, obj="mse", **kwargs):
+        if obj not in {"mse", "binary"}:
+            raise NotImplementedError
+        self._params = {"obj": obj}
+        self._params.update(kwargs)
+        self.model = None
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        num_boost_round=1000,
+        early_stopping_rounds=50,
+        verbose_eval=20,
+        evals_result=dict(),
+        **kwargs
+    ):
+
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        # Lightgbm need 1D array as its label
+        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+            y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+        else:
+            raise ValueError("XGBoost doesn't support multi-label training")
+        
+        dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
+        dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
+        self.model = xgb.train(
+            self._params,
+            dtrain=dtrain,
+            num_boost_round=num_boost_round,
+            evals=[(dtrain, 'train'), (dvalid, 'valid')],
+            early_stopping_rounds=early_stopping_rounds,
+            verbose_eval=verbose_eval,
+            evals_result=evals_result,
+            **kwargs
+        )
+        evals_result["train"] = list(evals_result["train"].values())[0]
+        evals_result["valid"] = list(evals_result["valid"].values())[0]
+
+    def predict(self, dataset):
+        if self.model is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare("test", col_set="feature")
+        return pd.Series(self.model.predict(xgb.DMatrix(np.squeeze(x_test.values))), index=x_test.index)