From ad6c2e759f5e75fc02f228cc663875afd704a50a Mon Sep 17 00:00:00 2001
From: lwwang1995
Date: Wed, 11 Nov 2020 11:11:36 +0800
Subject: [PATCH] Add Xgboost Model

---
 examples/workflow_by_code_xgboost.py | 129 +++++++++++++++++++++++++++
 qlib/contrib/model/xgboost.py        | 114 +++++++++++++++++++++++
 2 files changed, 243 insertions(+)
 create mode 100755 examples/workflow_by_code_xgboost.py
 create mode 100755 qlib/contrib/model/xgboost.py

diff --git a/examples/workflow_by_code_xgboost.py b/examples/workflow_by_code_xgboost.py
new file mode 100755
index 000000000..0eb5f4e93
--- /dev/null
+++ b/examples/workflow_by_code_xgboost.py
@@ -0,0 +1,129 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Example workflow: fit XGBModel on an Alpha158 dataset, backtest its scores and print a risk analysis."""
+
+import sys
+from pathlib import Path
+
+import qlib
+import pandas as pd
+from qlib.config import REG_CN
+from qlib.contrib.model.xgboost import XGBModel
+from qlib.contrib.data.handler import Alpha158
+from qlib.contrib.strategy.strategy import TopkDropoutStrategy
+from qlib.contrib.evaluate import (
+    backtest as normal_backtest,
+    risk_analysis,
+)
+from qlib.utils import exists_qlib_data
+from qlib.utils import init_instance_by_config
+
+if __name__ == "__main__":
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        # get_data is imported from the "scripts" directory next to this file's parent directory
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data_cn(target_dir=provider_uri)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    MARKET = "csi300"
+    BENCHMARK = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    DATA_HANDLER_CONFIG = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": MARKET,
+    }
+
+    task = {
+        "model": {
+            "class": "XGBModel",
+            "module_path": "qlib.contrib.model.xgboost",
+            # These kwargs become the booster params of xgb.train(), so only native XGBoost names belong
+            # here; the number of trees is set through fit(num_boost_round=...), not n_estimators.
+            "kwargs": {
+                "objective": "reg:squarederror",
+                "colsample_bytree": 0.85,
+                "learning_rate": 0.0421,
+                "subsample": 0.8789,
+                "max_depth": 8,
+                "min_child_weight": 1,
+                "nthread": 4,
+                "tree_method": "hist",
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "Alpha158",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": DATA_HANDLER_CONFIG,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        },
+        # You should record the data in specific sequence
+        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
+    }
+
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+
+    model.fit(dataset)
+    pred_score = model.predict(dataset)
+
+    # save pred_score to file
+    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
+    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
+    pred_score.to_pickle(pred_score_path)
+
+    ###################################
+    # backtest
+    ###################################
+    STRATEGY_CONFIG = {
+        "topk": 50,
+        "n_drop": 5,
+    }
+    BACKTEST_CONFIG = {
+        "verbose": False,
+        "limit_threshold": 0.095,
+        "account": 100000000,
+        "benchmark": BENCHMARK,
+        "deal_price": "close",
+        "open_cost": 0.0005,
+        "close_cost": 0.0015,
+        "min_cost": 5,
+    }
+
+    # use default strategy
+    # custom Strategy, refer to: TODO: Strategy API url
+    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
+    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
+
+    ###################################
+    # analyze
+    # If need a more detailed analysis, refer to: examples/train_and_backtest.ipynb
+    ###################################
+    analysis = dict()
+    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
+    analysis["excess_return_with_cost"] = risk_analysis(
+        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
+    )
+    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
+    print(analysis_df)
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
new file mode 100755
index 000000000..95954198e
--- /dev/null
+++ b/qlib/contrib/model/xgboost.py
@@ -0,0 +1,114 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+
+class XGBModel(Model):
+    """Model wrapper that trains and queries a booster through the native ``xgb.train`` API."""
+
+    # "obj" shorthand accepted by __init__ -> XGBoost "objective" it stands for
+    _OBJECTIVES = {"mse": "reg:squarederror", "binary": "binary:logistic"}
+
+    def __init__(self, obj="mse", **kwargs):
+        """Store the booster parameters; no training happens here.
+
+        Parameters
+        ----------
+        obj : str
+            Loss shorthand, "mse" or "binary"; translated to the matching XGBoost ``objective``.
+        **kwargs
+            Booster parameters for ``xgb.train``; an explicit ``objective`` takes precedence over ``obj``.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``obj`` is not one of the supported shorthands.
+        """
+        if obj not in self._OBJECTIVES:
+            raise NotImplementedError(f"unsupported obj: {obj!r}")
+        # "obj" is not a booster parameter, so forwarding it verbatim would leave the loss unselected.
+        self._params = {"objective": self._OBJECTIVES[obj]}
+        self._params.update(kwargs)
+        self.model = None
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        num_boost_round=1000,
+        early_stopping_rounds=50,
+        verbose_eval=20,
+        evals_result=None,
+        **kwargs
+    ):
+        """Train on the "train" segment while monitoring the "train" and "valid" segments.
+
+        Parameters
+        ----------
+        dataset : DatasetH
+            Prepared for ["train", "valid"] with the "feature" and "label" column sets.
+        num_boost_round, early_stopping_rounds, verbose_eval
+            Forwarded unchanged to ``xgb.train``.
+        evals_result : dict, optional
+            Filled in place; on return "train" and "valid" each hold the first metric's history.
+        **kwargs
+            Extra keyword arguments forwarded to ``xgb.train``.
+
+        Raises
+        ------
+        ValueError
+            If the training label is not a single column.
+        """
+        # A dict() default is created once and would be shared by every call that omits the argument.
+        if evals_result is None:
+            evals_result = {}
+
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        # XGBoost needs a 1D array as its label
+        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+            # Squeeze the column axis only: a one-row segment must stay 1D, not collapse to a scalar.
+            y_train_1d = np.squeeze(y_train.values, axis=1)
+            y_valid_1d = np.squeeze(y_valid.values, axis=1)
+        else:
+            raise ValueError("XGBoost doesn't support multi-label training")
+
+        dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
+        dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
+        self.model = xgb.train(
+            self._params,
+            dtrain=dtrain,
+            num_boost_round=num_boost_round,
+            evals=[(dtrain, "train"), (dvalid, "valid")],
+            early_stopping_rounds=early_stopping_rounds,
+            verbose_eval=verbose_eval,
+            evals_result=evals_result,
+            **kwargs
+        )
+        # xgb.train stores {metric: history} per eval set; keep only the first metric's history.
+        evals_result["train"] = list(evals_result["train"].values())[0]
+        evals_result["valid"] = list(evals_result["valid"].values())[0]
+
+    def predict(self, dataset):
+        """Return predictions for the "test" segment as a Series indexed like its feature frame.
+
+        Raises
+        ------
+        ValueError
+            If ``fit`` has not produced a model yet.
+        """
+        if self.model is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare("test", col_set="feature")
+        # DMatrix expects a 2D matrix; squeezing would turn a one-row or one-feature frame into 1D.
+        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)