diff --git a/examples/benchmarks/CatBoost/requirements.txt b/examples/benchmarks/CatBoost/requirements.txt new file mode 100644 index 000000000..507a65944 --- /dev/null +++ b/examples/benchmarks/CatBoost/requirements.txt @@ -0,0 +1,3 @@ +pandas==1.1.2 +numpy==1.17.4 +catboost==0.24.3 diff --git a/examples/benchmarks/DNN/requirements.txt b/examples/benchmarks/DNN/requirements.txt new file mode 100644 index 000000000..16de0a438 --- /dev/null +++ b/examples/benchmarks/DNN/requirements.txt @@ -0,0 +1,4 @@ +pandas==1.1.2 +numpy==1.17.4 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/DNN/workflow_config_dnn.yaml b/examples/benchmarks/DNN/workflow_config_dnn.yaml new file mode 100644 index 000000000..0f50cbb25 --- /dev/null +++ b/examples/benchmarks/DNN/workflow_config_dnn.yaml @@ -0,0 +1,62 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DNNModelPytorch + module_path: qlib.contrib.model.pytorch_nn + kwargs: + input_dim: 360 + output_dim: 1 + layers: [256, 512, 1024, 512, 256, 128, 64] + lr: 0.001 + max_steps: 300 + batch_size: 2000 + early_stop_rounds: 50 + eval_steps: 20 + lr_decay: 0.96 + lr_decay_steps: 100 + optimizer: gd + loss: mse + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: 
[2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/GATs/requirements.txt b/examples/benchmarks/GATs/requirements.txt new file mode 100644 index 000000000..16de0a438 --- /dev/null +++ b/examples/benchmarks/GATs/requirements.txt @@ -0,0 +1,4 @@ +pandas==1.1.2 +numpy==1.17.4 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/GATs/worflow_config_gats.yaml b/examples/benchmarks/GATs/worflow_config_gats.yaml new file mode 100644 index 000000000..6c8db2e77 --- /dev/null +++ b/examples/benchmarks/GATs/worflow_config_gats.yaml @@ -0,0 +1,63 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: GAT + module_path: qlib.contrib.model.pytorch_gats + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.0 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: IC + loss: mse + base_model: GRU + seed: 0 + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + 
record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/GBDT/requirements.txt b/examples/benchmarks/GBDT/requirements.txt new file mode 100644 index 000000000..507d2d453 --- /dev/null +++ b/examples/benchmarks/GBDT/requirements.txt @@ -0,0 +1,3 @@ +pandas==1.1.2 +numpy==1.17.4 +lightgbm==3.1.0 diff --git a/examples/benchmarks/GBDT/workflow_config_gbdt.yaml b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml new file mode 100644 index 000000000..212558044 --- /dev/null +++ b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml @@ -0,0 +1,59 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LGBModel + module_path: qlib.contrib.model.gbdt + kwargs: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.0421 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - 
class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/GRU/requirements.txt b/examples/benchmarks/GRU/requirements.txt new file mode 100644 index 000000000..1fc2779c0 --- /dev/null +++ b/examples/benchmarks/GRU/requirements.txt @@ -0,0 +1,4 @@ +numpy==1.17.4 +pandas==1.1.2 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/GRU/workflow_config_gru.yaml b/examples/benchmarks/GRU/workflow_config_gru.yaml new file mode 100644 index 000000000..49b6159dc --- /dev/null +++ b/examples/benchmarks/GRU/workflow_config_gru.yaml @@ -0,0 +1,62 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: GRU + module_path: qlib.contrib.model.pytorch_gru + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.0 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: IC + loss: mse + seed: 0 + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: 
*port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/LSTM/requirements.txt b/examples/benchmarks/LSTM/requirements.txt new file mode 100644 index 000000000..1fc2779c0 --- /dev/null +++ b/examples/benchmarks/LSTM/requirements.txt @@ -0,0 +1,4 @@ +numpy==1.17.4 +pandas==1.1.2 +scikit_learn==0.23.2 +torch==1.7.0 diff --git a/examples/benchmarks/LSTM/workflow_config_lstm.yaml b/examples/benchmarks/LSTM/workflow_config_lstm.yaml new file mode 100644 index 000000000..1e3b309d2 --- /dev/null +++ b/examples/benchmarks/LSTM/workflow_config_lstm.yaml @@ -0,0 +1,62 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LSTM + module_path: qlib.contrib.model.pytorch_lstm + kwargs: + d_feat: 6 + hidden_size: 64 + num_layers: 2 + dropout: 0.0 + n_epochs: 200 + lr: 1e-3 + early_stop: 20 + batch_size: 800 + metric: IC + loss: mse + seed: 0 + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git 
a/examples/benchmarks/XGBoost/requirements.txt b/examples/benchmarks/XGBoost/requirements.txt new file mode 100644 index 000000000..077f343e5 --- /dev/null +++ b/examples/benchmarks/XGBoost/requirements.txt @@ -0,0 +1,3 @@ +numpy==1.17.4 +pandas==1.1.2 +xgboost==1.2.1 \ No newline at end of file diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml new file mode 100644 index 000000000..497ffa5b6 --- /dev/null +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml @@ -0,0 +1,62 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: XGBModel + module_path: qlib.contrib.model.xgboost + kwargs: + objective: reg:linear + n_estimators: 5000 + colsample_bytree: 0.85 + learning_rate: 0.0421 + subsample: 0.8789 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + missing: -1 + min_child_weight: 1 + nthread: 4 + tree_method: hist + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end 
class XGBModel(Model):
    """Gradient-boosted tree model trained through ``xgboost.train``.

    The constructor collects booster parameters, ``fit`` trains on the
    "train"/"valid" segments of a dataset, and ``predict`` scores the "test"
    segment.
    """

    # Short loss names accepted by ``obj`` -> XGBoost ``objective`` values.
    # XGBoost has no booster parameter called "obj" and no objective called
    # "mse"/"binary", so the short name has to be translated.
    _OBJECTIVES = {"mse": "reg:squarederror", "binary": "binary:logistic"}

    def __init__(self, obj="mse", **kwargs):
        """Store the booster parameters.

        Parameters
        ----------
        obj : str
            Loss to optimise, either ``"mse"`` or ``"binary"``.
        **kwargs
            Extra booster parameters handed to ``xgboost.train``. They are
            applied last, so an explicit ``objective`` overrides ``obj``.

        Raises
        ------
        NotImplementedError
            If ``obj`` is not one of the supported loss names.
        """
        if obj not in self._OBJECTIVES:
            raise NotImplementedError(
                f"unsupported obj {obj!r}; expected one of {sorted(self._OBJECTIVES)}"
            )
        self._params = {"objective": self._OBJECTIVES[obj]}
        self._params.update(kwargs)
        self.model = None

    @staticmethod
    def _to_1d_label(values):
        """Return the single label column of a 2D ``(n, 1)`` array as 1D."""
        if values.ndim == 2 and values.shape[1] == 1:
            # Column indexing keeps a length-1 axis for single-row input,
            # whereas ``np.squeeze`` would collapse it to a 0-d array.
            return values[:, 0]
        raise ValueError("XGBoost doesn't support multi-label training")

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train the booster on the "train" segment, validating on "valid".

        Parameters
        ----------
        dataset : DatasetH
            Must provide "train" and "valid" segments with "feature" and
            "label" column sets.
        num_boost_round : int
            Forwarded to ``xgboost.train``.
        early_stopping_rounds : int
            Forwarded to ``xgboost.train``.
        verbose_eval : int or bool
            Forwarded to ``xgboost.train``.
        evals_result : dict, optional
            Filled in place. After training, ``evals_result["train"]`` and
            ``evals_result["valid"]`` each hold the history of the first
            recorded metric. A new dict is created when omitted.
        **kwargs
            Extra keyword arguments forwarded to ``xgboost.train``.

        Raises
        ------
        ValueError
            If the label is not a single column.
        """
        # Create the dict per call; a ``dict()`` default in the signature
        # would be one object shared by every call that omits the argument.
        if evals_result is None:
            evals_result = {}

        # NOTE(review): DK_L presumably selects the "learn" view of the
        # handler data -- confirm against DataHandlerLP.
        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # XGBoost needs a 1D array as its label
        y_train_1d = self._to_1d_label(y_train.values)
        y_valid_1d = self._to_1d_label(y_valid.values)

        dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
        dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
        self.model = xgb.train(
            self._params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
            evals_result=evals_result,
            **kwargs
        )
        # Flatten {"train": {metric: history}} to {"train": history}, keeping
        # only the first metric of each evaluation set.
        evals_result["train"] = list(evals_result["train"].values())[0]
        evals_result["valid"] = list(evals_result["valid"].values())[0]

    def predict(self, dataset):
        """Score the "test" segment of ``dataset``.

        Returns
        -------
        pd.Series
            Predictions indexed like the test feature frame.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare("test", col_set="feature")
        # Pass the 2D feature matrix unchanged: squeezing it would turn a
        # one-row or one-column test set into a 1D array.
        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)
= CUR_DIR.joinpath(estimator_name, 'sacred', latest_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# read estimator result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_df = pd.read_pickle(estimator_dir.joinpath('pred.pkl'))\n", - "report_normal_df = pd.read_pickle(estimator_dir.joinpath('report_normal.pkl'))\n", - "report_normal_df.index.names = ['index']\n", - "\n", - "analysis_df = pd.read_pickle(estimator_dir.joinpath('analysis.pkl'))\n", - "positions = pickle.load(estimator_dir.joinpath('positions.pkl').open('rb'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# analyze graphs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from qlib.data import D\n", - "from qlib.contrib.report import analysis_model, analysis_position\n", - "pred_df_dates = pred_df.index.get_level_values(level='datetime')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## analysis position" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "stock_ret = D.features(D.instruments(MARKET), ['Ref($close, -1)/$close - 1'], pred_df_dates.min(), pred_df_dates.max())\n", - "stock_ret.columns = ['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### report" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.report_graph(report_normal_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### risk analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
analysis model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "label_df = D.features(D.instruments(MARKET), ['Ref($close, -2)/Ref($close, -1) - 1'], pred_df_dates.min(), pred_df_dates.max())\n", - "label_df.columns = ['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### score IC" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n", - "analysis_position.score_ic_graph(pred_label)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### model performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_model.model_performance_graph(pred_label)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/examples/estimator/estimator_config.yaml b/examples/estimator/estimator_config.yaml deleted file mode 100644 index eaffc181b..000000000 --- a/examples/estimator/estimator_config.yaml +++ /dev/null @@ -1,53 +0,0 @@ -experiment: - name: estimator_example - observer_type: file_storage - mode: train - -model: - class: LGBModel - module_path: qlib.gbdt.model.gbdt - args: - loss: mse - colsample_bytree: 0.8879 - learning_rate: 0.0421 - subsample: 0.8789 - lambda_l1: 205.6999 - lambda_l2: 580.9768 - max_depth: 8 - num_leaves: 210 - num_threads: 20 -data: - class: Alpha158 - args: - dropna_label: True - filter: - market: 
csi300 -trainer: - class: StaticTrainer - args: - train_start_date: 2008-01-01 - train_end_date: 2014-12-31 - validate_start_date: 2015-01-01 - validate_end_date: 2016-12-31 - test_start_date: 2017-01-01 - test_end_date: 2020-08-01 -strategy: - class: TopkDropoutStrategy - args: - topk: 50 - n_drop: 5 -backtest: - normal_backtest_args: - verbose: False - limit_threshold: 0.095 - account: 100000000 - benchmark: SH000300 - deal_price: close - open_cost: 0.0005 - close_cost: 0.0015 - min_cost: 5 - -qlib_data: - # when testing, please modify the following parameters according to the specific environment - provider_uri: "~/.qlib/qlib_data/cn_data" - region: "cn" diff --git a/examples/estimator/estimator_config_dnn.yaml b/examples/estimator/estimator_config_dnn.yaml deleted file mode 100644 index 1aa122313..000000000 --- a/examples/estimator/estimator_config_dnn.yaml +++ /dev/null @@ -1,55 +0,0 @@ -experiment: - name: estimator_example - observer_type: file_storage - mode: train - -model: - module_path: qlib.model.pytorch_nn - class: DNNModelPytorch - args: - loss: mse - input_dim: 158 - output_dim: 1 - lr: 0.002 - lr_decay: 0.96 - lr_decay_steps: 100 - optimizer: 'adam' - max_steps: 8000 - batch_size: 4096 - GPU: '0' -data: - class: Alpha158 - args: - dropna_label: True - dropna_feature: True - filter: - market: csi300 -trainer: - class: StaticTrainer - args: - train_start_date: 2007-01-01 - train_end_date: 2014-12-31 - validate_start_date: 2015-01-01 - validate_end_date: 2016-12-31 - test_start_date: 2017-01-01 - test_end_date: 2020-08-01 -strategy: - class: TopkDropoutStrategy - args: - topk: 50 - n_drop: 5 -backtest: - normal_backtest_args: - verbose: False - limit_threshold: 0.095 - account: 100000000 - benchmark: SH000300 - deal_price: close - open_cost: 0.0005 - close_cost: 0.0015 - min_cost: 5 - -qlib_data: - # when testing, please modify the following parameters according to the specific environment - provider_uri: "~/.qlib/qlib_data/cn_data" - region: "cn" diff 
--git a/examples/train_and_backtest.py b/examples/train_and_backtest.py deleted file mode 100644 index 6905ef169..000000000 --- a/examples/train_and_backtest.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import sys -from pathlib import Path - -import qlib -import pandas as pd -from qlib.config import REG_CN -from qlib.contrib.model.gbdt import LGBModel -from qlib.contrib.data.handler import Alpha158 -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data - - -if __name__ == "__main__": - - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - - GetData().qlib_data(target_dir=provider_uri) - - qlib.init(provider_uri=provider_uri, region=REG_CN) - - MARKET = "CSI300" - BENCHMARK = "SH000300" - - ################################### - # train model - ################################### - DATA_HANDLER_CONFIG = { - "dropna_label": True, - "start_date": "2008-01-01", - "end_date": "2020-08-01", - "market": MARKET, - } - - TRAINER_CONFIG = { - "train_start_date": "2008-01-01", - "train_end_date": "2014-12-31", - "validate_start_date": "2015-01-01", - "validate_end_date": "2016-12-31", - "test_start_date": "2017-01-01", - "test_end_date": "2020-08-01", - } - - # use default DataHandler - # custom DataHandler, refer to: TODO: DataHandler API url - x_train, y_train, x_validate, y_validate, x_test, y_test = Alpha158(**DATA_HANDLER_CONFIG).get_split_data( - **TRAINER_CONFIG - ) - - MODEL_CONFIG = { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - 
"max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - } - # use default model - # custom Model, refer to: TODO: Model API url - model = LGBModel(**MODEL_CONFIG) - model.fit(x_train, y_train, x_validate, y_validate) - _pred = model.predict(x_test) - _pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns) - - # backtest requires pred_score - pred_score = pd.DataFrame(index=_pred.index) - pred_score["score"] = _pred.iloc(axis=1)[0] - - # save pred_score to file - pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() - pred_score_path.parent.mkdir(exist_ok=True, parents=True) - pred_score.to_pickle(pred_score_path) - - ################################### - # backtest - ################################### - STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, - } - BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": BENCHMARK, - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, - } - - # use default strategy - # custom Strategy, refer to: TODO: Strategy API url - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) - - ################################### - # analyze - # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb - ################################### - analysis = dict() - analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(analysis) # type: pd.DataFrame - print(analysis_df) diff --git a/examples/train_backtest_analyze.ipynb b/examples/train_backtest_analyze.ipynb deleted file mode 100644 index d8987b58f..000000000 --- a/examples/train_backtest_analyze.ipynb +++ /dev/null @@ -1,338 +0,0 @@ -{ - "cells": [ - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "\n", - "import qlib\n", - "import pandas as pd\n", - "from qlib.config import REG_CN\n", - "from qlib.contrib.model.gbdt import LGBModel\n", - "from qlib.contrib.estimator.handler import Alpha158\n", - "from qlib.contrib.strategy.strategy import TopkDropoutStrategy\n", - "from qlib.contrib.evaluate import (\n", - " backtest as normal_backtest,\n", - " risk_analysis,\n", - ")\n", - "from qlib.utils import exists_qlib_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# use default data\n", - "# NOTE: need to download data from remote: python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn\n", - "provider_uri = \"~/.qlib/qlib_data/cn_data\" # target_dir\n", - "if not exists_qlib_data(provider_uri):\n", - " print(f\"Qlib data is not found in {provider_uri}\")\n", - " sys.path.append(str(Path.cwd().parent.joinpath(\"scripts\")))\n", - " from get_data import GetData\n", - " GetData().qlib_data(target_dir=provider_uri)\n", - "qlib.init(provider_uri=provider_uri, region=REG_CN)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MARKET = \"csi300\"\n", - "BENCHMARK = \"SH000300\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# train model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# train model\n", - "###################################\n", - "DATA_HANDLER_CONFIG = {\n", - " \"dropna_label\": True,\n", - " \"start_date\": \"2008-01-01\",\n", - " \"end_date\": \"2020-08-01\",\n", - " \"market\": MARKET,\n", - "}\n", - "\n", - "TRAINER_CONFIG = {\n", - " \"train_start_date\": 
\"2008-01-01\",\n", - " \"train_end_date\": \"2014-12-31\",\n", - " \"validate_start_date\": \"2015-01-01\",\n", - " \"validate_end_date\": \"2016-12-31\",\n", - " \"test_start_date\": \"2017-01-01\",\n", - " \"test_end_date\": \"2020-08-01\",\n", - "}\n", - "\n", - "# use default DataHandler\n", - "# custom DataHandler, refer to: TODO: DataHandler api url\n", - "x_train, y_train, x_validate, y_validate, x_test, y_test = Alpha158(**DATA_HANDLER_CONFIG).get_split_data(**TRAINER_CONFIG)\n", - "\n", - "\n", - "MODEL_CONFIG = {\n", - " \"loss\": \"mse\",\n", - " \"colsample_bytree\": 0.8879,\n", - " \"learning_rate\": 0.0421,\n", - " \"subsample\": 0.8789,\n", - " \"lambda_l1\": 205.6999,\n", - " \"lambda_l2\": 580.9768,\n", - " \"max_depth\": 8,\n", - " \"num_leaves\": 210,\n", - " \"num_threads\": 20,\n", - "}\n", - "# use default model\n", - "# custom Model, refer to: TODO: Model api url\n", - "model = LGBModel(**MODEL_CONFIG)\n", - "model.fit(x_train, y_train, x_validate, y_validate)\n", - "_pred = model.predict(x_test)\n", - "_pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns)\n", - "\n", - "# backtest requires pred_score\n", - "pred_score = pd.DataFrame(index=_pred.index)\n", - "pred_score[\"score\"] = _pred.iloc(axis=1)[0]\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# backtest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# backtest\n", - "###################################\n", - "STRATEGY_CONFIG = {\n", - " \"topk\": 50,\n", - " \"n_drop\": 5}\n", - "BACKTEST_CONFIG = {\n", - " \"verbose\": False,\n", - " \"limit_threshold\": 0.095,\n", - " \"account\": 100000000,\n", - " \"benchmark\": BENCHMARK,\n", - " \"deal_price\": \"close\",\n", - " \"open_cost\": 0.0005,\n", - " \"close_cost\": 0.0015,\n", - " \"min_cost\": 5,\n", - " \n", - "}\n", - "\n", - "# use default 
strategy\n", - "# custom Strategy, refer to: TODO: Strategy api url\n", - "strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)\n", - "report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# analyze" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###################################\n", - "# analyze\n", - "# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb\n", - "###################################\n", - "analysis = dict()\n", - "analysis[\"excess_return_without_cost\"] = risk_analysis(report_normal[\"return\"] - report_normal[\"bench\"])\n", - "analysis[\"excess_return_with_cost\"] = risk_analysis(\n", - " report_normal[\"return\"] - report_normal[\"bench\"] - report_normal[\"cost\"]\n", - ")\n", - "analysis_df = pd.concat(analysis) # type: pd.DataFrame\n", - "print(analysis_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# analyze graphs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from qlib.contrib.report import analysis_model, analysis_position\n", - "from qlib.data import D\n", - "pred_df_dates = pred_score.index.get_level_values(level='datetime')\n", - "report_normal_df = report_normal\n", - "positions = positions_normal\n", - "pred_df = pred_score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## analysis position" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "stock_ret = D.features(D.instruments(MARKET), ['Ref($close, -1)/$close - 1'], pred_df_dates.min(), pred_df_dates.max())\n", - "stock_ret.columns = ['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### report" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.report_graph(report_normal_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### risk analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## analysis model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "label_df = D.features(D.instruments(MARKET), ['Ref($close, -2)/Ref($close, -1) - 1'], pred_df_dates.min(), pred_df_dates.max())\n", - "label_df.columns = ['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### score IC" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n", - "analysis_position.score_ic_graph(pred_label)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### model performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analysis_model.model_performance_graph(pred_label)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - 
"toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py deleted file mode 100755 index 06845d448..000000000 --- a/examples/workflow_by_code_gats.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import sys -from pathlib import Path - -import qlib -import pandas as pd -from qlib.config import REG_CN -from qlib.contrib.model.pytorch_gats import GAT -from qlib.contrib.data.handler import ALPHA360_Denoise -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data - -# from qlib.model.learner import train_model -from qlib.utils import init_instance_by_config - -import pickle - -if __name__ == "__main__": - - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - - GetData().qlib_data_cn(target_dir=provider_uri) - - qlib.init(provider_uri=provider_uri, region=REG_CN) - - MARKET = "csi300" - BENCHMARK = "SH000300" - - ################################### - # train model - ################################### - DATA_HANDLER_CONFIG = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": MARKET, - } - - TRAINER_CONFIG = { - "train_start_time": "2008-01-01", - "train_end_time": "2014-12-31", - "validate_start_time": "2015-01-01", - "validate_end_time": "2016-12-31", - "test_start_time": "2017-01-01", - "test_end_time": "2020-08-01", - } - - task = { - "model": { - "class": "GAT", - "module_path": 
"qlib.contrib.model.pytorch_gats", - "kwargs": { - "d_feat": 6, - "hidden_size": 64, - "num_layers": 2, - "dropout": 0.0, - "n_epochs": 200, - "lr": 1e-3, - "early_stop": 20, - "batch_size": 800, - "metric": "IC", - "loss": "mse", - "base_model":"GRU", - "seed": 0, - "GPU": 0, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "ALPHA360_Denoise", - "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - } - # You shoud record the data in specific sequence - # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], - } - - # model = train_model(task) - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - model.fit(dataset) - - pred_score = model.predict(dataset) - - # save pred_score to file - pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() - pred_score_path.parent.mkdir(exist_ok=True, parents=True) - pred_score.to_pickle(pred_score_path) - - ################################### - # backtest - ################################### - STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, - } - BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": BENCHMARK, - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, - } - - # use default strategy - # custom Strategy, refer to: TODO: Strategy API url - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) - - ################################### - # analyze - # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb - ################################### - analysis = dict() - analysis["excess_return_without_cost"] 
= risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(analysis) # type: pd.DataFrame - print(analysis_df) diff --git a/examples/workflow_by_code_gru.py b/examples/workflow_by_code_gru.py deleted file mode 100755 index e55f0ae45..000000000 --- a/examples/workflow_by_code_gru.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import sys -from pathlib import Path - -import qlib -import pandas as pd -from qlib.config import REG_CN -from qlib.contrib.model.pytorch_gru import GRU -from qlib.contrib.data.handler import ALPHA360_Denoise -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data - -# from qlib.model.learner import train_model -from qlib.utils import init_instance_by_config - -import pickle - -if __name__ == "__main__": - - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - - GetData().qlib_data_cn(target_dir=provider_uri) - - qlib.init(provider_uri=provider_uri, region=REG_CN) - - MARKET = "csi300" - BENCHMARK = "SH000300" - - ################################### - # train model - ################################### - DATA_HANDLER_CONFIG = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": MARKET, - } - - TRAINER_CONFIG = { - "train_start_time": "2008-01-01", - "train_end_time": "2014-12-31", - "validate_start_time": "2015-01-01", - "validate_end_time": "2016-12-31", - 
"test_start_time": "2017-01-01", - "test_end_time": "2020-08-01", - } - - task = { - "model": { - "class": "GRU", - "module_path": "qlib.contrib.model.pytorch_gru", - "kwargs": { - "d_feat": 6, - "hidden_size": 64, - "num_layers": 2, - "dropout": 0.0, - "n_epochs": 200, - "lr": 1e-3, - "early_stop": 20, - "batch_size": 800, - "metric": "IC", - "loss": "mse", - "seed": 0, - "GPU": 0, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "ALPHA360_Denoise", - "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - } - # You shoud record the data in specific sequence - # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], - } - - # model = train_model(task) - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - model.fit(dataset) - - pred_score = model.predict(dataset) - - # save pred_score to file - pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() - pred_score_path.parent.mkdir(exist_ok=True, parents=True) - pred_score.to_pickle(pred_score_path) - - ################################### - # backtest - ################################### - STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, - } - BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": BENCHMARK, - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, - } - - # use default strategy - # custom Strategy, refer to: TODO: Strategy API url - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) - - ################################### - # analyze - # If need a more detailed analysis, refer to: 
examples/train_and_bakctest.ipynb - ################################### - analysis = dict() - analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(analysis) # type: pd.DataFrame - print(analysis_df) diff --git a/examples/workflow_by_code_lstm.py b/examples/workflow_by_code_lstm.py deleted file mode 100755 index 1815d2fec..000000000 --- a/examples/workflow_by_code_lstm.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import sys -from pathlib import Path - -import qlib -import pandas as pd -from qlib.config import REG_CN -from qlib.contrib.model.pytorch_lstm import LSTM -from qlib.contrib.data.handler import ALPHA360_Denoise -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data - -# from qlib.model.learner import train_model -from qlib.utils import init_instance_by_config - -import pickle - -if __name__ == "__main__": - - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - - GetData().qlib_data_cn(target_dir=provider_uri) - - qlib.init(provider_uri=provider_uri, region=REG_CN) - - MARKET = "csi300" - BENCHMARK = "SH000300" - - ################################### - # train model - ################################### - DATA_HANDLER_CONFIG = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - "fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": MARKET, - } - - TRAINER_CONFIG = { - 
"train_start_time": "2008-01-01", - "train_end_time": "2014-12-31", - "validate_start_time": "2015-01-01", - "validate_end_time": "2016-12-31", - "test_start_time": "2017-01-01", - "test_end_time": "2020-08-01", - } - - task = { - "model": { - "class": "LSTM", - "module_path": "qlib.contrib.model.pytorch_lstm", - "kwargs": { - "d_feat": 6, - "hidden_size": 64, - "num_layers": 2, - "dropout": 0.0, - "n_epochs": 200, - "lr": 1e-3, - "early_stop": 20, - "batch_size": 800, - "metric": "IC", - "loss": "mse", - "seed": 0, - "GPU": 0, - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "ALPHA360_Denoise", - "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - } - # You shoud record the data in specific sequence - # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], - } - - # model = train_model(task) - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - model.fit(dataset) - - pred_score = model.predict(dataset) - - # save pred_score to file - pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() - pred_score_path.parent.mkdir(exist_ok=True, parents=True) - pred_score.to_pickle(pred_score_path) - - ################################### - # backtest - ################################### - STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, - } - BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": BENCHMARK, - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, - } - - # use default strategy - # custom Strategy, refer to: TODO: Strategy API url - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - report_normal, positions_normal = normal_backtest(pred_score, 
strategy=strategy, **BACKTEST_CONFIG) - - ################################### - # analyze - # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb - ################################### - analysis = dict() - analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(analysis) # type: pd.DataFrame - print(analysis_df) diff --git a/examples/workflow_by_code_xgboost.py b/examples/workflow_by_code_xgboost.py deleted file mode 100755 index 94b43f449..000000000 --- a/examples/workflow_by_code_xgboost.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import sys -from pathlib import Path - -import qlib -import pandas as pd -from qlib.config import REG_CN -from qlib.contrib.model.xgboost import XGBModel -from qlib.contrib.data.handler import Alpha158 -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.utils import exists_qlib_data - -# from qlib.model.learner import train_model -from qlib.utils import init_instance_by_config - -if __name__ == "__main__": - - # use default data - provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir - if not exists_qlib_data(provider_uri): - print(f"Qlib data is not found in {provider_uri}") - sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) - from get_data import GetData - - GetData().qlib_data_cn(target_dir=provider_uri) - - qlib.init(provider_uri=provider_uri, region=REG_CN) - - MARKET = "csi300" - BENCHMARK = "SH000300" - - ################################### - # train model - ################################### - DATA_HANDLER_CONFIG = { - "start_time": "2008-01-01", - "end_time": "2020-08-01", - 
"fit_start_time": "2008-01-01", - "fit_end_time": "2014-12-31", - "instruments": MARKET, - } - - TRAINER_CONFIG = { - "train_start_time": "2008-01-01", - "train_end_time": "2014-12-31", - "validate_start_time": "2015-01-01", - "validate_end_time": "2016-12-31", - "test_start_time": "2017-01-01", - "test_end_time": "2020-08-01", - } - - task = { - "model": { - "class": "XGBModel", - "module_path": "qlib.contrib.model.xgboost", - "kwargs": { - "objective": "reg:linear", - "n_estimators": 5000, - "colsample_bytree": 0.85, - "learning_rate": 0.0421, - "subsample": 0.8789, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, - "missing": -1, - "min_child_weight": 1, - "nthread": 4, - "tree_method": "hist", - }, - }, - "dataset": { - "class": "DatasetH", - "module_path": "qlib.data.dataset", - "kwargs": { - "handler": { - "class": "Alpha158", - "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, - }, - "segments": { - "train": ("2008-01-01", "2014-12-31"), - "valid": ("2015-01-01", "2016-12-31"), - "test": ("2017-01-01", "2020-08-01"), - }, - }, - } - # You shoud record the data in specific sequence - # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], - } - - # model = train_model(task) - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - - model.fit(dataset) - pred_score = model.predict(dataset) - - # save pred_score to file - pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() - pred_score_path.parent.mkdir(exist_ok=True, parents=True) - pred_score.to_pickle(pred_score_path) - - ################################### - # backtest - ################################### - STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, - } - BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": BENCHMARK, - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, - } - - # use default strategy - # 
custom Strategy, refer to: TODO: Strategy API url - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) - - ################################### - # analyze - # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb - ################################### - analysis = dict() - analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(analysis) # type: pd.DataFrame - print(analysis_df) diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index edfb26d72..22ed6812d 100755 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -121,7 +121,11 @@ class GAT(Model): self._scorer = mean_squared_error if loss == "mse" else roc_auc_score self.GAT_model = GATModel( - d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout, base_model=self.base_model + d_feat=self.d_feat, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + base_model=self.base_model, ) if optimizer.lower() == "adam": self.train_optimizer = optim.Adam(self.GAT_model.parameters(), lr=self.lr) @@ -321,11 +325,10 @@ class GAT(Model): class GATModel(nn.Module): - - def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model='GRU'): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0, base_model="GRU"): super().__init__() - if base_model == 'GRU': + if base_model == "GRU": self.rnn = nn.GRU( input_size=d_feat, hidden_size=hidden_size, @@ -333,7 +336,7 @@ class GATModel(nn.Module): batch_first=True, dropout=dropout, ) - elif base_model == 'LSTM': + elif base_model == "LSTM": self.rnn = nn.LSTM( input_size=d_feat, 
hidden_size=hidden_size, @@ -342,7 +345,7 @@ class GATModel(nn.Module): dropout=dropout, ) else: - raise ValueError('unknown base model name `%s`'%base_model) + raise ValueError("unknown base model name `%s`" % base_model) self.hidden_size = hidden_size self.bn1 = nn.BatchNorm1d(num_features=hidden_size, track_running_stats=False) @@ -354,19 +357,19 @@ class GATModel(nn.Module): self.d_feat = d_feat - def cal_convariance(self, x, y): # the 2nd dimension of x and y are the same - e_x = torch.mean(x, dim = 1).reshape(-1, 1) - e_y = torch.mean(y, dim = 1).reshape(-1, 1) + def cal_convariance(self, x, y): # the 2nd dimension of x and y are the same + e_x = torch.mean(x, dim=1).reshape(-1, 1) + e_y = torch.mean(y, dim=1).reshape(-1, 1) e_x_e_y = e_x.mm(torch.t(e_y)) x_extend = x.reshape(x.shape[0], 1, x.shape[1]).repeat(1, y.shape[0], 1) y_extend = y.reshape(1, y.shape[0], y.shape[1]).repeat(x.shape[0], 1, 1) - e_xy = torch.mean(x_extend*y_extend, dim = 2) + e_xy = torch.mean(x_extend * y_extend, dim=2) return e_xy - e_x_e_y def forward(self, x): # x: [N, F*T] - x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] - x = x.permute(0, 2, 1) # [N, T, F] + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] out, _ = self.rnn(x) hidden = out[:, -1, :] hidden = self.bn1(hidden) @@ -380,4 +383,4 @@ class GATModel(nn.Module): output = self.fc(output) output = self.bn2(output) output = self.leaky_relu(output) - return self.fc_out(output).squeeze() \ No newline at end of file + return self.fc_out(output).squeeze() diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 4eb41c250..8b8454380 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -317,7 +317,6 @@ class LSTM(Model): class LSTMModel(nn.Module): - def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): super().__init__() @@ -334,7 +333,7 @@ class LSTMModel(nn.Module): def forward(self, x): # x: [N, 
F*T] - x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] - x = x.permute(0, 2, 1) # [N, T, F] + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] out, _ = self.rnn(x) - return self.fc_out(out[:, -1, :]).squeeze() \ No newline at end of file + return self.fc_out(out[:, -1, :]).squeeze() diff --git a/qlib/contrib/report/analysis_position/report.py b/qlib/contrib/report/analysis_position/report.py index e8bb5313f..714cfdd9c 100644 --- a/qlib/contrib/report/analysis_position/report.py +++ b/qlib/contrib/report/analysis_position/report.py @@ -75,11 +75,12 @@ def _report_figure(df: pd.DataFrame) -> [list, tuple]: max_start_date, max_end_date = _calculate_maximum(report_df) ex_max_start_date, ex_max_end_date = _calculate_maximum(report_df, True) + index_name = report_df.index.name _temp_df = report_df.reset_index() _temp_df.loc[-1] = 0 _temp_df = _temp_df.shift(1) - _temp_df.loc[0, "index"] = "T0" - _temp_df.set_index("index", inplace=True) + _temp_df.loc[0, index_name] = "T0" + _temp_df.set_index(index_name, inplace=True) _temp_df.iloc[0] = 0 report_df = _temp_df diff --git a/qlib/contrib/report/graph.py b/qlib/contrib/report/graph.py index 082eafa49..07ed94f90 100644 --- a/qlib/contrib/report/graph.py +++ b/qlib/contrib/report/graph.py @@ -11,7 +11,7 @@ import pandas as pd import plotly.offline as py import plotly.graph_objs as go -from plotly.tools import make_subplots +from plotly.subplots import make_subplots from plotly.figure_factory import create_distplot from ...utils import get_module_by_module_path @@ -357,7 +357,7 @@ class SubplotsGraph(object): # _item.pop('yaxis', None) for _g_obj in _graph_data: - self._figure.append_trace(_g_obj, row=row, col=col) + self._figure.add_trace(_g_obj, row=row, col=col) if self._sub_graph_layout is not None: for k, v in self._sub_graph_layout.items(): diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index 6acbee66e..f660a8098 100644 --- a/qlib/workflow/cli.py +++ 
b/qlib/workflow/cli.py @@ -6,8 +6,8 @@ from pathlib import Path import qlib import fire -import yaml import pandas as pd +import ruamel.yaml as yaml from qlib.config import REG_CN from qlib.utils import init_instance_by_config from qlib.workflow import R @@ -16,7 +16,7 @@ from qlib.workflow.record_temp import SignalRecord # worflow handler function def workflow(config_path): with open(config_path) as fp: - config = yaml.load(fp, Loader=yaml.FullLoader) + config = yaml.load(fp, Loader=yaml.Loader) provider_uri = config.get("provider_uri") qlib.init(provider_uri=provider_uri, region=REG_CN) @@ -26,7 +26,8 @@ def workflow(config_path): dataset = init_instance_by_config(config.get("task")["dataset"]) # start exp - with R.start("workflow"): + with R.start(experiment_name="workflow"): + R.log_paramters(**flatten_dict(task)) model.fit(dataset) recorder = R.get_recorder() diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index d6b4d608e..7d4c79364 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import re import pandas as pd from pathlib import Path from pprint import pprint @@ -37,12 +38,14 @@ class RecordTemp: """ raise NotImplementedError(f"Please implement the `generate` method.") - def load(self, **kwargs): + def load(self, name, **kwargs): """ Load the stored records. Parameters ---------- + name : str + the name for the file to be load. kwargs Return @@ -51,6 +54,16 @@ class RecordTemp: """ raise NotImplementedError(f"Please implement the `load` method.") + def list(self): + """ + List the stored records. + + Return + ------ + A list of all the stored records. + """ + raise NotImplementedError(f"Please implement the `list` method.") + def check(self, **kwargs): """ Check if the records is properly generated and saved. 
@@ -81,6 +94,8 @@ class SignalRecord(RecordTemp): def generate(self, **kwargs): # generate prediciton pred = self.model.predict(self.dataset) + if isinstance(pred, pd.Series): + pred = pred.to_frame("score") self.recorder.save_objects(**{"pred.pkl": pred}) logger.info( f"Signal record 'pred.pkl' has been saved as the artifact of the Experiment {self.recorder.experiment_id}" @@ -89,11 +104,14 @@ class SignalRecord(RecordTemp): pprint(f"The following are prediction results of the {type(self.model).__name__} model.") pprint(pred.head(5)) - def load(self): + def load(self, name="pred.pkl"): # try to load the saved object - pred = self.recorder.load_object("pred.pkl") + pred = self.recorder.load_object(name) return pred + def list(self): + return ["pred.pkl"] + def check(self, **kwargs): artifacts = self.recorder.list_artifacts() for artifact in artifacts: @@ -165,10 +183,20 @@ class PortAnaRecord(SignalRecord): pprint("The following are analysis results of the excess return with cost.") pprint(analysis["excess_return_with_cost"]) - def load(self): + def load(self, name): # try to load the saved object - pred = self.recorder.load_object(self.artifact_path / "port_analysis.pkl") - return pred + if self.artifact_path not in name: + file_name = re.split(r" |/|\\", name)[-1] + name = f"{self.artifact_path}/{file_name}" + result = self.recorder.load_object(name) + return result + + def list(self): + return [ + f"{self.artifact_path}/report_normal.pkl", + f"{self.artifact_path}/positions_normal.pkl", + f"{self.artifact_path}/port_analysis.pkl", + ] def check(self): artifacts = self.recorder.list_artifacts(self.artifact_path)