diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost.yaml new file mode 100644 index 000000000..187cd116b --- /dev/null +++ b/examples/benchmarks/CatBoost/workflow_config_catboost.yaml @@ -0,0 +1,53 @@ +provider_uri: "~/.qlib/qlib_data/cn_data" +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: CatBoostModel + module_path: qlib.contrib.model.catboost_model + kwargs: + loss: RMSE + iterations: 5 + learning_rate: 0.03 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: ALPHA360_Denoise + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/XGBoost/xgboost.py b/examples/benchmarks/XGBoost/xgboost.py deleted file mode 100755 index f1208eb93..000000000 --- a/examples/benchmarks/XGBoost/xgboost.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import numpy as np -import pandas as pd -import xgboost as xgb - -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP - - -class XGBModel(Model): - """XGBModel Model""" - - def __init__(self, obj="mse", **kwargs): - if obj not in {"mse", "binary"}: - raise NotImplementedError - self._params = {"obj": obj} - self._params.update(kwargs) - self.model = None - - def fit( - self, - dataset: DatasetH, - num_boost_round=1000, - early_stopping_rounds=50, - verbose_eval=20, - evals_result=dict(), - **kwargs - ): - - df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L - ) - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values) - else: - raise ValueError("XGBoost doesn't support multi-label training") - - dtrain = xgb.DMatrix(x_train.values, label=y_train_1d) - dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d) - self.model = xgb.train( - self._params, - dtrain=dtrain, - num_boost_round=num_boost_round, - evals=[(dtrain, "train"), (dvalid, "valid")], - early_stopping_rounds=early_stopping_rounds, - verbose_eval=verbose_eval, - evals_result=evals_result, - **kwargs - ) - evals_result["train"] = list(evals_result["train"].values())[0] - evals_result["valid"] = list(evals_result["valid"].values())[0] - - def predict(self, dataset): - if self.model is None: - raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") - return pd.Series(self.model.predict(xgb.DMatrix(np.squeeze(x_test.values))), index=x_test.index) diff --git a/examples/workflow_by_code.ipynb b/examples/workflow_by_code.ipynb new file mode 100644 index 000000000..f07c4f19e --- /dev/null 
+++ b/examples/workflow_by_code.ipynb @@ -0,0 +1,330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "import qlib\n", + "import pandas as pd\n", + "from qlib.config import REG_CN\n", + "from qlib.contrib.model.gbdt import LGBModel\n", + "from qlib.contrib.estimator.handler import Alpha158\n", + "from qlib.contrib.strategy.strategy import TopkDropoutStrategy\n", + "from qlib.contrib.evaluate import (\n", + " backtest as normal_backtest,\n", + " risk_analysis,\n", + ")\n", + "from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict\n", + "from qlib.workflow import R\n", + "from qlib.workflow.record_temp import SignalRecord, PortAnaRecord" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# use default data\n", + "# NOTE: need to download data from remote: python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data\n", + "provider_uri = \"~/.qlib/qlib_data/cn_data\" # target_dir\n", + "if not exists_qlib_data(provider_uri):\n", + " print(f\"Qlib data is not found in {provider_uri}\")\n", + " sys.path.append(str(Path.cwd().parent.joinpath(\"scripts\")))\n", + " from get_data import GetData\n", + " GetData().qlib_data_cn(target_dir=provider_uri)\n", + "qlib.init(provider_uri=provider_uri, region=REG_CN)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "market = \"csi300\"\n", + "benchmark = \"SH000300\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# train model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "###################################\n", + "# train model\n", + "###################################\n", + "data_handler_config = {\n", + "
\"start_time\": \"2008-01-01\",\n", + " \"end_time\": \"2020-08-01\",\n", + " \"fit_start_time\": \"2008-01-01\",\n", + " \"fit_end_time\": \"2014-12-31\",\n", + " \"instruments\": market,\n", + "}\n", + "\n", + "task = {\n", + " \"model\": {\n", + " \"class\": \"LGBModel\",\n", + " \"module_path\": \"qlib.contrib.model.gbdt\",\n", + " \"kwargs\": {\n", + " \"loss\": \"mse\",\n", + " \"colsample_bytree\": 0.8879,\n", + " \"learning_rate\": 0.0421,\n", + " \"subsample\": 0.8789,\n", + " \"lambda_l1\": 205.6999,\n", + " \"lambda_l2\": 580.9768,\n", + " \"max_depth\": 8,\n", + " \"num_leaves\": 210,\n", + " \"num_threads\": 20,\n", + " },\n", + " },\n", + " \"dataset\": {\n", + " \"class\": \"DatasetH\",\n", + " \"module_path\": \"qlib.data.dataset\",\n", + " \"kwargs\": {\n", + " \"handler\": {\n", + " \"class\": \"Alpha158\",\n", + " \"module_path\": \"qlib.contrib.data.handler\",\n", + " \"kwargs\": data_handler_config,\n", + " },\n", + " \"segments\": {\n", + " \"train\": (\"2008-01-01\", \"2014-12-31\"),\n", + " \"valid\": (\"2015-01-01\", \"2016-12-31\"),\n", + " \"test\": (\"2017-01-01\", \"2020-08-01\"),\n", + " },\n", + " },\n", + " },\n", + "}\n", + "\n", + "# model initiation\n", + "model = init_instance_by_config(task[\"model\"])\n", + "dataset = init_instance_by_config(task[\"dataset\"])\n", + "\n", + "# start exp to train model\n", + "with R.start(experiment_name=\"train_model\"):\n", + " R.log_paramters(**flatten_dict(task))\n", + " model.fit(dataset)\n", + " R.save_objects(trained_model=model)\n", + " rid = R.get_recorder().id\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# prediction, backtest & analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "###################################\n", + "# prediction, backtest & analysis\n", + "###################################\n", + "port_analysis_config = {\n", + " \"strategy\": {\n", + "
\"class\": \"TopkDropoutStrategy\",\n", + " \"module_path\": \"qlib.contrib.strategy.strategy\",\n", + " \"kwargs\": {\n", + " \"topk\": 50,\n", + " \"n_drop\": 5,\n", + " },\n", + " },\n", + " \"backtest\": {\n", + " \"verbose\": False,\n", + " \"limit_threshold\": 0.095,\n", + " \"account\": 100000000,\n", + " \"benchmark\": benchmark,\n", + " \"deal_price\": \"close\",\n", + " \"open_cost\": 0.0005,\n", + " \"close_cost\": 0.0015,\n", + " \"min_cost\": 5,\n", + " },\n", + "}\n", + "\n", + "\n", + "# backtest and analysis\n", + "with R.start(experiment_name=\"backtest_analysis\"):\n", + " recorder = R.get_recorder(rid, experiment_name=\"train_model\")\n", + " model = recorder.load_object(\"trained_model\")\n", + "\n", + " # prediction\n", + " recorder = R.get_recorder()\n", + " ba_rid = recorder.id\n", + " sr = SignalRecord(model, dataset, recorder)\n", + " sr.generate()\n", + "\n", + " # backtest & analysis\n", + " par = PortAnaRecord(recorder, port_analysis_config)\n", + " par.generate()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# analyze graphs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.contrib.report import analysis_model, analysis_position\n", + "from qlib.data import D\n", + "recorder = R.get_recorder(ba_rid, experiment_name=\"backtest_analysis\")\n", + "pred_df = recorder.load_object(\"pred.pkl\")\n", + "pred_df_dates = pred_df.index.get_level_values(level='datetime')\n", + "report_normal_df = recorder.load_object(\"portfolio_analysis/report_normal.pkl\")\n", + "positions = recorder.load_object(\"portfolio_analysis/positions_normal.pkl\")\n", + "analysis_df = recorder.load_object(\"portfolio_analysis/port_analysis.pkl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## analysis position" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### report" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_position.report_graph(report_normal_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### risk analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_position.risk_analysis_graph(analysis_df, report_normal_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## analysis model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label_df = dataset.prepare(\"test\", col_set=\"label\")\n", + "label_df.columns = ['label']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### score IC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)\n", + "analysis_position.score_ic_graph(pred_label)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### model performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analysis_model.model_performance_graph(pred_label)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + 
"nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/workflow_by_code.py b/examples/workflow_by_code.py index b70a9e963..9aaa02f35 100644 --- a/examples/workflow_by_code.py +++ b/examples/workflow_by_code.py @@ -32,18 +32,18 @@ if __name__ == "__main__": qlib.init(provider_uri=provider_uri, region=REG_CN) - MARKET = "csi300" - BENCHMARK = "SH000300" + market = "csi300" + benchmark = "SH000300" ################################### # train model ################################### - DATA_HANDLER_CONFIG = { + data_handler_config = { "start_time": "2008-01-01", "end_time": "2020-08-01", "fit_start_time": "2008-01-01", "fit_end_time": "2014-12-31", - "instruments": MARKET, + "instruments": market, } task = { @@ -69,7 +69,7 @@ if __name__ == "__main__": "handler": { "class": "Alpha158", "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, + "kwargs": data_handler_config, }, "segments": { "train": ("2008-01-01", "2014-12-31"), @@ -78,8 +78,6 @@ if __name__ == "__main__": }, }, }, - # You shoud record the data in specific sequence - "record": ["SignalRecord", "PortAnaRecord"], } port_analysis_config = { @@ -95,7 +93,7 @@ if __name__ == "__main__": "verbose": False, "limit_threshold": 0.095, "account": 100000000, - "benchmark": BENCHMARK, + "benchmark": benchmark, "deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, @@ -108,7 +106,8 @@ if __name__ == "__main__": dataset = init_instance_by_config(task["dataset"]) # start exp - with R.start("workflow"): + with R.start(experiment_name="workflow"): + R.log_paramters(**flatten_dict(task)) model.fit(dataset) # prediction diff --git a/examples/workflow_by_code_finetune.py b/examples/workflow_by_code_finetune.py index 6df8c9821..c69ecc350 100644 --- a/examples/workflow_by_code_finetune.py +++ b/examples/workflow_by_code_finetune.py @@ -32,18 +32,18 @@ if __name__ == "__main__": qlib.init(provider_uri=provider_uri, region=REG_CN) - MARKET = "csi300" - BENCHMARK = "SH000300" + 
market = "csi300" + benchmark = "SH000300" ################################### # train model ################################### - DATA_HANDLER_CONFIG = { + data_handler_config = { "start_time": "2008-01-01", "end_time": "2020-08-01", "fit_start_time": "2008-01-01", "fit_end_time": "2014-12-31", - "instruments": MARKET, + "instruments": market, } task = { @@ -69,7 +69,7 @@ if __name__ == "__main__": "handler": { "class": "Alpha158", "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG, + "kwargs": data_handler_config, }, "segments": { "train": ("2008-01-01", "2014-12-31"), @@ -78,8 +78,6 @@ if __name__ == "__main__": }, }, }, - # You shoud record the data in specific sequence - "record": ["SignalRecord", "PortAnaRecord"], } port_analysis_config = { @@ -95,7 +93,7 @@ if __name__ == "__main__": "verbose": False, "limit_threshold": 0.095, "account": 100000000, - "benchmark": BENCHMARK, + "benchmark": benchmark, "deal_price": "close", "open_cost": 0.0005, "close_cost": 0.0015, diff --git a/qlib/config.py b/qlib/config.py index 90369c79f..d05161772 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -130,7 +130,7 @@ _default_config = { "class": "MLflowExpManager", "module_path": "qlib.workflow.expm", "kwargs": { - "uri": 'file:' + str(Path(os.getcwd()).resolve() / "mlruns"), + "uri": "file:" + str(Path(os.getcwd()).resolve() / "mlruns"), "default_exp_name": "Experiment", }, }, diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index d32b251de..1fead5070 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -289,8 +289,12 @@ class DataHandlerLP(DataHandler): getattr(self, pname).append( init_instance_by_config( proc, - None if (isinstance(data_loader, dict) and "module_path" in data_loader) else data_loader_module, - accept_types=processor_module.Processor)) + None + if (isinstance(data_loader, dict) and "module_path" in data_loader) + else data_loader_module, + 
accept_types=processor_module.Processor, + ) + ) self.process_type = process_type super().__init__(instruments, start_time, end_time, data_loader, **kwargs) diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index 85a5e8389..3fb3768a0 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -32,7 +32,10 @@ def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int: def fetch_df_by_index( - df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int], fetch_orig=True, + df: pd.DataFrame, + selector: Union[pd.Timestamp, slice, str, list], + level: Union[str, int], + fetch_orig=True, ) -> pd.DataFrame: """ fetch data from `data` with `selector` and `level` @@ -55,8 +58,12 @@ def fetch_df_by_index( if fetch_orig: for slc in idx_slc: if slc != slice(None, None): - return df.loc[pd.IndexSlice[idx_slc],] + return df.loc[ + pd.IndexSlice[idx_slc], + ] else: return df else: - return df.loc[pd.IndexSlice[idx_slc],] + return df.loc[ + pd.IndexSlice[idx_slc], + ] diff --git a/qlib/workflow/utils.py b/qlib/workflow/utils.py index 0f721e035..33d251dd8 100644 --- a/qlib/workflow/utils.py +++ b/qlib/workflow/utils.py @@ -5,6 +5,7 @@ import sys, traceback, signal, atexit from . import R from .recorder import Recorder from ..log import get_module_logger + logger = get_module_logger("workflow", "INFO")