diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e196c124b..d1e01e46b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -50,9 +50,10 @@ jobs: cd tests pytest . --durations=0 - - name: Test data downloads and examples + - name: Test data downloads run: | python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - # cd examples - # estimator -c estimator/estimator_config.yaml - # jupyter nbconvert --execute estimator/analyze_from_estimator.ipynb --to html \ No newline at end of file + + - name: Test workflow by config + run: | + workflow_by_config examples/benchmarks/GBDT/workflow_config_gbdt.yaml \ No newline at end of file diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost.yaml index 187cd116b..d66418544 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/benchmarks/DNN/workflow_config_dnn.yaml b/examples/benchmarks/DNN/workflow_config_dnn.yaml index 0f50cbb25..8f785aa76 100644 --- a/examples/benchmarks/DNN/workflow_config_dnn.yaml +++ b/examples/benchmarks/DNN/workflow_config_dnn.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config @@ -28,18 +29,16 @@ task: class: DNNModelPytorch module_path: qlib.contrib.model.pytorch_nn kwargs: + loss: mse input_dim: 360 output_dim: 1 - layers: [256, 512, 1024, 512, 256, 128, 64] - lr: 0.001 - max_steps: 300 - batch_size: 2000 - early_stop_rounds: 50 - eval_steps: 20 + lr: 0.002 lr_decay: 0.96 lr_decay_steps: 100 - optimizer: gd - loss: mse + optimizer: adam + max_steps: 8000 + batch_size: 4096 + GPU: 0 dataset: class: DatasetH module_path: qlib.data.dataset diff --git a/examples/benchmarks/GATs/worflow_config_gats.yaml b/examples/benchmarks/GATs/worflow_config_gats.yaml index 6c8db2e77..382c14f01 100644 --- a/examples/benchmarks/GATs/worflow_config_gats.yaml +++ b/examples/benchmarks/GATs/worflow_config_gats.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/benchmarks/GBDT/workflow_config_gbdt.yaml b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml index 212558044..eeb9db7bd 100644 --- a/examples/benchmarks/GBDT/workflow_config_gbdt.yaml +++ b/examples/benchmarks/GBDT/workflow_config_gbdt.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/benchmarks/GRU/workflow_config_gru.yaml b/examples/benchmarks/GRU/workflow_config_gru.yaml index 49b6159dc..4e9ebc670 100644 --- a/examples/benchmarks/GRU/workflow_config_gru.yaml +++ b/examples/benchmarks/GRU/workflow_config_gru.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/benchmarks/LSTM/workflow_config_lstm.yaml b/examples/benchmarks/LSTM/workflow_config_lstm.yaml index 1e3b309d2..b45f94569 100644 --- a/examples/benchmarks/LSTM/workflow_config_lstm.yaml +++ b/examples/benchmarks/LSTM/workflow_config_lstm.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml index 497ffa5b6..fb88f1058 100644 --- a/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost.yaml @@ -1,4 +1,5 @@ provider_uri: "~/.qlib/qlib_data/cn_data" +region: cn market: &market csi300 benchmark: &benchmark SH000300 data_handler_config: &data_handler_config diff --git a/examples/run_all_model.py b/examples/run_all_model.py new file mode 100644 index 000000000..6efde6a52 --- /dev/null +++ b/examples/run_all_model.py @@ -0,0 +1,267 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import os +import sys +import venv +import glob +import shutil +import tempfile +from pathlib import Path +from subprocess import Popen, PIPE +from threading import Thread +from pprint import pprint +from urllib.parse import urlparse +from urllib.request import urlretrieve + +import qlib +from qlib.config import REG_CN +from qlib.workflow import R +from qlib.workflow.cli import workflow + +# init qlib +provider_uri = "~/.qlib/qlib_data/cn_data" +qlib.init(provider_uri=provider_uri, region=REG_CN) + + +class ExtendedEnvBuilder(venv.EnvBuilder): + """ + Thie class is modified based on https://docs.python.org/3/library/venv.html. + This builder installs setuptools and pip so that you can pip or + easy_install other packages into the created virtual environment. + + :param nodist: If true, setuptools and pip are not installed into the + created virtual environment. + :param nopip: If true, pip is not installed into the created + virtual environment. + :param progress: If setuptools or pip are installed, the progress of the + installation can be monitored by passing a progress + callable. If specified, it is called with two + arguments: a string indicating some progress, and a + context indicating where the string is coming from. + The context argument can have one of three values: + 'main', indicating that it is called from virtualize() + itself, and 'stdout' and 'stderr', which are obtained + by reading lines from the output streams of a subprocess + which is used to install the app. + + If a callable is not specified, default progress + information is output to sys.stderr. + """ + + def __init__(self, *args, **kwargs): + self.nodist = kwargs.pop("nodist", False) + self.nopip = kwargs.pop("nopip", False) + self.progress = kwargs.pop("progress", None) + self.verbose = kwargs.pop("verbose", False) + super().__init__(*args, **kwargs) + + def post_setup(self, context): + """ + Set up any packages which need to be pre-installed into the + virtual environment being created. + + :param context: The information for the virtual environment + creation request being processed. + """ + os.environ["VIRTUAL_ENV"] = context.env_dir + if not self.nodist: + self.install_setuptools(context) + # Can't install pip without setuptools + if not self.nopip and not self.nodist: + self.install_pip(context) + + def reader(self, stream, context): + """ + Read lines from a subprocess' output stream and either pass to a progress + callable (if specified) or write progress information to sys.stderr. + """ + progress = self.progress + while True: + s = stream.readline() + if not s: + break + if progress is not None: + progress(s, context) + else: + if not self.verbose: + sys.stderr.write(".") + else: + sys.stderr.write(s.decode("utf-8")) + sys.stderr.flush() + stream.close() + + def install_script(self, context, name, url): + _, _, path, _, _, _ = urlparse(url) + fn = os.path.split(path)[-1] + binpath = context.bin_path + distpath = os.path.join(binpath, fn) + # Download script into the virtual environment's binaries folder + urlretrieve(url, distpath) + progress = self.progress + if self.verbose: + term = "\n" + else: + term = "" + if progress is not None: + progress("Installing %s ...%s" % (name, term), "main") + else: + sys.stderr.write("Installing %s ...%s" % (name, term)) + sys.stderr.flush() + # Install in the virtual environment + args = [context.env_exe, fn] + p = Popen(args, stdout=PIPE, stderr=PIPE, cwd=binpath) + t1 = Thread(target=self.reader, args=(p.stdout, "stdout")) + t1.start() + t2 = Thread(target=self.reader, args=(p.stderr, "stderr")) + t2.start() + p.wait() + t1.join() + t2.join() + if progress is not None: + progress("done.", "main") + else: + sys.stderr.write("done.\n") + # Clean up - no longer needed + os.unlink(distpath) + + def install_setuptools(self, context): + """ + Install setuptools in the virtual environment. + + :param context: The information for the virtual environment + creation request being processed. + """ + url = "https://bootstrap.pypa.io/ez_setup.py" + self.install_script(context, "setuptools", url) + # clear up the setuptools archive which gets downloaded + pred = lambda o: o.startswith("setuptools-") and o.endswith(".tar.gz") + files = filter(pred, os.listdir(context.bin_path)) + for f in files: + f = os.path.join(context.bin_path, f) + os.unlink(f) + + def install_pip(self, context): + """ + Install pip in the virtual environment. + + :param context: The information for the virtual environment + creation request being processed. + """ + url = "https://bootstrap.pypa.io/get-pip.py" + self.install_script(context, "pip", url) + + +# function to get all the folders benchmark folder +def get_all_folders() -> dict: + folders = dict() + for f in os.scandir("benchmarks"): + path = Path("benchmarks") / f.name + if f.name != "TFT": + folders[f.name] = str(path.resolve()) + return folders + + +# function to get all the files under the model folder +def get_all_files(folder_path) -> (str, str): + yaml_path = str(Path(f"{folder_path}") / "*.yaml") + req_path = str(Path(f"{folder_path}") / "*.txt") + return glob.glob(yaml_path)[0], glob.glob(req_path)[0] + + +# function to retrieve all the results +def get_all_results(folders) -> dict: + results = dict() + for fn in folders: + exp = R.get_exp(experiment_name=fn, create=False) + recorders = exp.list_recorders() + recorder = R.get_recorder(recorder_id=next(iter(recorders)), experiment_name=fn) + metrics = recorder.list_metrics() + results[fn] = {key: metrics[key] for key in metrics if "with_cost" in key} + return results + + +# function to generate and save markdown tables +def gen_and_save_md_table(results): + table = "| Model Name | Annualized Return | Information Ratio | Max Drawdown |\n" + table += "|---|---|---|---|\n" + for fn in results: + ar = metrics[fn]["excess_return_with_cost.annualized_return"] + ir = metrics[fn]["excess_return_with_cost.information_ratio"] + md = metrics[fn]["excess_return_with_cost.max_drawdown"] + table += f"| {fn} | {ar:9.5f} | {ir:9.5f} | {md:9.5f} |\n" + pprint(table) + with open("table.md", "w") as f: + f.write(table) + return table + + +# function to run the all the models +def run(): + # get all folders + folders = get_all_folders() + # set up + compatible = True + if sys.version_info < (3, 3): + compatible = False + elif not hasattr(sys, "base_prefix"): + compatible = False + if not compatible: + raise ValueError("This script is only for use with " "Python 3.3 or later") + if os.name == "nt": + use_symlinks = False + else: + use_symlinks = True + builder = ExtendedEnvBuilder( + system_site_packages=False, + clear=False, + symlinks=use_symlinks, + upgrade=False, + nodist=False, + nopip=False, + verbose=False, + ) + for fn in folders: + # create env + temp_dir = tempfile.mkdtemp() + env_path = Path(temp_dir).absolute() + sys.stderr.write(f"Creating Virtual Environment with path: {env_path}...\n") + builder.create(str(env_path)) + python_path = env_path / "bin" / "python" # TODO: FIX ME! + sys.stderr.write("\n") + # get all files + sys.stderr.write("Retrieving files...\n") + yaml_path, req_path = get_all_files(folders[fn]) + sys.stderr.write("\n") + # install requirements.txt + sys.stderr.write("Installing requirements.txt...\n") + os.system(f"{python_path} -m pip install -r {req_path}") + sys.stderr.write("\n") + # install qlib + sys.stderr.write("Installing qlib...\n") + os.system(f"{python_path} -m pip install --upgrade cython") # TODO: FIX ME! + os.system(f"{python_path} -m pip install -e git+https://github.com/you-n-g/qlib#egg=pyqlib") # TODO: FIX ME! + sys.stderr.write("\n") + # run workflow_by_config + sys.stderr.write(f"Running the model: {fn}...\n") + os.system(f"{python_path} {env_path / 'src/pyqlib/qlib/workflow/cli.py'} {yaml_path} {fn}") + sys.stderr.write("\n") + # remove env + sys.stderr.write(f"Deleting the environment: {env_path}...\n") + shutil.rmtree(env_path) + # getting all results + sys.stderr.write(f"Retrieving results...\n") + results = get_all_results(folders) + # generating md table + sys.stderr.write(f"Generating markdown table...\n") + gen_and_save_md_table(results) + + +if __name__ == "__main__": + rc = 1 + try: + run() # run all the model + rc = 0 + except Exception as e: + print("Error: %s" % e, file=sys.stderr) + sys.exit(rc) diff --git a/examples/workflow_by_code.ipynb b/examples/workflow_by_code.ipynb index f07c4f19e..246d2b0c2 100644 --- a/examples/workflow_by_code.ipynb +++ b/examples/workflow_by_code.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (c) Microsoft Corporation.\n", + "# Licensed under the MIT License." + ] + }, { "cell_type": "code", "execution_count": null, @@ -13,7 +23,7 @@ "import pandas as pd\n", "from qlib.config import REG_CN\n", "from qlib.contrib.model.gbdt import LGBModel\n", - "from qlib.contrib.estimator.handler import Alpha158\n", + "from qlib.contrib.data.handler import Alpha158\n", "from qlib.contrib.strategy.strategy import TopkDropoutStrategy\n", "from qlib.contrib.evaluate import (\n", " backtest as normal_backtest,\n", diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py new file mode 100644 index 000000000..222eb126f --- /dev/null +++ b/examples/workflow_by_code_gats.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +import pandas as pd +from qlib.config import REG_CN +from qlib.contrib.model.pytorch_gats import GAT +from qlib.contrib.data.handler import ALPHA360_Denoise +from qlib.contrib.strategy.strategy import TopkDropoutStrategy +from qlib.contrib.evaluate import ( + backtest as normal_backtest, + risk_analysis, +) +from qlib.utils import exists_qlib_data + +# from qlib.model.learner import train_model +from qlib.utils import init_instance_by_config + +import pickle + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data_cn(target_dir=provider_uri) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + MARKET = "csi300" + BENCHMARK = "SH000300" + + ################################### + # train model + ################################### + DATA_HANDLER_CONFIG = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": MARKET, + } + + TRAINER_CONFIG = { + "train_start_time": "2008-01-01", + "train_end_time": "2014-12-31", + "validate_start_time": "2015-01-01", + "validate_end_time": "2016-12-31", + "test_start_time": "2017-01-01", + "test_end_time": "2020-08-01", + } + + task = { + "model": { + "class": "GAT", + "module_path": "qlib.contrib.model.pytorch_gats", + "kwargs": { + "d_feat": 6, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.0, + "n_epochs": 200, + "lr": 1e-3, + "early_stop": 20, + "batch_size": 800, + "metric": "IC", + "loss": "mse", + "base_model": "GRU", + "seed": 0, + "GPU": 0, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "ALPHA360_Denoise", + "module_path": "qlib.contrib.data.handler", + "kwargs": DATA_HANDLER_CONFIG, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + # You shoud record the data in specific sequence + # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], + } + + # model = train_model(task) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + pred_score = model.predict(dataset) + + # save pred_score to file + pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() + pred_score_path.parent.mkdir(exist_ok=True, parents=True) + pred_score.to_pickle(pred_score_path) + + ################################### + # backtest + ################################### + STRATEGY_CONFIG = { + "topk": 50, + "n_drop": 5, + } + BACKTEST_CONFIG = { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": BENCHMARK, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + } + + # use default strategy + # custom Strategy, refer to: TODO: Strategy API url + strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) + report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) + + ################################### + # analyze + # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb + ################################### + analysis = dict() + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis( + report_normal["return"] - report_normal["bench"] - report_normal["cost"] + ) + analysis_df = pd.concat(analysis) # type: pd.DataFrame + print(analysis_df) diff --git a/examples/workflow_by_code_gru.py b/examples/workflow_by_code_gru.py new file mode 100644 index 000000000..e55f0ae45 --- /dev/null +++ b/examples/workflow_by_code_gru.py @@ -0,0 +1,144 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +import pandas as pd +from qlib.config import REG_CN +from qlib.contrib.model.pytorch_gru import GRU +from qlib.contrib.data.handler import ALPHA360_Denoise +from qlib.contrib.strategy.strategy import TopkDropoutStrategy +from qlib.contrib.evaluate import ( + backtest as normal_backtest, + risk_analysis, +) +from qlib.utils import exists_qlib_data + +# from qlib.model.learner import train_model +from qlib.utils import init_instance_by_config + +import pickle + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data_cn(target_dir=provider_uri) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + MARKET = "csi300" + BENCHMARK = "SH000300" + + ################################### + # train model + ################################### + DATA_HANDLER_CONFIG = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": MARKET, + } + + TRAINER_CONFIG = { + "train_start_time": "2008-01-01", + "train_end_time": "2014-12-31", + "validate_start_time": "2015-01-01", + "validate_end_time": "2016-12-31", + "test_start_time": "2017-01-01", + "test_end_time": "2020-08-01", + } + + task = { + "model": { + "class": "GRU", + "module_path": "qlib.contrib.model.pytorch_gru", + "kwargs": { + "d_feat": 6, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.0, + "n_epochs": 200, + "lr": 1e-3, + "early_stop": 20, + "batch_size": 800, + "metric": "IC", + "loss": "mse", + "seed": 0, + "GPU": 0, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "ALPHA360_Denoise", + "module_path": "qlib.contrib.data.handler", + "kwargs": DATA_HANDLER_CONFIG, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + # You shoud record the data in specific sequence + # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], + } + + # model = train_model(task) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + pred_score = model.predict(dataset) + + # save pred_score to file + pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() + pred_score_path.parent.mkdir(exist_ok=True, parents=True) + pred_score.to_pickle(pred_score_path) + + ################################### + # backtest + ################################### + STRATEGY_CONFIG = { + "topk": 50, + "n_drop": 5, + } + BACKTEST_CONFIG = { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": BENCHMARK, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + } + + # use default strategy + # custom Strategy, refer to: TODO: Strategy API url + strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) + report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) + + ################################### + # analyze + # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb + ################################### + analysis = dict() + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis( + report_normal["return"] - report_normal["bench"] - report_normal["cost"] + ) + analysis_df = pd.concat(analysis) # type: pd.DataFrame + print(analysis_df) diff --git a/examples/workflow_by_code_lstm.py b/examples/workflow_by_code_lstm.py new file mode 100644 index 000000000..1815d2fec --- /dev/null +++ b/examples/workflow_by_code_lstm.py @@ -0,0 +1,144 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +import pandas as pd +from qlib.config import REG_CN +from qlib.contrib.model.pytorch_lstm import LSTM +from qlib.contrib.data.handler import ALPHA360_Denoise +from qlib.contrib.strategy.strategy import TopkDropoutStrategy +from qlib.contrib.evaluate import ( + backtest as normal_backtest, + risk_analysis, +) +from qlib.utils import exists_qlib_data + +# from qlib.model.learner import train_model +from qlib.utils import init_instance_by_config + +import pickle + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data_cn(target_dir=provider_uri) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + MARKET = "csi300" + BENCHMARK = "SH000300" + + ################################### + # train model + ################################### + DATA_HANDLER_CONFIG = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": MARKET, + } + + TRAINER_CONFIG = { + "train_start_time": "2008-01-01", + "train_end_time": "2014-12-31", + "validate_start_time": "2015-01-01", + "validate_end_time": "2016-12-31", + "test_start_time": "2017-01-01", + "test_end_time": "2020-08-01", + } + + task = { + "model": { + "class": "LSTM", + "module_path": "qlib.contrib.model.pytorch_lstm", + "kwargs": { + "d_feat": 6, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0.0, + "n_epochs": 200, + "lr": 1e-3, + "early_stop": 20, + "batch_size": 800, + "metric": "IC", + "loss": "mse", + "seed": 0, + "GPU": 0, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "ALPHA360_Denoise", + "module_path": "qlib.contrib.data.handler", + "kwargs": DATA_HANDLER_CONFIG, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + } + # You shoud record the data in specific sequence + # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], + } + + # model = train_model(task) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + pred_score = model.predict(dataset) + + # save pred_score to file + pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser() + pred_score_path.parent.mkdir(exist_ok=True, parents=True) + pred_score.to_pickle(pred_score_path) + + ################################### + # backtest + ################################### + STRATEGY_CONFIG = { + "topk": 50, + "n_drop": 5, + } + BACKTEST_CONFIG = { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": BENCHMARK, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + } + + # use default strategy + # custom Strategy, refer to: TODO: Strategy API url + strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) + report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG) + + ################################### + # analyze + # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb + ################################### + analysis = dict() + analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) + analysis["excess_return_with_cost"] = risk_analysis( + report_normal["return"] - report_normal["bench"] - report_normal["cost"] + ) + analysis_df = pd.concat(analysis) # type: pd.DataFrame + print(analysis_df) diff --git a/examples/workflow_config.yaml b/examples/workflow_config.yaml deleted file mode 100644 index 212558044..000000000 --- a/examples/workflow_config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -provider_uri: "~/.qlib/qlib_data/cn_data" -market: &market csi300 -benchmark: &benchmark SH000300 -data_handler_config: &data_handler_config - start_time: 2008-01-01 - end_time: 2020-08-01 - fit_start_time: 2008-01-01 - fit_end_time: 2014-12-31 - instruments: *market -port_analysis_config: &port_analysis_config - strategy: - class: TopkDropoutStrategy - module_path: qlib.contrib.strategy.strategy - kwargs: - topk: 50 - n_drop: 5 - backtest: - verbose: False - limit_threshold: 0.095 - account: 100000000 - benchmark: *benchmark - deal_price: close - open_cost: 0.0005 - close_cost: 0.0015 - min_cost: 5 -task: - model: - class: LGBModel - module_path: qlib.contrib.model.gbdt - kwargs: - loss: mse - colsample_bytree: 0.8879 - learning_rate: 0.0421 - subsample: 0.8789 - lambda_l1: 205.6999 - lambda_l2: 580.9768 - max_depth: 8 - num_leaves: 210 - num_threads: 20 - dataset: - class: DatasetH - module_path: qlib.data.dataset - kwargs: - handler: - class: Alpha158 - module_path: qlib.contrib.data.handler - kwargs: *data_handler_config - segments: - train: [2008-01-01, 2014-12-31] - valid: [2015-01-01, 2016-12-31] - test: [2017-01-01, 2020-08-01] - record: - - class: SignalRecord - module_path: qlib.workflow.record_temp - kwargs: {} - - class: PortAnaRecord - module_path: qlib.workflow.record_temp - kwargs: - config: *port_analysis_config \ No newline at end of file diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index b3608464d..d1dbe1777 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -291,7 +291,9 @@ class DataHandlerLP(DataHandler): init_instance_by_config( proc, None if (isinstance(proc, dict) and "module_path" in proc) else processor_module, - accept_types=processor_module.Processor)) + accept_types=processor_module.Processor, + ) + ) self.process_type = process_type super().__init__(instruments, start_time, end_time, data_loader, **kwargs) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 575ed24aa..f32cceba3 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -659,7 +659,7 @@ def flatten_dict(d, parent_key="", sep="."): items = [] for k, v in d.items(): new_key = parent_key + sep + k if parent_key else k - if isinstance(v, collections.MutableMapping): + if isinstance(v, collections.abc.MutableMapping): items.extend(flatten_dict(v, new_key, sep=sep).items()) else: items.append((new_key, v)) diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index b257d9237..307a466a1 100644 --- a/qlib/workflow/cli.py +++ b/qlib/workflow/cli.py @@ -13,13 +13,15 @@ from qlib.utils import init_instance_by_config, flatten_dict from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord + # worflow handler function def workflow(config_path, experiment_name="workflow"): with open(config_path) as fp: config = yaml.load(fp, Loader=yaml.Loader) provider_uri = config.get("provider_uri") - qlib.init(provider_uri=provider_uri, region=REG_CN) + region = config.get("region") + qlib.init(provider_uri=provider_uri, region=region) # model initiaiton model = init_instance_by_config(config.get("task")["model"]) diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 71f13381f..97f97105f 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -159,6 +159,36 @@ class Recorder: """ raise NotImplementedError(f"Please implement the `list_artifacts` method.") + def list_metrics(self): + """ + List all the metrics of a recorder. + + Returns + ------- + A dictionary of metrics that being stored. + """ + raise NotImplementedError(f"Please implement the `list_metrics` method.") + + def list_params(self): + """ + List all the params of a recorder. + + Returns + ------- + A dictionary of params that being stored. + """ + raise NotImplementedError(f"Please implement the `list_params` method.") + + def list_tags(self): + """ + List all the tags of a recorder. + + Returns + ------- + A dictionary of tags that being stored. + """ + raise NotImplementedError(f"Please implement the `list_tags` method.") + class MLflowRecorder(Recorder): """ @@ -239,7 +269,7 @@ class MLflowRecorder(Recorder): def log_metrics(self, step=None, **kwargs): for name, data in kwargs.items(): - self.client.log_metric(self.id, name, data) + self.client.log_metric(self.id, name, data, step=step) def set_tags(self, **kwargs): for name, data in kwargs.items(): @@ -261,3 +291,15 @@ class MLflowRecorder(Recorder): assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly." artifacts = self.client.list_artifacts(self.id, artifact_path) return artifacts + + def list_metrics(self): + run = self.client.get_run(self.id) + return run.data.metrics + + def list_params(self): + run = self.client.get_run(self.id) + return run.data.params + + def list_tags(self): + run = self.client.get_run(self.id) + return run.data.tags diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index 04c399342..16242189a 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import sys +import shutil import unittest from pathlib import Path @@ -10,7 +11,7 @@ import pandas as pd from scipy.stats import pearsonr import qlib -from qlib.config import REG_CN +from qlib.config import REG_CN, C from qlib.utils import drop_nan_by_y_index from qlib.contrib.model.gbdt import LGBModel from qlib.contrib.data.handler import Alpha158 @@ -19,51 +20,78 @@ from qlib.contrib.evaluate import ( backtest as normal_backtest, risk_analysis, ) -from qlib.utils import exists_qlib_data +from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict +from qlib.workflow import R +from qlib.workflow.record_temp import SignalRecord, PortAnaRecord -DATA_HANDLER_CONFIG = { - "dropna_label": True, - "start_date": "2008-01-01", - "end_date": "2020-08-01", - "market": "CSI300", +market = "csi300" +benchmark = "SH000300" + +################################### +# train model +################################### +data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": market, } -MODEL_CONFIG = { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, +task = { + "model": { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + "kwargs": { + "loss": "mse", + "colsample_bytree": 0.8879, + "learning_rate": 0.0421, + "subsample": 0.8789, + "lambda_l1": 205.6999, + "lambda_l2": 580.9768, + "max_depth": 8, + "num_leaves": 210, + "num_threads": 20, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "Alpha158", + "module_path": "qlib.contrib.data.handler", + "kwargs": data_handler_config, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + }, } -TRAINER_CONFIG = { - "train_start_date": "2008-01-01", - "train_end_date": "2014-12-31", - "validate_start_date": "2015-01-01", - "validate_end_date": "2016-12-31", - "test_start_date": "2017-01-01", - "test_end_date": "2020-08-01", -} - -STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, -} - -BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": "SH000300", - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, +port_analysis_config = { + "strategy": { + "class": "TopkDropoutStrategy", + "module_path": "qlib.contrib.strategy.strategy", + "kwargs": { + "topk": 50, + "n_drop": 5, + }, + }, + "backtest": { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": benchmark, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + }, } @@ -78,34 +106,32 @@ def train(): performance: dict model performance """ - # get data - x_train, y_train, x_validate, y_validate, x_test, y_test = Alpha158(**DATA_HANDLER_CONFIG).get_split_data( - **TRAINER_CONFIG - ) - # train - model = LGBModel(**MODEL_CONFIG) - model.fit(x_train, y_train, x_validate, y_validate) - _pred = model.predict(x_test) - _pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns) - pred_score = pd.DataFrame(index=_pred.index) - pred_score["score"] = _pred.iloc(axis=1)[0] + # model initiaiton + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) - # get performance - try: - model_score = model.score(x_test, y_test) - except NotImplementedError: - model_score = None - # Remove rows from x, y and w, which contain Nan in any columns in y_test. - x_test, y_test, __ = drop_nan_by_y_index(x_test, y_test) - pred_test = model.predict(x_test) - model_pearsonr = pearsonr(np.ravel(pred_test), np.ravel(y_test.values))[0] + # start exp + with R.start(experiment_name="workflow"): + R.log_params(**flatten_dict(task)) + model.fit(dataset) - return pred_score, {"model_score": model_score, "model_pearsonr": model_pearsonr} + # prediction + recorder = R.get_recorder() + rid = recorder.id + sr = SignalRecord(model, dataset, recorder) + sr.generate() + pred_score = sr.load() + + y_test = dataset.prepare("test", col_set="label") + pred_score, y_test, __ = drop_nan_by_y_index(pred_score, y_test) + model_pearsonr = pearsonr(np.ravel(pred_score.values), np.ravel(y_test.values))[0] + + return pred_score, {"model_pearsonr": model_pearsonr}, rid -def backtest(pred): - """backtest +def backtest_analysis(pred, rid): + """backtest and analysis Parameters ---------- @@ -114,23 +140,14 @@ def backtest(pred): Returns ------- - report_normal: pandas.DataFrame - - positions_normal: dict + analysis result : pandas.DataFrame """ - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - _report_normal, _positions_normal = normal_backtest(pred, strategy=strategy, **BACKTEST_CONFIG) - return _report_normal, _positions_normal - - -def analyze(report_normal): - _analysis = dict() - _analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - _analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(_analysis) # type: pd.DataFrame + recorder = R.get_recorder(experiment_name="workflow", recorder_id=rid) + # backtest + par = PortAnaRecord(recorder, port_analysis_config) + par.generate() + analysis_df = par.load("port_analysis.pkl") print(analysis_df) return analysis_df @@ -139,6 +156,7 @@ class TestAllFlow(unittest.TestCase): PRED_SCORE = None REPORT_NORMAL = None POSITIONS = None + RID = None @classmethod def setUpClass(cls) -> None: @@ -154,13 +172,16 @@ class TestAllFlow(unittest.TestCase): ) qlib.init(provider_uri=provider_uri, region=REG_CN) + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(str(Path(C["exp_manager"]["kwargs"]["uri"].strip("file:")).resolve())) + def test_0_train(self): - TestAllFlow.PRED_SCORE, model_pearsonr = train() + TestAllFlow.PRED_SCORE, model_pearsonr, TestAllFlow.RID = train() self.assertGreaterEqual(model_pearsonr["model_pearsonr"], 0, "train failed") def test_1_backtest(self): - TestAllFlow.REPORT_NORMAL, TestAllFlow.POSITIONS = backtest(TestAllFlow.PRED_SCORE) - analyze_df = analyze(TestAllFlow.REPORT_NORMAL) + analyze_df = backtest_analysis(TestAllFlow.PRED_SCORE, TestAllFlow.RID) self.assertGreaterEqual( analyze_df.loc(axis=0)["excess_return_with_cost", "annualized_return"].values[0], 0.10,