From c22bd73f67ec445bc4f542ee9004667663c38f0e Mon Sep 17 00:00:00 2001 From: Jactus Date: Fri, 20 Nov 2020 17:54:56 +0800 Subject: [PATCH] Update CI --- .github/workflows/test.yml | 9 +- examples/workflow_by_code_gats.py | 2 +- qlib/utils/__init__.py | 2 +- tests/test_all_pipeline.py | 183 +++++++++++++++++------------- 4 files changed, 109 insertions(+), 87 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e196c124b..d1e01e46b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -50,9 +50,10 @@ jobs: cd tests pytest . --durations=0 - - name: Test data downloads and examples + - name: Test data downloads run: | python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - # cd examples - # estimator -c estimator/estimator_config.yaml - # jupyter nbconvert --execute estimator/analyze_from_estimator.ipynb --to html \ No newline at end of file + + - name: Test workflow by config + run: | + workflow_by_config examples/benchmarks/GBDT/workflow_config_gbdt.yaml \ No newline at end of file diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py index 06845d448..222eb126f 100644 --- a/examples/workflow_by_code_gats.py +++ b/examples/workflow_by_code_gats.py @@ -72,7 +72,7 @@ if __name__ == "__main__": "batch_size": 800, "metric": "IC", "loss": "mse", - "base_model":"GRU", + "base_model": "GRU", "seed": 0, "GPU": 0, }, diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 575ed24aa..f32cceba3 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -659,7 +659,7 @@ def flatten_dict(d, parent_key="", sep="."): items = [] for k, v in d.items(): new_key = parent_key + sep + k if parent_key else k - if isinstance(v, collections.MutableMapping): + if isinstance(v, collections.abc.MutableMapping): items.extend(flatten_dict(v, new_key, sep=sep).items()) else: items.append((new_key, v)) diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index 04c399342..16242189a 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import sys +import shutil import unittest from pathlib import Path @@ -10,7 +11,7 @@ import pandas as pd from scipy.stats import pearsonr import qlib -from qlib.config import REG_CN +from qlib.config import REG_CN, C from qlib.utils import drop_nan_by_y_index from qlib.contrib.model.gbdt import LGBModel from qlib.contrib.data.handler import Alpha158 @@ -19,51 +20,78 @@ from qlib.contrib.evaluate import ( backtest as normal_backtest, risk_analysis, ) -from qlib.utils import exists_qlib_data +from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict +from qlib.workflow import R +from qlib.workflow.record_temp import SignalRecord, PortAnaRecord -DATA_HANDLER_CONFIG = { - "dropna_label": True, - "start_date": "2008-01-01", - "end_date": "2020-08-01", - "market": "CSI300", +market = "csi300" +benchmark = "SH000300" + +################################### +# train model +################################### +data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": market, } -MODEL_CONFIG = { - "loss": "mse", - "colsample_bytree": 0.8879, - "learning_rate": 0.0421, - "subsample": 0.8789, - "lambda_l1": 205.6999, - "lambda_l2": 580.9768, - "max_depth": 8, - "num_leaves": 210, - "num_threads": 20, +task = { + "model": { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + "kwargs": { + "loss": "mse", + "colsample_bytree": 0.8879, + "learning_rate": 0.0421, + "subsample": 0.8789, + "lambda_l1": 205.6999, + "lambda_l2": 580.9768, + "max_depth": 8, + "num_leaves": 210, + "num_threads": 20, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "Alpha158", + "module_path": "qlib.contrib.data.handler", + "kwargs": data_handler_config, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + }, } -TRAINER_CONFIG = { - "train_start_date": "2008-01-01", - "train_end_date": "2014-12-31", - "validate_start_date": "2015-01-01", - "validate_end_date": "2016-12-31", - "test_start_date": "2017-01-01", - "test_end_date": "2020-08-01", -} - -STRATEGY_CONFIG = { - "topk": 50, - "n_drop": 5, -} - -BACKTEST_CONFIG = { - "verbose": False, - "limit_threshold": 0.095, - "account": 100000000, - "benchmark": "SH000300", - "deal_price": "close", - "open_cost": 0.0005, - "close_cost": 0.0015, - "min_cost": 5, +port_analysis_config = { + "strategy": { + "class": "TopkDropoutStrategy", + "module_path": "qlib.contrib.strategy.strategy", + "kwargs": { + "topk": 50, + "n_drop": 5, + }, + }, + "backtest": { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": benchmark, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + }, } @@ -78,34 +106,32 @@ def train(): performance: dict model performance """ - # get data - x_train, y_train, x_validate, y_validate, x_test, y_test = Alpha158(**DATA_HANDLER_CONFIG).get_split_data( - **TRAINER_CONFIG - ) - # train - model = LGBModel(**MODEL_CONFIG) - model.fit(x_train, y_train, x_validate, y_validate) - _pred = model.predict(x_test) - _pred = pd.DataFrame(_pred, index=x_test.index, columns=y_test.columns) - pred_score = pd.DataFrame(index=_pred.index) - pred_score["score"] = _pred.iloc(axis=1)[0] + # model initiaiton + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) - # get performance - try: - model_score = model.score(x_test, y_test) - except NotImplementedError: - model_score = None - # Remove rows from x, y and w, which contain Nan in any columns in y_test. - x_test, y_test, __ = drop_nan_by_y_index(x_test, y_test) - pred_test = model.predict(x_test) - model_pearsonr = pearsonr(np.ravel(pred_test), np.ravel(y_test.values))[0] + # start exp + with R.start(experiment_name="workflow"): + R.log_params(**flatten_dict(task)) + model.fit(dataset) - return pred_score, {"model_score": model_score, "model_pearsonr": model_pearsonr} + # prediction + recorder = R.get_recorder() + rid = recorder.id + sr = SignalRecord(model, dataset, recorder) + sr.generate() + pred_score = sr.load() + + y_test = dataset.prepare("test", col_set="label") + pred_score, y_test, __ = drop_nan_by_y_index(pred_score, y_test) + model_pearsonr = pearsonr(np.ravel(pred_score.values), np.ravel(y_test.values))[0] + + return pred_score, {"model_pearsonr": model_pearsonr}, rid -def backtest(pred): - """backtest +def backtest_analysis(pred, rid): + """backtest and analysis Parameters ---------- @@ -114,23 +140,14 @@ def backtest(pred): Returns ------- - report_normal: pandas.DataFrame - - positions_normal: dict + analysis result : pandas.DataFrame """ - strategy = TopkDropoutStrategy(**STRATEGY_CONFIG) - _report_normal, _positions_normal = normal_backtest(pred, strategy=strategy, **BACKTEST_CONFIG) - return _report_normal, _positions_normal - - -def analyze(report_normal): - _analysis = dict() - _analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"]) - _analysis["excess_return_with_cost"] = risk_analysis( - report_normal["return"] - report_normal["bench"] - report_normal["cost"] - ) - analysis_df = pd.concat(_analysis) # type: pd.DataFrame + recorder = R.get_recorder(experiment_name="workflow", recorder_id=rid) + # backtest + par = PortAnaRecord(recorder, port_analysis_config) + par.generate() + analysis_df = par.load("port_analysis.pkl") print(analysis_df) return analysis_df @@ -139,6 +156,7 @@ class TestAllFlow(unittest.TestCase): PRED_SCORE = None REPORT_NORMAL = None POSITIONS = None + RID = None @classmethod def setUpClass(cls) -> None: @@ -154,13 +172,16 @@ class TestAllFlow(unittest.TestCase): ) qlib.init(provider_uri=provider_uri, region=REG_CN) + @classmethod + def tearDownClass(cls) -> None: + shutil.rmtree(str(Path(C["exp_manager"]["kwargs"]["uri"].strip("file:")).resolve())) + def test_0_train(self): - TestAllFlow.PRED_SCORE, model_pearsonr = train() + TestAllFlow.PRED_SCORE, model_pearsonr, TestAllFlow.RID = train() self.assertGreaterEqual(model_pearsonr["model_pearsonr"], 0, "train failed") def test_1_backtest(self): - TestAllFlow.REPORT_NORMAL, TestAllFlow.POSITIONS = backtest(TestAllFlow.PRED_SCORE) - analyze_df = analyze(TestAllFlow.REPORT_NORMAL) + analyze_df = backtest_analysis(TestAllFlow.PRED_SCORE, TestAllFlow.RID) self.assertGreaterEqual( analyze_df.loc(axis=0)["excess_return_with_cost", "annualized_return"].values[0], 0.10,