diff --git a/examples/workflow_by_code_lgb_risk_demo.py b/examples/workflow_by_code_lgb_risk_demo.py new file mode 100644 index 000000000..b250993d3 --- /dev/null +++ b/examples/workflow_by_code_lgb_risk_demo.py @@ -0,0 +1,179 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import sys +from pathlib import Path + +import qlib +from qlib.config import REG_CN +from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict +from qlib.workflow import R +from qlib.workflow.record_temp import SignalRecord, PortAnaRecord +from qlib.data.dataset.handler import DataHandlerLP + +import seaborn as sns +import matplotlib.pyplot as plt +import math +import pandas as pd +from scipy.stats.stats import pearsonr +import numpy as np + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) + from get_data import GetData + + GetData().qlib_data(target_dir=provider_uri, region=REG_CN) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + market = "csi300" + benchmark = "SH000300" + + ################################### + # train model + ################################### + data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": market, + "infer_processors": [ + {"class": "ProcessInf", "kwargs": {}}, + {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}}, + {"class": "Fillna", "kwargs": {}}, + ], + "learn_processors": [{ + "class": "DropnaLabel", }, + ], + "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"]) # the period for risk prediction is 5 days + } + + task = { + "model": { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + "kwargs": { + "loss": "mse", + "colsample_bytree": 0.8999, + "learning_rate": 0.02, + "subsample": 0.7, + "lambda_l1": 11.9668, + "lambda_l2": 339.1301, + "max_depth": 16, + "num_leaves": 31, + "num_threads": 20, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "Alpha360", + "module_path": "qlib.contrib.data.handler", + "kwargs": data_handler_config, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + }, + } + + port_analysis_config = { + "strategy": { + "class": "TopkDropoutStrategy", + "module_path": "qlib.contrib.strategy.strategy", + "kwargs": { + "topk": 50, + "n_drop": 5, + }, + }, + "backtest": { + "verbose": False, + "limit_threshold": 0.095, + "account": 100000000, + "benchmark": benchmark, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + "return_order": True, + }, + } + + # model initiaiton + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + + # NOTE: This line is optional + # It demonstrates that the dataset can be used standalone. + example_df = dataset.prepare("train") + print(example_df.head()) + + def heatmap(actual_risk, predicted_risk, step=0.02): + """ + plot the precision heatmap as a visualized evaluation for risk predition + :param actual_risk: the LABEL0 of test samples + :param predicted_risk: the predicted results of test samples + :param step: the internal size of risk values on axis + :return: + """ + num_step = math.ceil(-predicted_risk.min() / step) + matrix = np.zeros((num_step, num_step), dtype=np.float) + for pred_thresh in range(num_step): + for act_thresh in range(num_step): + actual_positive = actual_risk < -act_thresh*step + predicted_alarm = predicted_risk < -pred_thresh*step + num_alarm = predicted_alarm.sum() + num_tp = (actual_positive & predicted_alarm).sum() + matrix[pred_thresh, act_thresh] = num_tp / num_alarm + axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)] + return matrix, axis_labels + + # start exp + with R.start(experiment_name="workflow"): + R.log_params(**flatten_dict(task)) + model.fit(dataset) + + # prediction + actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0'] + pred = model.predict(dataset) + + result_df = pd.concat((actual_risk, pred), axis=1) + result_df.columns = ['Actual Risk', 'Predicted Risk'] + result_df.dropna(inplace=True) + actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1] + corr = pearsonr(actual_risk, predicted_risk)[0] + print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr)) + + # visualized results + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + sns.histplot(actual_risk, ax=axes[0, 0]) + axes[0, 0].set_title('Market: {} Actual Risk'.format(market)) + axes[0, 0].grid() + + sns.histplot(predicted_risk, ax=axes[0, 1]) + axes[0, 1].set_title('Feature: {} Predicted Risk'.format(task['dataset']['kwargs']['handler']['class'])) + axes[0, 1].grid() + + sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20) + axes[1, 0].set_title('Market: {} Feature: {} Corr: {:.5f}'.format( + market, task['dataset']['kwargs']['handler']['class'], corr)) + axes[1, 0].grid() + + matrix, ax_labels = heatmap(actual_risk, predicted_risk) + sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1], + ) + axes[1, 1].set_xlabel('Predicted Alarm Threshold') + axes[1, 1].set_ylabel('Actual Positive Threshold') + axes[1, 1].set_title('Risk Prediction Precision Heatmap') + plt.show()