# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""LightGBM risk-prediction demo driven by qlib's workflow-by-code API.

Provenance: this is the content of ``examples/workflow_by_code_lgb_risk_demo.py``
as carried by git patch 70575e8a1ce4f63cced98a55de1914009012bdcd
("Delete workflow_by_code_lgb_risk_demo.py", Meng Dong, 2021-02-24), which
removed that file.

When run as a script it:

1. initialises qlib against ``~/.qlib/qlib_data/cn_data`` (calling
   ``GetData().qlib_data`` first when that data is not found),
2. fits the ``LGBModel`` described by ``TASK`` on an ``Alpha360`` dataset whose
   label is ``Ref(Min($low, 5), -4)/$close - 1``,
3. prints the Pearson correlation between the label and the prediction on the
   test segment, and
4. shows a 2x2 figure: two histograms, a scatter plot and a precision heatmap.
"""

import math
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import qlib
import seaborn as sns
from qlib.config import REG_CN
from qlib.data.dataset.handler import DataHandlerLP
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord

# ``scipy.stats.stats`` is a deprecated private namespace; the public location
# of ``pearsonr`` is ``scipy.stats``.
from scipy.stats import pearsonr

MARKET = "csi300"
BENCHMARK = "SH000300"

###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": MARKET,
    "infer_processors": [
        {"class": "ProcessInf", "kwargs": {}},
        {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
        {"class": "Fillna", "kwargs": {}},
    ],
    "learn_processors": [
        {"class": "DropnaLabel"},
    ],
    # the period for risk prediction is 5 days
    "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"]),
}

TASK = {
    "model": {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8999,
            "learning_rate": 0.02,
            "subsample": 0.7,
            "lambda_l1": 11.9668,
            "lambda_l2": 339.1301,
            "max_depth": 16,
            "num_leaves": 31,
            "num_threads": 20,
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha360",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": DATA_HANDLER_CONFIG,
            },
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    },
}

# NOTE: this portfolio/backtest configuration is defined by the demo but is not
# consumed anywhere in this script.
PORT_ANALYSIS_CONFIG = {
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.strategy",
        "kwargs": {
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "verbose": False,
        "limit_threshold": 0.095,
        "account": 100000000,
        "benchmark": BENCHMARK,
        "deal_price": "close",
        "open_cost": 0.0005,
        "close_cost": 0.0015,
        "min_cost": 5,
        "return_order": True,
    },
}


def heatmap(actual_risk, predicted_risk, step=0.02):
    """Build the precision matrix used to evaluate risk prediction visually.

    Thresholds are ``0, -step, -2 * step, ...`` and there are
    ``ceil(-predicted_risk.min() / step)`` of them (at least one). Entry
    ``[i, j]`` is the share of samples with ``predicted_risk < -i * step``
    (the "alarms") whose ``actual_risk`` is also below ``-j * step``.

    :param actual_risk: the LABEL0 of test samples
    :param predicted_risk: the predicted results of test samples
    :param step: the interval between consecutive risk thresholds; must be > 0
    :return: ``(matrix, axis_labels)`` where ``matrix`` is a square float array
        with rows indexed by the predicted threshold and columns by the actual
        threshold, and ``axis_labels`` are the thresholds formatted with three
        decimals. Rows for which no sample raises an alarm are NaN.
    :raises ValueError: if ``step`` is not positive
    """
    if step <= 0:
        raise ValueError("step must be positive, got {!r}".format(step))

    # Always evaluate at least the 0.0 threshold so the matrix is never empty
    # (or negatively sized) when no prediction is below zero.
    num_step = max(math.ceil(-predicted_risk.min() / step), 1)
    thresholds = [-idx * step for idx in range(num_step)]

    # `float` replaces the `np.float` alias, which NumPy has removed.
    matrix = np.full((num_step, num_step), np.nan, dtype=float)
    for row, pred_thresh in enumerate(thresholds):
        # The alarm mask depends only on the predicted threshold: compute once.
        predicted_alarm = predicted_risk < pred_thresh
        num_alarm = predicted_alarm.sum()
        if num_alarm == 0:
            # Precision is undefined without alarms; keep NaN instead of 0/0.
            continue
        for col, act_thresh in enumerate(thresholds):
            actual_positive = actual_risk < act_thresh
            num_tp = (actual_positive & predicted_alarm).sum()
            matrix[row, col] = num_tp / num_alarm

    axis_labels = ["{:.3f}".format(threshold) for threshold in thresholds]
    return matrix, axis_labels


def _plot_results(result_df, corr, market, feature_name):
    """Show the 2x2 evaluation figure for actual vs. predicted risk."""
    actual_risk = result_df["Actual Risk"]
    predicted_risk = result_df["Predicted Risk"]

    _, axes = plt.subplots(2, 2, figsize=(15, 10))

    sns.histplot(actual_risk, ax=axes[0, 0])
    axes[0, 0].set_title(f"Market: {market} Actual Risk")
    axes[0, 0].grid()

    sns.histplot(predicted_risk, ax=axes[0, 1])
    axes[0, 1].set_title(f"Feature: {feature_name} Predicted Risk")
    axes[0, 1].grid()

    sns.scatterplot(data=result_df, ax=axes[1, 0], x="Actual Risk", y="Predicted Risk", s=20)
    axes[1, 0].set_title(f"Market: {market} Feature: {feature_name} Corr: {corr:.5f}")
    axes[1, 0].grid()

    matrix, ax_labels = heatmap(actual_risk, predicted_risk)
    sns.heatmap(
        matrix,
        annot=True,
        fmt=".3f",
        xticklabels=ax_labels,
        yticklabels=ax_labels,
        ax=axes[1, 1],
    )
    # seaborn draws matrix rows along y and columns along x. Rows are the
    # predicted thresholds and columns the actual thresholds, so the labels
    # must be assigned accordingly.
    axes[1, 1].set_xlabel("Actual Positive Threshold")
    axes[1, 1].set_ylabel("Predicted Alarm Threshold")
    axes[1, 1].set_title("Risk Prediction Precision Heatmap")
    plt.show()


def main():
    """Prepare the data, train the model and display the evaluation plots."""
    # use default data
    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
    if not exists_qlib_data(provider_uri):
        print(f"Qlib data is not found in {provider_uri}")
        # `get_data` lives in the sibling `scripts` directory, which is not a
        # package, so it has to be put on sys.path before importing.
        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
        from get_data import GetData

        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)

    qlib.init(provider_uri=provider_uri, region=REG_CN)

    # model initiation
    model = init_instance_by_config(TASK["model"])
    dataset = init_instance_by_config(TASK["dataset"])

    # NOTE: This line is optional
    # It demonstrates that the dataset can be used standalone.
    example_df = dataset.prepare("train")
    print(example_df.head())

    # start exp
    with R.start(experiment_name="workflow"):
        R.log_params(**flatten_dict(TASK))
        model.fit(dataset)

        # prediction
        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)["LABEL0"]
        pred = model.predict(dataset)

        result_df = pd.concat((actual_risk, pred), axis=1)
        result_df.columns = ["Actual Risk", "Predicted Risk"]
        # Only rows with both a label and a prediction can be compared.
        result_df = result_df.dropna()
        corr = pearsonr(result_df["Actual Risk"], result_df["Predicted Risk"])[0]
        print("The correlation between predicted risk and actual risk is: {:.6f}".format(corr))

        # visualized results
        feature_name = TASK["dataset"]["kwargs"]["handler"]["class"]
        _plot_results(result_df, corr, MARKET, feature_name)


if __name__ == "__main__":
    main()