Add Risk Prediction Demo

2026-07-21 11:17:34 +08:00 · 2021-02-23 19:08:11 +09:00
parent cd5b721bc6
commit 1a990fdd25
1 changed files with 179 additions and 0 deletions
--- a/examples/workflow_by_code_lgb_risk_demo.py
+++ b/examples/workflow_by_code_lgb_risk_demo.py
@@ -0,0 +1,179 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+
+import qlib
+from qlib.config import REG_CN
+from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
+from qlib.workflow import R
+from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
+from qlib.data.dataset.handler import DataHandlerLP
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import math
+import pandas as pd
+from scipy.stats.stats import pearsonr
+import numpy as np
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    market = "csi300"
+    benchmark = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    data_handler_config = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": market,
+        "infer_processors": [
+            {"class": "ProcessInf", "kwargs": {}},
+            {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
+            {"class": "Fillna", "kwargs": {}},
+        ],
+        "learn_processors": [{
+            "class": "DropnaLabel", },
+        ],
+        "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"])  # the period for risk prediction is 5 days
+    }
+
+    task = {
+        "model": {
+            "class": "LGBModel",
+            "module_path": "qlib.contrib.model.gbdt",
+            "kwargs": {
+                "loss": "mse",
+                "colsample_bytree": 0.8999,
+                "learning_rate": 0.02,
+                "subsample": 0.7,
+                "lambda_l1": 11.9668,
+                "lambda_l2": 339.1301,
+                "max_depth": 16,
+                "num_leaves": 31,
+                "num_threads": 20,
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "Alpha360",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": data_handler_config,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        },
+    }
+
+    port_analysis_config = {
+        "strategy": {
+            "class": "TopkDropoutStrategy",
+            "module_path": "qlib.contrib.strategy.strategy",
+            "kwargs": {
+                "topk": 50,
+                "n_drop": 5,
+            },
+        },
+        "backtest": {
+            "verbose": False,
+            "limit_threshold": 0.095,
+            "account": 100000000,
+            "benchmark": benchmark,
+            "deal_price": "close",
+            "open_cost": 0.0005,
+            "close_cost": 0.0015,
+            "min_cost": 5,
+            "return_order": True,
+        },
+    }
+
+    # model initiaiton
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+
+    # NOTE: This line is optional
+    # It demonstrates that the dataset can be used standalone.
+    example_df = dataset.prepare("train")
+    print(example_df.head())
+
+    def heatmap(actual_risk, predicted_risk, step=0.02):
+        """
+        plot the precision heatmap as a visualized evaluation for risk predition
+        :param actual_risk: the LABEL0 of test samples
+        :param predicted_risk: the predicted results of test samples
+        :param step: the internal size of risk values on axis
+        :return:
+        """
+        num_step = math.ceil(-predicted_risk.min() / step)
+        matrix = np.zeros((num_step, num_step), dtype=np.float)
+        for pred_thresh in range(num_step):
+            for act_thresh in range(num_step):
+                actual_positive = actual_risk < -act_thresh*step
+                predicted_alarm = predicted_risk < -pred_thresh*step
+                num_alarm = predicted_alarm.sum()
+                num_tp = (actual_positive & predicted_alarm).sum()
+                matrix[pred_thresh, act_thresh] = num_tp / num_alarm
+        axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)]
+        return matrix, axis_labels
+
+    # start exp
+    with R.start(experiment_name="workflow"):
+        R.log_params(**flatten_dict(task))
+        model.fit(dataset)
+
+        # prediction
+        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0']
+        pred = model.predict(dataset)
+
+        result_df = pd.concat((actual_risk, pred), axis=1)
+        result_df.columns = ['Actual Risk', 'Predicted Risk']
+        result_df.dropna(inplace=True)
+        actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1]
+        corr = pearsonr(actual_risk, predicted_risk)[0]
+        print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr))
+
+        # visualized results
+        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+        sns.histplot(actual_risk, ax=axes[0, 0])
+        axes[0, 0].set_title('Market: {}  Actual Risk'.format(market))
+        axes[0, 0].grid()
+
+        sns.histplot(predicted_risk, ax=axes[0, 1])
+        axes[0, 1].set_title('Feature: {}  Predicted Risk'.format(task['dataset']['kwargs']['handler']['class']))
+        axes[0, 1].grid()
+
+        sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20)
+        axes[1, 0].set_title('Market: {}  Feature: {}  Corr: {:.5f}'.format(
+            market, task['dataset']['kwargs']['handler']['class'], corr))
+        axes[1, 0].grid()
+
+        matrix, ax_labels = heatmap(actual_risk, predicted_risk)
+        sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1],
+                    )
+        axes[1, 1].set_xlabel('Predicted Alarm Threshold')
+        axes[1, 1].set_ylabel('Actual Positive Threshold')
+        axes[1, 1].set_title('Risk Prediction Precision Heatmap')
+        plt.show()