From 77ba7b4e91e5a73ef3d87e3b3b741cd757c41d0e Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 22 Apr 2021 05:54:43 +0000 Subject: [PATCH] Add analyser example and finetune example --- examples/data/monitor.py | 112 ++++++++++--------- examples/data/monitor_analyser_demo.ipynb | 130 ++++++++++++++++++++++ qlib/data/monitor/analyser.py | 14 +++ qlib/data/monitor/metric.py | 2 +- 4 files changed, 207 insertions(+), 51 deletions(-) create mode 100644 examples/data/monitor_analyser_demo.ipynb create mode 100644 qlib/data/monitor/analyser.py diff --git a/examples/data/monitor.py b/examples/data/monitor.py index f2f9ea572..21fa5d447 100644 --- a/examples/data/monitor.py +++ b/examples/data/monitor.py @@ -1,46 +1,46 @@ """ -This script is the demonstrating the implementation of following requirements. - +This script demonstrates the implementation of Metric Extractor and Detector NOTE: A lot of details is not considered in this script - Corner case that will raise error( std == 0) -· Transformer: - 1) Basic statistics on different slices of the DataFrame df: - § The statistics include: + +The following functions are used to demonstrate the following examples + + +· Metric Extractor: + case 1) Basic statistics on different slices of the DataFrame df: + 1) The statistics include: · STD, Mean, Skewnes, Kurtosis - § The above statistics can be calculated on the following data slices: + 2) The above statistics can be calculated on the following data slices: · df.groupby(['datetime']) · df.groupby(['datetime', 'industry' ]) - · df.groupby(['instrument', 'factor']) - · df.apply("").groupby([..]), in which [..] could be any one of the above slicing rules. 
- 2) Advanced statistics on different slices of the DataFrame df: - § Auto-correlation: + 3) The statistics could be calculated on the time dimension for each instrument and factor (the factor can be represented by an expression) + · .groupby(['instrument', 'factor']) + case 2) Advanced statistics on different slices of the DataFrame df: + 1) Auto-correlation: · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, …. - § Correlation between factors: + 2) Correlation between factors: · For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :, j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors. - § The data slices are the same as those in 1). -· Monitor: - 1) Algorithms: + +· Detector: detect the abnormality of the extracted metric; + a) Algorithms: § Basic checks: NaN. § Point anomaly detection. § Segment anomaly detection. - 2) Scenarios: + b) Scenarios: § Online anomaly detection: monitoring streaming data. -Offline anomaly detection: verifying whole historical data. 
+The usage of the detectors are demonstrated in the `case_1_*`and `case_2_*` -2021-2-19: - -Effectiveness metrics -- Standard metrics: - - [X] IC(Information Coefficient) #case_3_1 - - [ ] IR(Information Ratio): Informatio Ratio is related to backest - - [X] RankIC #case_3_3 +case 3): Examples to use MetricExt to monitor IC and rank IC + 1) IC(Information Coefficient) #case_3_1 + 2) RankIC #case_3_2 """ # AUTO download data +from typing import List, Union from qlib.utils import exists_qlib_data from qlib.tests.data import GetData from qlib.config import REG_CN @@ -51,8 +51,6 @@ if not exists_qlib_data(provider_uri): GetData().qlib_data(target_dir=provider_uri, region=REG_CN) import qlib - -qlib.init() import pandas as pd from qlib.contrib.data.handler import Alpha158 from qlib.data.dataset.loader import QlibDataLoader @@ -62,30 +60,51 @@ from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD from qlib.data import D import fire - UNIVERSE = "csi300" START_TIME = "20200101" +# ------------------ a helper function to get data to demonstrate the functionality -------------------- -def get_factor_df(col_idx=0): + +def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True): + """ + a helper function to get data to demonstrate the functionality. 
+ + Parameters + ---------- + col_idx : Union[int, List[int]] + column index of the metrics + """ dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME) df = dh.fetch() - print(df.head()) + if verbose: + print(df.head()) # We don't have industries in dataframe, we generate the with fake data industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index) # select a factor factor_df = format_conv(df.iloc[:, col_idx], industry=industry) - print(f"Selected metric: {df.columns[col_idx]}") - - print(factor_df) + if verbose: + print(f"Selected metric: {df.columns[col_idx]}") + print(factor_df) return factor_df +def get_target(horizon=5): + target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1" # There are lots of targets: return is one of them + qdl = QlibDataLoader(config=([target], ["target"])) + df = qdl.load(instruments=UNIVERSE, start_time=START_TIME) # Aligning with factor will improve performance + df = format_conv(df["target"]) + return df + + +# ----------------- Cases to demonstrate the usage of detector and examples ---------------------- + + def case_1_1(): - factor_df = get_factor_df() + factor_df = get_data_df() # 1) Extract metrics # 1.1) df.groupby(["datetime"]) @@ -101,7 +120,7 @@ def case_1_1(): def case_1_2(): - factor_df = get_factor_df() + factor_df = get_data_df() # 1.2) df.groupby("datetime", "industry") mtrc = MeanM(group=["industry"]) m_multi = mtrc.extract(factor_df) @@ -116,9 +135,9 @@ def case_1_2(): print(check_res.value_counts()) -def case_1_3_1_4(): - # case 1.3 and case 1.4 - # factor_df = get_factor_df() +def case_1_3(): + # case 1.3 + # factor_df = get_data_df() qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"])) df = qdl.load(instruments=["SH600519"], start_time=START_TIME) df = format_conv(df) @@ -134,7 +153,7 @@ def case_1_3_1_4(): def case_2_1(): # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, …. 
- factor_df = get_factor_df() + factor_df = get_data_df() acm = AutoCM() mtrc = acm.extract(factor_df) print(mtrc) @@ -147,7 +166,7 @@ def case_2_1(): def case_2_2(): - factor_df1, factor_df2 = get_factor_df(0), get_factor_df(1) + factor_df1, factor_df2 = get_data_df(0), get_data_df(1) cm = CorrM() mtrc = cm.extract(factor_df1, factor_df2) @@ -160,26 +179,18 @@ def case_2_2(): print(check_res.value_counts()) -def get_target(horizon=5): - target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1" # There are lots of targets: return is one of them - qdl = QlibDataLoader(config=([target], ["target"])) - df = qdl.load(instruments=UNIVERSE, start_time=START_TIME) # Aligning with factor will improve performance - df = format_conv(df["target"]) - return df - - -def case_3_1_3_3(): - target, factor = get_target(), get_factor_df(0) +def case_3_1_3_2(): + target, factor = get_target(), get_data_df(0) ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman") ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target) print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic})) -def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2", "case_3_1_3_3"]): +def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]): """ run the specific tests - python monitor.py case_3_1_3_3 + python monitor.py case_3_1_3_2 Parameters ---------- @@ -193,4 +204,5 @@ def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2 if __name__ == "__main__": + qlib.init() fire.Fire(run) diff --git a/examples/data/monitor_analyser_demo.ipynb b/examples/data/monitor_analyser_demo.ipynb new file mode 100644 index 000000000..22dd11cec --- /dev/null +++ b/examples/data/monitor_analyser_demo.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0e62a81e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import 
matplotlib.pyplot as plt\n", + "from tqdm.auto import tqdm\n", + "%matplotlib inline\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c503217b", + "metadata": {}, + "outputs": [], + "source": [ + "from qlib.data.monitor.analyser import Analyser\n", + "import qlib\n", + "qlib.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c276470", + "metadata": {}, + "outputs": [], + "source": [ + "class SimpleDFA(Analyser):\n", + " \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n", + " def analyse(self, data: pd.DataFrame, *args, **kwargs):\n", + " data.plot(*args, **kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "110262e4", + "metadata": {}, + "outputs": [], + "source": [ + "from monitor import get_data_df, AutoCM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ea38c62", + "metadata": {}, + "outputs": [], + "source": [ + "# get data\n", + "factor_df = get_data_df([1], verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbded6fe", + "metadata": {}, + "outputs": [], + "source": [ + "# metric extractor\n", + "acm = AutoCM()\n", + "mtrc = acm.extract(factor_df)\n", + "print(mtrc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65517c81", + "metadata": {}, + "outputs": [], + "source": [ + "# Analyser\n", + "sa = SimpleDFA()\n", + "sa.analyse(mtrc, title='Auto Correlation')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dab6fb2e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + 
"number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/qlib/data/monitor/analyser.py b/qlib/data/monitor/analyser.py new file mode 100644 index 000000000..e2c6fd453 --- /dev/null +++ b/qlib/data/monitor/analyser.py @@ -0,0 +1,14 @@ +from abc import abstractmethod + + +class Analyser: + """ + Analyser is supposed to process the output of MetricExt and produce an analysis result + - The results could be a report or plot. + + We suppose the Analyser doesn't need much computing resource (The heavy computation should be done in MetricExt) + """ + + @abstractmethod + def analyse(self, *args, **kwargs): + ... diff --git a/qlib/data/monitor/metric.py b/qlib/data/monitor/metric.py index 15538b73f..db531e075 100644 --- a/qlib/data/monitor/metric.py +++ b/qlib/data/monitor/metric.py @@ -118,7 +118,7 @@ class AutoCM(MetricExt): class CorrM(MetricExt): - """correlation extractor """ + """correlation extractor""" def __init__(self, mode="pearson"): self.mode = mode