1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Add analyser example and finetune example

This commit is contained in:
Young
2021-04-22 05:54:43 +00:00
parent 7a639eeea7
commit 77ba7b4e91
4 changed files with 207 additions and 51 deletions

View File

@@ -1,46 +1,46 @@
"""
This script is the demonstrating the implementation of following requirements.
This script demonstrates the implementation of the Metric Extractor and Detector
NOTE: A lot of details are not considered in this script
- Corner cases that will raise errors (e.g. std == 0)
· Transformer:
1) Basic statistics on different slices of the DataFrame df:
§ The statistics include:
The following functions are used to demonstrate the following examples
· Metric Extractor:
case 1) Basic statistics on different slices of the DataFrame df:
1) The statistics include:
· STD, Mean, Skewness, Kurtosis
§ The above statistics can be calculated on the following data slices:
2) The above statistics can be calculated on the following data slices:
· df.groupby(['datetime'])
· df.groupby(['datetime', 'industry' ])
· df.groupby(['instrument', 'factor'])
· df.apply("<expression>").groupby([..]), in which [..] could be any one of the above slicing rules.
2) Advanced statistics on different slices of the DataFrame df:
§ Auto-correlation:
3) The statistics could be calculated on the time dimension for each instrument and factor (the factor can be represented by an expression)
· <df implemented by expression>.groupby(['instrument', 'factor'])
case 2) Advanced statistics on different slices of the DataFrame df:
1) Auto-correlation:
· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
§ Correlation between factors:
2) Correlation between factors:
· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :, j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
§ The data slices are the same as those in 1).
· Monitor:
1) Algorithms:
· Detector: detect the abnormality of the extracted metric;
a) Algorithms:
§ Basic checks: NaN.
§ Point anomaly detection.
§ Segment anomaly detection.
2) Scenarios:
b) Scenarios:
§ Online anomaly detection: monitoring streaming data.
Offline anomaly detection: verifying whole historical data.
The usage of the detectors is demonstrated in `case_1_*` and `case_2_*`
2021-2-19:
Effectiveness metrics
- Standard metrics:
- [X] IC(Information Coefficient) #case_3_1
- [ ] IR(Information Ratio): Informatio Ratio is related to backest
- [X] RankIC #case_3_3
case 3): Examples to use MetricExt to monitor IC and rank IC
1) IC(Information Coefficient) #case_3_1
2) RankIC #case_3_2
"""
# AUTO download data
from typing import List, Union
from qlib.utils import exists_qlib_data
from qlib.tests.data import GetData
from qlib.config import REG_CN
@@ -51,8 +51,6 @@ if not exists_qlib_data(provider_uri):
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
import qlib
qlib.init()
import pandas as pd
from qlib.contrib.data.handler import Alpha158
from qlib.data.dataset.loader import QlibDataLoader
@@ -62,30 +60,51 @@ from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
from qlib.data import D
import fire
UNIVERSE = "csi300"
START_TIME = "20200101"
# ------------------ a helper function to get data to demonstrate the functionality --------------------
def get_factor_df(col_idx=0):
def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
"""
a helper function to get data to demonstrate the functionality.
Parameters
----------
col_idx : Union[int, List[int]]
column index of the metrics
"""
dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
df = dh.fetch()
print(df.head())
if verbose:
print(df.head())
# We don't have industries in the dataframe, so we generate them with fake data
industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index)
# select a factor
factor_df = format_conv(df.iloc[:, col_idx], industry=industry)
print(f"Selected metric: {df.columns[col_idx]}")
print(factor_df)
if verbose:
print(f"Selected metric: {df.columns[col_idx]}")
print(factor_df)
return factor_df
def get_target(horizon=5):
    """
    a helper function to load the prediction target used in the IC examples.

    Parameters
    ----------
    horizon : int
        used to build the expression
        ``Ref($close, -(horizon + 1))/Ref($close, -1) - 1``

    Returns
    -------
    the "target" column of the loaded data after being passed through `format_conv`
    """
    # There are lots of targets: return is one of them
    expression = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"
    loader = QlibDataLoader(config=([expression], ["target"]))
    # Aligning with factor will improve performance
    loaded = loader.load(instruments=UNIVERSE, start_time=START_TIME)
    return format_conv(loaded["target"])
# ----------------- Cases to demonstrate the usage of detector and examples ----------------------
def case_1_1():
factor_df = get_factor_df()
factor_df = get_data_df()
# 1) Extract metrics
# 1.1) df.groupby(["datetime"])
@@ -101,7 +120,7 @@ def case_1_1():
def case_1_2():
factor_df = get_factor_df()
factor_df = get_data_df()
# 1.2) df.groupby("datetime", "industry")
mtrc = MeanM(group=["industry"])
m_multi = mtrc.extract(factor_df)
@@ -116,9 +135,9 @@ def case_1_2():
print(check_res.value_counts())
def case_1_3_1_4():
# case 1.3 and case 1.4
# factor_df = get_factor_df()
def case_1_3():
# case 1.3
# factor_df = get_data_df()
qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
df = qdl.load(instruments=["SH600519"], start_time=START_TIME)
df = format_conv(df)
@@ -134,7 +153,7 @@ def case_1_3_1_4():
def case_2_1():
# · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
factor_df = get_factor_df()
factor_df = get_data_df()
acm = AutoCM()
mtrc = acm.extract(factor_df)
print(mtrc)
@@ -147,7 +166,7 @@ def case_2_1():
def case_2_2():
factor_df1, factor_df2 = get_factor_df(0), get_factor_df(1)
factor_df1, factor_df2 = get_data_df(0), get_data_df(1)
cm = CorrM()
mtrc = cm.extract(factor_df1, factor_df2)
@@ -160,26 +179,18 @@ def case_2_2():
print(check_res.value_counts())
def get_target(horizon=5):
target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1" # There are lots of targets: return is one of them
qdl = QlibDataLoader(config=([target], ["target"]))
df = qdl.load(instruments=UNIVERSE, start_time=START_TIME) # Aligning with factor will improve performance
df = format_conv(df["target"])
return df
def case_3_1_3_3():
target, factor = get_target(), get_factor_df(0)
def case_3_1_3_2():
target, factor = get_target(), get_data_df(0)
ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman")
ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target)
print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))
def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2", "case_3_1_3_3"]):
def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]):
"""
run the specific tests
python monitor.py case_3_1_3_3
python monitor.py case_3_1_3_2
Parameters
----------
@@ -193,4 +204,5 @@ def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2
if __name__ == "__main__":
qlib.init()
fire.Fire(run)

View File

@@ -0,0 +1,130 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0e62a81e",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from tqdm.auto import tqdm\n",
"%matplotlib inline\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c503217b",
"metadata": {},
"outputs": [],
"source": [
"from qlib.data.monitor.analyser import Analyser\n",
"import qlib\n",
"qlib.init()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c276470",
"metadata": {},
"outputs": [],
"source": [
"class SimpleDFA(Analyser):\n",
" \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n",
" def analyse(self, data: pd.DataFrame, *args, **kwargs):\n",
" data.plot(*args, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "110262e4",
"metadata": {},
"outputs": [],
"source": [
"from monitor import get_data_df, AutoCM"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ea38c62",
"metadata": {},
"outputs": [],
"source": [
"# get data\n",
"factor_df = get_data_df([1], verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbded6fe",
"metadata": {},
"outputs": [],
"source": [
"# metric extractor\n",
"acm = AutoCM()\n",
"mtrc = acm.extract(factor_df)\n",
"print(mtrc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65517c81",
"metadata": {},
"outputs": [],
"source": [
"# Analyser\n",
"sa = SimpleDFA()\n",
"sa.analyse(mtrc, title='Auto Correlation')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dab6fb2e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,14 @@
from abc import ABC, abstractmethod
class Analyser(ABC):
    """
    Analyser is supposed to process the output of MetricExt and produce an analysis result

    - The results could be a report or a plot.

    We suppose the Analyser doesn't need much computing resource (the heavy computation should be done in MetricExt)

    NOTE: the class derives from `ABC` so that `@abstractmethod` is actually enforced;
    a subclass has to override `analyse` before it can be instantiated.
    """

    @abstractmethod
    def analyse(self, *args, **kwargs):
        """
        Analyse the given input and produce the analysis result.

        The accepted arguments and the produced result are defined by the concrete subclass.
        """

View File

@@ -118,7 +118,7 @@ class AutoCM(MetricExt):
class CorrM(MetricExt):
"""correlation extractor """
"""correlation extractor"""
def __init__(self, mode="pearson"):
self.mode = mode