mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 14:01:28 +08:00
209 lines
6.4 KiB
Python
209 lines
6.4 KiB
Python
"""
|
|
This script demonstrates the implementation of Metric Extractor and Detector
|
|
|
|
NOTE: A lot of details are not considered in this script
|
|
- Corner cases that will raise errors (e.g. std == 0)
|
|
|
|
|
|
|
|
The following functions are used to demonstrate the following examples
|
|
|
|
|
|
· Metric Extractor:
|
|
case 1) Basic statistics on different slices of the DataFrame df:
|
|
1) The statistics include:
|
|
· STD, Mean, Skewness, Kurtosis
|
|
2) The above statistics can be calculated on the following data slices:
|
|
· df.groupby(['datetime'])
|
|
· df.groupby(['datetime', 'industry' ])
|
|
3) The statistics could be calculated on the time dimension for each instrument and factor (the factor can be represented by an expression)
|
|
· <df implemented by expression>.groupby(['instrument', 'factor'])
|
|
case 2) Advanced statistics on different slices of the DataFrame df:
|
|
1) Auto-correlation:
|
|
· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
|
|
2) Correlation between factors:
|
|
· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :, j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
|
|
|
|
· Detector: detect the abnormality of the extracted metric;
|
|
a) Algorithms:
|
|
§ Basic checks: NaN.
|
|
§ Point anomaly detection.
|
|
§ Segment anomaly detection.
|
|
b) Scenarios:
|
|
§ Online anomaly detection: monitoring streaming data.
|
|
The usage of the detectors is demonstrated in `case_1_*` and `case_2_*`
|
|
|
|
|
|
case 3): Examples to use MetricExt to monitor IC and rank IC
|
|
1) IC(Information Coefficient) #case_3_1
|
|
2) RankIC #case_3_2
|
|
"""
|
|
|
|
# AUTO download data
|
|
from typing import List, Union
|
|
from qlib.utils import exists_qlib_data
|
|
from qlib.tests.data import GetData
|
|
from qlib.config import REG_CN
|
|
|
|
# Directory that holds the Qlib CN dataset; it doubles as the download target
# when the data is missing.
provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir

# Fetch the dataset on first use so the demo cases below have data to work on.
if not exists_qlib_data(provider_uri):
    print(f"Qlib data is not found in {provider_uri}")
    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
|
|
|
|
import qlib
|
|
import pandas as pd
|
|
from qlib.contrib.data.handler import Alpha158
|
|
from qlib.data.dataset.loader import QlibDataLoader
|
|
from qlib.data.monitor.metric import format_conv
|
|
from qlib.data.monitor.metric import MeanM, SkewM, KurtM, StdM, AutoCM, CorrM
|
|
from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
|
|
from qlib.data import D
|
|
import fire
|
|
|
|
# Instrument universe passed as `instruments=` by the data helpers below.
UNIVERSE = "csi300"
# Start time passed as `start_time=` by the data helpers and the case 1.3 loader.
START_TIME = "20200101"
|
|
|
|
# ------------------ a helper function to get data to demonstrate the functionality --------------------
|
|
|
|
|
|
def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
    """
    Fetch Alpha158 data and convert the selected column(s) for the demo cases.

    Parameters
    ----------
    col_idx : Union[int, List[int]]
        column index of the metrics
    verbose : bool
        when True, print the head of the fetched data, the selected column
        name(s) and the converted result

    Returns
    -------
    The value returned by `format_conv` for the selected column(s) together
    with the generated industry labels.
    """
    handler = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
    raw_df = handler.fetch()

    if verbose:
        print(raw_df.head())

    # The dataframe has no industry information, so fake labels are generated
    # from the first two characters of each instrument code.
    instrument_codes = raw_df.index.get_level_values("instrument")
    industry = pd.Series(instrument_codes.str.slice(stop=2).to_list(), index=raw_df.index)

    # select a factor
    selected = raw_df.iloc[:, col_idx]
    factor_df = format_conv(selected, industry=industry)

    if verbose:
        print(f"Selected metric: {raw_df.columns[col_idx]}")
        print(factor_df)
    return factor_df
|
|
|
|
|
|
def get_target(horizon=5):
    """
    Load the prediction target used by the IC / rank IC example.

    Parameters
    ----------
    horizon : int
        offset used to build the target expression
        `Ref($close, -(horizon + 1))/Ref($close, -1) - 1`

    Returns
    -------
    The value returned by `format_conv` for the loaded "target" column.
    """
    # There are lots of targets: return is one of them
    target_expr = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"
    loader = QlibDataLoader(config=([target_expr], ["target"]))
    # Aligning with factor will improve performance
    loaded = loader.load(instruments=UNIVERSE, start_time=START_TIME)
    return format_conv(loaded["target"])
|
|
|
|
|
|
# ----------------- Cases to demonstrate the usage of detector and examples ----------------------
|
|
|
|
|
|
def case_1_1():
    """Case 1.1: extract a `MeanM` metric and check it with an `NDDetector`."""
    factor_df = get_data_df()

    # 1) Extract metrics
    # 1.1) df.groupby(["datetime"])
    m_mean = MeanM().extract(factor_df)
    print(m_mean)

    detector = NDDetector()
    # use historical data to fit detector
    detector.fit(m_mean)
    # detecting on new data or historical data
    flags = detector.check(m_mean)
    print(flags)
    print(flags.value_counts())
|
|
|
|
|
|
def case_1_2():
    """
    Case 1.2: extract a `MeanM` metric grouped by industry and check every
    resulting column with its own `NDDetector`.
    """
    factor_df = get_data_df()
    # 1.2) df.groupby("datetime", "industry")
    mtrc = MeanM(group=["industry"])
    m_multi = mtrc.extract(factor_df)
    print(m_multi)

    # `iteritems` was removed in pandas 2.0; `items` yields the same
    # (name, values) pairs and is available on older pandas versions as well.
    for col_name, s in m_multi.items():
        print(col_name)
        ndd = NDDetector()
        ndd.fit(s)  # use historical data to fit detector
        check_res = ndd.check(s)
        print(check_res)  # detecting on new data or historical data
        print(check_res.value_counts())
|
|
|
|
|
|
def case_1_3():
    """
    Case 1.3: load the "return" series of a single instrument and check it
    with an `SWNDD` detector (window=20).
    """
    # case 1.3
    loader = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
    loaded = loader.load(instruments=["SH600519"], start_time=START_TIME)
    s = format_conv(loaded).iloc[:, 0]
    print(s)

    detector = SWNDD(window=20)
    # fit use historical data (TODO: updating will be supported in the future)
    detector.fit(s)
    flags = detector.check(s)
    print(flags)
    print(flags.value_counts())
    # show only the entries where the check result is truthy
    print(flags[flags])
|
|
|
|
|
|
def case_2_1():
    """Case 2.1: extract an `AutoCM` metric and check it with a `ThresholdD`."""
    # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
    factor_df = get_data_df()
    auto_corr = AutoCM().extract(factor_df)
    print(auto_corr)

    flags = ThresholdD(0.0, reverse=True).check(auto_corr)

    print(flags)
    print(flags.value_counts())
|
|
|
|
|
|
def case_2_2():
    """
    Case 2.2: extract a `CorrM` metric between the factors at column 0 and
    column 1 and check it with a `ThresholdD`.
    """
    first_factor = get_data_df(0)
    second_factor = get_data_df(1)

    corr = CorrM().extract(first_factor, second_factor)
    print(corr)

    flags = ThresholdD(0.0, reverse=True).check(corr)

    print(flags)
    print(flags.value_counts())
|
|
|
|
|
|
def case_3_1_3_2():
    """
    Case 3.1 / 3.2: print IC and rank IC between the factor at column 0 and
    the target, using `CorrM()` and `CorrM(mode="spearman")` respectively.
    """
    # Keep the original evaluation order: target first, then the factor.
    target = get_target()
    factor = get_data_df(0)

    ic_metric = CorrM()
    rank_ic_metric = CorrM(mode="spearman")

    ic = ic_metric.extract(factor, target)
    rank_ic = rank_ic_metric.extract(factor, target)
    print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))
|
|
|
|
|
|
def run(test_list=("case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2")):
    """
    run the specific tests

    python monitor.py case_3_1_3_2

    Parameters
    ----------
    test_list : str[]
        The tests to run. Each entry is the name of a module-level callable
        in this file. A single name may also be passed as a plain string.

    Raises
    ------
    KeyError
        If a requested name is not defined at module level.
    """
    # The default is an immutable tuple so it can never be mutated across calls.
    if isinstance(test_list, str):
        test_list = [test_list]
    for fn in test_list:
        # Cases are looked up by name among the module-level definitions.
        try:
            case = globals()[fn]
        except KeyError:
            raise KeyError(f"Unknown test case: {fn!r}") from None
        case()
|
|
|
|
|
|
# Script entry point: initialise qlib, then expose `run` on the command line
# through python-fire.
if __name__ == "__main__":
    qlib.init()
    fire.Fire(run)
|