mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 14:01:28 +08:00
Compare commits
3 Commits
v0.9.6
...
qlib_monit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77ba7b4e91 | ||
|
|
7a639eeea7 | ||
|
|
cddaf90ef5 |
208
examples/data/monitor.py
Normal file
208
examples/data/monitor.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
This script is the demonstrating the implementation of Metric Extractor and Detector
|
||||
|
||||
NOTE: A lot of details is not considered in this script
|
||||
- Corner case that will raise error( std == 0)
|
||||
|
||||
|
||||
|
||||
The following functions are used to demonstrate the following examples
|
||||
|
||||
|
||||
· Metric Extractor:
|
||||
case 1) Basic statistics on different slices of the DataFrame df:
|
||||
1) The statistics include:
|
||||
· STD, Mean, Skewnes, Kurtosis
|
||||
2) The above statistics can be calculated on the following data slices:
|
||||
· df.groupby(['datetime'])
|
||||
· df.groupby(['datetime', 'industry' ])
|
||||
3) The statistics could be calculated on the time dimension for each instruments and factor(the factor can be represented by experssion)
|
||||
· <df implemented by expresion>.groupby(['instrument', 'factor'])
|
||||
case 2) Advanced statistics on different slices of the DataFrame df:
|
||||
1) Auto-correlation:
|
||||
· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
|
||||
2) Correlation between factors:
|
||||
· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :, j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
|
||||
|
||||
· Detector: detect the abnormality of the extracted metric;
|
||||
a) Algorithms:
|
||||
§ Basic checks: NaN.
|
||||
§ Point anomaly detection.
|
||||
§ Segment anomaly detection.
|
||||
b) Scenarios:
|
||||
§ Online anomaly detection: monitoring streaming data.
|
||||
The usage of the detectors are demonstrated in the `case_1_*`and `case_2_*`
|
||||
|
||||
|
||||
case 3): Examples to use MetricExt to monitor IC and rank IC
|
||||
1) IC(Information Coefficient) #case_3_1
|
||||
2) RankIC #case_3_2
|
||||
"""
|
||||
|
||||
# AUTO download data
|
||||
from typing import List, Union
|
||||
from qlib.utils import exists_qlib_data
|
||||
from qlib.tests.data import GetData
|
||||
from qlib.config import REG_CN
|
||||
|
||||
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
|
||||
if not exists_qlib_data(provider_uri):
|
||||
print(f"Qlib data is not found in {provider_uri}")
|
||||
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
|
||||
|
||||
import qlib
|
||||
import pandas as pd
|
||||
from qlib.contrib.data.handler import Alpha158
|
||||
from qlib.data.dataset.loader import QlibDataLoader
|
||||
from qlib.data.monitor.metric import format_conv
|
||||
from qlib.data.monitor.metric import MeanM, SkewM, KurtM, StdM, AutoCM, CorrM
|
||||
from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
|
||||
from qlib.data import D
|
||||
import fire
|
||||
|
||||
UNIVERSE = "csi300"
|
||||
START_TIME = "20200101"
|
||||
|
||||
# ------------------ a helper function to get data to demonstrate the functionality --------------------
|
||||
|
||||
|
||||
def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
|
||||
"""
|
||||
a helper function to get data to demonstrate the functionality.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
col_idx : Union[int, List[int]]
|
||||
column index of the metrics
|
||||
"""
|
||||
dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
|
||||
df = dh.fetch()
|
||||
|
||||
if verbose:
|
||||
print(df.head())
|
||||
|
||||
# We don't have industries in dataframe, we generate the with fake data
|
||||
industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index)
|
||||
|
||||
# select a factor
|
||||
factor_df = format_conv(df.iloc[:, col_idx], industry=industry)
|
||||
if verbose:
|
||||
print(f"Selected metric: {df.columns[col_idx]}")
|
||||
print(factor_df)
|
||||
return factor_df
|
||||
|
||||
|
||||
def get_target(horizon=5):
|
||||
target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1" # There are lots of targets: return is one of them
|
||||
qdl = QlibDataLoader(config=([target], ["target"]))
|
||||
df = qdl.load(instruments=UNIVERSE, start_time=START_TIME) # Aligning with factor will improve performance
|
||||
df = format_conv(df["target"])
|
||||
return df
|
||||
|
||||
|
||||
# ----------------- Cases to demonstrate the usage of detector and examples ----------------------
|
||||
|
||||
|
||||
def case_1_1():
|
||||
factor_df = get_data_df()
|
||||
# 1) Extract metrics
|
||||
|
||||
# 1.1) df.groupby(["datetime"])
|
||||
mtrc = MeanM()
|
||||
m_mean = mtrc.extract(factor_df)
|
||||
print(m_mean)
|
||||
|
||||
ndd = NDDetector()
|
||||
ndd.fit(m_mean) # use historical data to fit detector
|
||||
check_res = ndd.check(m_mean)
|
||||
print(check_res) # detecting on new data or historical data
|
||||
print(check_res.value_counts())
|
||||
|
||||
|
||||
def case_1_2():
|
||||
factor_df = get_data_df()
|
||||
# 1.2) df.groupby("datetime", "industry")
|
||||
mtrc = MeanM(group=["industry"])
|
||||
m_multi = mtrc.extract(factor_df)
|
||||
print(m_multi)
|
||||
|
||||
for col_name, s in m_multi.iteritems():
|
||||
print(col_name)
|
||||
ndd = NDDetector()
|
||||
ndd.fit(s) # use historical data to fit detector
|
||||
check_res = ndd.check(s)
|
||||
print(check_res) # detecting on new data or historical data
|
||||
print(check_res.value_counts())
|
||||
|
||||
|
||||
def case_1_3():
|
||||
# case 1.3
|
||||
# factor_df = get_data_df()
|
||||
qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
|
||||
df = qdl.load(instruments=["SH600519"], start_time=START_TIME)
|
||||
df = format_conv(df)
|
||||
s = df.iloc[:, 0]
|
||||
print(s)
|
||||
dtc = SWNDD(window=20)
|
||||
dtc.fit(s) # fit use historical data (TODO: updating will be supported in the future)
|
||||
check_res = dtc.check(s) #
|
||||
print(check_res)
|
||||
print(check_res.value_counts())
|
||||
print(check_res[check_res])
|
||||
|
||||
|
||||
def case_2_1():
|
||||
# · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
|
||||
factor_df = get_data_df()
|
||||
acm = AutoCM()
|
||||
mtrc = acm.extract(factor_df)
|
||||
print(mtrc)
|
||||
|
||||
thd = ThresholdD(0.0, reverse=True)
|
||||
check_res = thd.check(mtrc)
|
||||
|
||||
print(check_res)
|
||||
print(check_res.value_counts())
|
||||
|
||||
|
||||
def case_2_2():
|
||||
factor_df1, factor_df2 = get_data_df(0), get_data_df(1)
|
||||
|
||||
cm = CorrM()
|
||||
mtrc = cm.extract(factor_df1, factor_df2)
|
||||
print(mtrc)
|
||||
|
||||
thd = ThresholdD(0.0, reverse=True)
|
||||
check_res = thd.check(mtrc)
|
||||
|
||||
print(check_res)
|
||||
print(check_res.value_counts())
|
||||
|
||||
|
||||
def case_3_1_3_2():
|
||||
target, factor = get_target(), get_data_df(0)
|
||||
ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman")
|
||||
ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target)
|
||||
print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))
|
||||
|
||||
|
||||
def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]):
|
||||
"""
|
||||
run the specific tests
|
||||
|
||||
python monitor.py case_3_1_3_2
|
||||
|
||||
Parameters
|
||||
----------
|
||||
test_list : str[]
|
||||
The tests to run
|
||||
"""
|
||||
if isinstance(test_list, str):
|
||||
test_list = [test_list]
|
||||
for fn in test_list:
|
||||
globals()[fn]()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
qlib.init()
|
||||
fire.Fire(run)
|
||||
130
examples/data/monitor_analyser_demo.ipynb
Normal file
130
examples/data/monitor_analyser_demo.ipynb
Normal file
@@ -0,0 +1,130 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0e62a81e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"%matplotlib inline\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c503217b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from qlib.data.monitor.analyser import Analyser\n",
|
||||
"import qlib\n",
|
||||
"qlib.init()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c276470",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class SimpleDFA(Analyser):\n",
|
||||
" \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n",
|
||||
" def analyse(self, data: pd.DataFrame, *args, **kwargs):\n",
|
||||
" data.plot(*args, **kwargs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "110262e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from monitor import get_data_df, AutoCM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0ea38c62",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get data\n",
|
||||
"factor_df = get_data_df([1], verbose=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dbded6fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# metric extractor\n",
|
||||
"acm = AutoCM()\n",
|
||||
"mtrc = acm.extract(factor_df)\n",
|
||||
"print(mtrc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "65517c81",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Analyser\n",
|
||||
"sa = SimpleDFA()\n",
|
||||
"sa.analyse(mtrc, title='Auto Correlation')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dab6fb2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
0
qlib/data/monitor/__init__.py
Normal file
0
qlib/data/monitor/__init__.py
Normal file
14
qlib/data/monitor/analyser.py
Normal file
14
qlib/data/monitor/analyser.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from abc import abstractmethod
|
||||
|
||||
|
||||
class Analyser:
|
||||
"""
|
||||
Analyser is supposed to process the output MetricExt and produce a analysis result
|
||||
- The results could be a report or plot.
|
||||
|
||||
We suppose the Analyser doesn't need much computing resource (The heavy computation should be done in MetricExt)
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def analyse(self, *args, **kwargs):
|
||||
...
|
||||
103
qlib/data/monitor/detector.py
Normal file
103
qlib/data/monitor/detector.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import pandas as pd
|
||||
import abc
|
||||
from typing import Union
|
||||
|
||||
|
||||
class Detector(metaclass=abc.ABCMeta):
|
||||
"""
|
||||
Detector is stateful
|
||||
|
||||
The input of detector is Series with shape <time> or Dataframe with shape <time, factor>
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def check(self, df: Union[pd.Series, pd.DataFrame]) -> pd.Series:
|
||||
"""
|
||||
Check the result of values
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : Union[pd.Series, pd.DataFrame]
|
||||
Data to be checked
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.Series:
|
||||
True: Abnormalities detected.
|
||||
False: No abnormality detected.
|
||||
"""
|
||||
pass
|
||||
|
||||
def fit(self, df: pd.DataFrame):
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
|
||||
def update(self, df: pd.DataFrame):
|
||||
"""
|
||||
The state of detector can be updated gradually.
|
||||
"""
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
|
||||
|
||||
class NDDetector(Detector):
|
||||
"""
|
||||
Normal Distribution Detector
|
||||
This will be used more in offline detector
|
||||
"""
|
||||
|
||||
def __init__(self, n=3):
|
||||
"""
|
||||
The detection range:
|
||||
- mean ± n * std
|
||||
"""
|
||||
self.n = n
|
||||
|
||||
def fit(self, s: pd.Series):
|
||||
self.mean = s.mean()
|
||||
self.std = s.std()
|
||||
|
||||
def check(self, s: pd.Series):
|
||||
return ~s.between(self.mean - self.std * self.n, self.mean + self.std * self.n)
|
||||
|
||||
|
||||
class SWNDD(Detector):
|
||||
"""(S)liding (W)indow (N)ormal (D)istribition (D)etector"""
|
||||
|
||||
def __init__(self, n=3, **rolling_args):
|
||||
self.rolling_args = rolling_args
|
||||
self.n = n
|
||||
|
||||
def fit(self, s: pd.Series):
|
||||
# TODO: pd.Dataframe is not supported now
|
||||
self.mean = s.rolling(**self.rolling_args).mean()
|
||||
self.std = s.rolling(**self.rolling_args).std()
|
||||
|
||||
def check(self, s: pd.Series):
|
||||
res = ~s.between(self.mean - self.std * self.n, self.mean + self.std * self.n)
|
||||
res = res & ~self.mean.isna() & ~self.std.isna()
|
||||
|
||||
return res.reindex(s.index)
|
||||
|
||||
|
||||
# TODO: daily normalization detector.
|
||||
|
||||
|
||||
class CountD(Detector):
|
||||
"""
|
||||
Count detector
|
||||
"""
|
||||
|
||||
# TODO: check if the number of counts is enough
|
||||
# TODO: This is a instance of Count Detector.
|
||||
|
||||
|
||||
class ThresholdD(Detector):
|
||||
"""
|
||||
Threshold (D)etector
|
||||
"""
|
||||
|
||||
def __init__(self, threshold, reverse=False):
|
||||
self.threshold = threshold
|
||||
self.reverse = reverse
|
||||
|
||||
def check(self, s: pd.Series):
|
||||
return (s < self.threshold) if self.reverse else (s > self.threshold)
|
||||
127
qlib/data/monitor/metric.py
Normal file
127
qlib/data/monitor/metric.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
from abc import abstractmethod, ABCMeta
|
||||
|
||||
|
||||
def format_conv(df: pd.Series, **col_group):
|
||||
# TODO: col_group
|
||||
df = df.copy() # performance problems here
|
||||
if len(col_group) > 0:
|
||||
col_group_df = pd.DataFrame({name: group.reindex(df.index) for name, group in col_group.items()})
|
||||
col_group_df = col_group_df.reindex(df.index)
|
||||
# merge all the groups into df.index
|
||||
col_group_df = col_group_df.set_index(keys=col_group_df.columns.to_list(), append=True)
|
||||
df.index = col_group_df.index
|
||||
|
||||
ustk_cols = [col for col in df.index.names if col != "datetime"]
|
||||
return df.unstack(ustk_cols)
|
||||
|
||||
|
||||
class MetricExt(metaclass=ABCMeta):
|
||||
"""Metric Extractor
|
||||
Current design.
|
||||
The input data are assumed like qlib format
|
||||
The extracted information like time-series. Column could be multiple index
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def extract(self, df: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
|
||||
pass
|
||||
|
||||
|
||||
# overall metrics
|
||||
class AggMetrics(MetricExt):
|
||||
"""
|
||||
TODO: this metric assumes that the daily assumptions(The operation is used on each row)
|
||||
"""
|
||||
|
||||
def __init__(self, group=None):
|
||||
if isinstance(group, str):
|
||||
group = [group]
|
||||
self.group = group
|
||||
|
||||
def extract(self, df: pd.DataFrame) -> pd.Series:
|
||||
if self.group is None:
|
||||
return df.apply(self.agg, axis=1)
|
||||
else:
|
||||
return df.groupby(self.group, axis=1).apply(self.agg, axis=1)
|
||||
|
||||
@abstractmethod
|
||||
def agg(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
class StdM(AggMetrics):
|
||||
def agg(self, s, *args, **kwargs):
|
||||
return s.std(*args, **kwargs)
|
||||
|
||||
|
||||
class MeanM(AggMetrics):
|
||||
def agg(self, s, *args, **kwargs):
|
||||
return s.mean(*args, **kwargs)
|
||||
|
||||
|
||||
class SkewM(AggMetrics):
|
||||
def agg(self, s, *args, **kwargs):
|
||||
return s.skew(*args, **kwargs)
|
||||
|
||||
|
||||
class KurtM(AggMetrics):
|
||||
def agg(self, s, *args, **kwargs):
|
||||
return pd.DataFrame.kurt(s, *args, **kwargs)
|
||||
|
||||
|
||||
# sliding window metrics
|
||||
class SWMetrics(MetricExt):
|
||||
"""
|
||||
(S)liding (W)indow Metrics
|
||||
|
||||
TODO: testing this class
|
||||
"""
|
||||
|
||||
def __init__(self, **rolling_args):
|
||||
self.rolling_args = rolling_args
|
||||
|
||||
def extract(self, df: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
|
||||
if isinstance(pd.Series):
|
||||
return df.rolling(**self.rolling_args).apply(self.agg)
|
||||
elif isinstance(pd.DataFrame):
|
||||
return df.rolling(**self.rolling_args).apply(self.agg, axis=0)
|
||||
else:
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
|
||||
@abstractmethod
|
||||
def agg(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
## TODO: more metrics is ignored: mean, std, skew, kurt
|
||||
|
||||
|
||||
def calc_corr(df1: pd.DataFrame, df2: pd.DataFrame, mode):
|
||||
corr = {}
|
||||
for (t1, s1), (t2, s2) in zip(df1.iterrows(), df2.iterrows()):
|
||||
assert t1 == t2
|
||||
corr[t1] = s1.corr(s2, method=mode)
|
||||
return pd.Series(corr)
|
||||
|
||||
|
||||
class AutoCM(MetricExt):
|
||||
"""(A)uto (C)orrelation (M)etrics"""
|
||||
|
||||
def __init__(self, mode="pearson", shift=1):
|
||||
self.mode = mode
|
||||
self.shift = shift
|
||||
|
||||
def extract(self, df: pd.DataFrame):
|
||||
return calc_corr(df, df.shift(self.shift), self.mode)
|
||||
|
||||
|
||||
class CorrM(MetricExt):
|
||||
"""correlation extractor"""
|
||||
|
||||
def __init__(self, mode="pearson"):
|
||||
self.mode = mode
|
||||
|
||||
def extract(self, df1: pd.DataFrame, df2: pd.DataFrame):
|
||||
return calc_corr(df1, df2, self.mode)
|
||||
Reference in New Issue
Block a user