diff --git a/examples/data/monitor.py b/examples/data/monitor.py new file mode 100644 index 000000000..daefd9c63 --- /dev/null +++ b/examples/data/monitor.py @@ -0,0 +1,154 @@ +""" +This script is the demonstrating the implementation of following requirements. + + +NOTE: A lot of details is not considered in this script +- Corner case that will raise error( std == 0) + + +· Transformer: + 1) Basic statistics on different slices of the DataFrame df: + § The statistics include: + · STD, Mean, Skewnes, Kurtosis + § The above statistics can be calculated on the following data slices: + · df.groupby(['datetime']) + · df.groupby(['datetime', 'industry' ]) + · df.groupby(['instrument', 'factor']) + · df.apply("").groupby([..]), in which [..] could be any one of the above slicing rules. + 2) Advanced statistics on different slices of the DataFrame df: + § Auto-correlation: + · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, …. + § Correlation between factors: + · For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :, j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors. + § The data slices are the same as those in 1). +· Monitor: + 1) Algorithms: + § Basic checks: NaN. + § Point anomaly detection. + § Segment anomaly detection. + 2) Scenarios: + § Online anomaly detection: monitoring streaming data. +Offline anomaly detection: verifying whole historical data. +""" + +# AUTO download data +from qlib.utils import exists_qlib_data +from qlib.tests.data import GetData +from qlib.config import REG_CN + +provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir +if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + GetData().qlib_data(target_dir=provider_uri, region=REG_CN) + +import qlib + +qlib.init() +import pandas as pd +from qlib.contrib.data.handler import Alpha158 +from qlib.data.dataset.loader import QlibDataLoader +from qlib.data.monitor.metric import format_conv +from qlib.data.monitor.metric import MeanM, SkewM, KurtM, StdM, AutoCM, CorrM +from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD +from qlib.data import D + + +def get_factor_df(col_idx=0): + dh = Alpha158(instruments="csi300", infer_processors=[], learn_processors=[], start_time="20200101") + df = dh.fetch() + + print(df.head()) + + # We don't have industries in dataframe, we generate the with fake data + industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index) + + # select a factor + factor_df = format_conv(df.iloc[:, col_idx], industry=industry) + print(f"Selected metric: {df.columns[col_idx]}") + + print(factor_df) + return factor_df + + +def case_1_1(): + factor_df = get_factor_df() + # 1) Extract metrics + + # 1.1) df.groupby(["datetime"]) + mtrc = MeanM() + m_mean = mtrc.extract(factor_df) + print(m_mean) + + ndd = NDDetector() + ndd.fit(m_mean) # use historical data to fit detector + check_res = ndd.check(m_mean) + print(check_res) # detecting on new data or historical data + print(check_res.value_counts()) + + +def case_1_2(): + factor_df = get_factor_df() + # 1.2) df.groupby("datetime", "industry") + mtrc = MeanM(group=["industry"]) + m_multi = mtrc.extract(factor_df) + print(m_multi) + + for col_name, s in m_multi.iteritems(): + print(col_name) + ndd = NDDetector() + ndd.fit(s) # use historical data to fit detector + check_res = ndd.check(s) + print(check_res) # detecting on new data or historical data + print(check_res.value_counts()) + + +def case_1_3_1_4(): + # case 1.3 and case 1.4 + # factor_df = get_factor_df() + qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"])) + df = qdl.load(instruments=["SH600519"], start_time="20200101") + df = format_conv(df) + s = df.iloc[:, 0] + print(s) + dtc = SWNDD(window=20) + dtc.fit(s) # fit use historical data (TODO: updating will be supported in the future) + check_res = dtc.check(s) # + print(check_res) + print(check_res.value_counts()) + print(check_res[check_res]) + + +def case_2_1(): + # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, …. + factor_df = get_factor_df() + acm = AutoCM() + mtrc = acm.extract(factor_df) + print(mtrc) + + thd = ThresholdD(0.0, reverse=True) + check_res = thd.check(mtrc) + + print(check_res) + print(check_res.value_counts()) + + +def case_2_2(): + factor_df1, factor_df2 = get_factor_df(0), get_factor_df(1) + + cm = CorrM() + mtrc = cm.extract(factor_df1, factor_df2) + print(mtrc) + + thd = ThresholdD(0.0, reverse=True) + check_res = thd.check(mtrc) + + print(check_res) + print(check_res.value_counts()) + + +if __name__ == "__main__": + case_1_1() + case_1_2() + case_1_3_1_4() + case_2_1() + case_2_2() diff --git a/qlib/data/monitor/__init__.py b/qlib/data/monitor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qlib/data/monitor/detector.py b/qlib/data/monitor/detector.py new file mode 100644 index 000000000..072fbbf86 --- /dev/null +++ b/qlib/data/monitor/detector.py @@ -0,0 +1,103 @@ +import pandas as pd +import abc +from typing import Union + + +class Detector(metaclass=abc.ABCMeta): + """ + Detector is stateful + + The input of detector is Series with shape