Add analyser example and finetune example

add IC and rank IC
monitor initial version
2026-07-21 11:17:34 +08:00 · 2021-06-06 07:51:52 +00:00 · 2021-06-06 07:43:26 +00:00 · 2021-06-06 07:43:26 +00:00
6 changed files with 582 additions and 0 deletions
--- a/examples/data/monitor.py
+++ b/examples/data/monitor.py
@@ -0,0 +1,208 @@
+"""
+This script is the demonstrating the implementation of Metric Extractor and Detector
+
+NOTE: A lot of details is not considered in this script
+- Corner case that will raise error( std == 0)
+
+
+
+The following functions are used to demonstrate the following examples
+
+
+· Metric Extractor:
+	case 1) Basic statistics on different slices of the DataFrame df:
+		1) The statistics include:
+			· STD, Mean, Skewnes, Kurtosis
+		2) The above statistics can be calculated on the following data slices:
+			· df.groupby(['datetime'])
+			· df.groupby(['datetime', 'industry' ])
+                3) The statistics could be calculated on the time dimension for each instruments and factor(the factor can be represented by experssion)
+			· <df implemented by expresion>.groupby(['instrument', 'factor'])
+	case 2) Advanced statistics on different slices of the DataFrame df:
+		1) Auto-correlation:
+			· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
+		2) Correlation between factors:
+			· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :,  j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
+
+· Detector:  detect the abnormality of the extracted metric;
+	a) Algorithms:
+		§ Basic checks:  NaN.
+		§ Point anomaly detection.
+		§ Segment anomaly detection.
+	b) Scenarios:
+		§ Online anomaly detection: monitoring streaming data.
+The usage of the detectors are demonstrated in the `case_1_*`and `case_2_*`
+
+
+case 3): Examples to use MetricExt to monitor IC and rank IC
+        1) IC(Information Coefficient)  #case_3_1
+        2) RankIC   #case_3_2
+"""
+
+# AUTO download data
+from typing import List, Union
+from qlib.utils import exists_qlib_data
+from qlib.tests.data import GetData
+from qlib.config import REG_CN
+
+provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+if not exists_qlib_data(provider_uri):
+    print(f"Qlib data is not found in {provider_uri}")
+    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
+
+import qlib
+import pandas as pd
+from qlib.contrib.data.handler import Alpha158
+from qlib.data.dataset.loader import QlibDataLoader
+from qlib.data.monitor.metric import format_conv
+from qlib.data.monitor.metric import MeanM, SkewM, KurtM, StdM, AutoCM, CorrM
+from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
+from qlib.data import D
+import fire
+
+UNIVERSE = "csi300"
+START_TIME = "20200101"
+
+# ------------------ a helper function to get data to demonstrate the functionality --------------------
+
+
+def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
+    """
+    a helper function to get data to demonstrate the functionality.
+
+    Parameters
+    ----------
+    col_idx : Union[int, List[int]]
+        column index of the metrics
+    """
+    dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
+    df = dh.fetch()
+
+    if verbose:
+        print(df.head())
+
+    # We don't have industries in dataframe, we generate the with fake data
+    industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index)
+
+    # select a factor
+    factor_df = format_conv(df.iloc[:, col_idx], industry=industry)
+    if verbose:
+        print(f"Selected metric: {df.columns[col_idx]}")
+        print(factor_df)
+    return factor_df
+
+
+def get_target(horizon=5):
+    target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"  # There are lots of targets: return is one of them
+    qdl = QlibDataLoader(config=([target], ["target"]))
+    df = qdl.load(instruments=UNIVERSE, start_time=START_TIME)  # Aligning with factor will improve performance
+    df = format_conv(df["target"])
+    return df
+
+
+# -----------------  Cases to demonstrate the usage of detector and examples ----------------------
+
+
+def case_1_1():
+    factor_df = get_data_df()
+    # 1) Extract metrics
+
+    # 1.1) df.groupby(["datetime"])
+    mtrc = MeanM()
+    m_mean = mtrc.extract(factor_df)
+    print(m_mean)
+
+    ndd = NDDetector()
+    ndd.fit(m_mean)  # use historical data to fit detector
+    check_res = ndd.check(m_mean)
+    print(check_res)  #  detecting on new data or historical data
+    print(check_res.value_counts())
+
+
+def case_1_2():
+    factor_df = get_data_df()
+    # 1.2) df.groupby("datetime", "industry")
+    mtrc = MeanM(group=["industry"])
+    m_multi = mtrc.extract(factor_df)
+    print(m_multi)
+
+    for col_name, s in m_multi.iteritems():
+        print(col_name)
+        ndd = NDDetector()
+        ndd.fit(s)  # use historical data to fit detector
+        check_res = ndd.check(s)
+        print(check_res)  #  detecting on new data or historical data
+        print(check_res.value_counts())
+
+
+def case_1_3():
+    # case 1.3
+    # factor_df = get_data_df()
+    qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
+    df = qdl.load(instruments=["SH600519"], start_time=START_TIME)
+    df = format_conv(df)
+    s = df.iloc[:, 0]
+    print(s)
+    dtc = SWNDD(window=20)
+    dtc.fit(s)  # fit use historical data (TODO: updating will be supported in the future)
+    check_res = dtc.check(s)  #
+    print(check_res)
+    print(check_res.value_counts())
+    print(check_res[check_res])
+
+
+def case_2_1():
+    # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
+    factor_df = get_data_df()
+    acm = AutoCM()
+    mtrc = acm.extract(factor_df)
+    print(mtrc)
+
+    thd = ThresholdD(0.0, reverse=True)
+    check_res = thd.check(mtrc)
+
+    print(check_res)
+    print(check_res.value_counts())
+
+
+def case_2_2():
+    factor_df1, factor_df2 = get_data_df(0), get_data_df(1)
+
+    cm = CorrM()
+    mtrc = cm.extract(factor_df1, factor_df2)
+    print(mtrc)
+
+    thd = ThresholdD(0.0, reverse=True)
+    check_res = thd.check(mtrc)
+
+    print(check_res)
+    print(check_res.value_counts())
+
+
+def case_3_1_3_2():
+    target, factor = get_target(), get_data_df(0)
+    ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman")
+    ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target)
+    print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))
+
+
+def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]):
+    """
+    run the specific tests
+
+    python monitor.py case_3_1_3_2
+
+    Parameters
+    ----------
+    test_list :  str[]
+        The tests to run
+    """
+    if isinstance(test_list, str):
+        test_list = [test_list]
+    for fn in test_list:
+        globals()[fn]()
+
+
+if __name__ == "__main__":
+    qlib.init()
+    fire.Fire(run)
--- a/examples/data/monitor_analyser_demo.ipynb
+++ b/examples/data/monitor_analyser_demo.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e62a81e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from tqdm.auto import tqdm\n",
+    "%matplotlib inline\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c503217b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.monitor.analyser import Analyser\n",
+    "import qlib\n",
+    "qlib.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c276470",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class SimpleDFA(Analyser):\n",
+    "    \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n",
+    "    def analyse(self, data: pd.DataFrame, *args, **kwargs):\n",
+    "        data.plot(*args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110262e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from monitor import get_data_df, AutoCM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ea38c62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get data\n",
+    "factor_df = get_data_df([1], verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbded6fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metric extractor\n",
+    "acm = AutoCM()\n",
+    "mtrc = acm.extract(factor_df)\n",
+    "print(mtrc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65517c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyser\n",
+    "sa = SimpleDFA()\n",
+    "sa.analyse(mtrc, title='Auto Correlation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dab6fb2e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/qlib/data/monitor/init.py
+++ b/qlib/data/monitor/init.py
--- a/qlib/data/monitor/analyser.py
+++ b/qlib/data/monitor/analyser.py
@@ -0,0 +1,14 @@
+from abc import abstractmethod
+
+
+class Analyser:
+    """
+    Analyser is supposed to process the output MetricExt and produce a analysis result
+    - The results could be a report or plot.
+
+    We suppose the Analyser doesn't need much computing resource (The heavy computation should be done in MetricExt)
+    """
+
+    @abstractmethod
+    def analyse(self, *args, **kwargs):
+        ...
--- a/qlib/data/monitor/detector.py
+++ b/qlib/data/monitor/detector.py
@@ -0,0 +1,103 @@
+import pandas as pd
+import abc
+from typing import Union
+
+
+class Detector(metaclass=abc.ABCMeta):
+    """
+    Detector is stateful
+
+    The input of detector is  Series with shape <time> or Dataframe with shape <time, factor>
+    """
+
+    @abc.abstractmethod
+    def check(self, df: Union[pd.Series, pd.DataFrame]) -> pd.Series:
+        """
+        Check the result of values
+
+        Parameters
+        ----------
+        df : Union[pd.Series, pd.DataFrame]
+            Data to be checked
+
+        Returns
+        -------
+        pd.Series:
+            True: Abnormalities detected.
+            False: No abnormality detected.
+        """
+        pass
+
+    def fit(self, df: pd.DataFrame):
+        raise NotImplementedError(f"This type of input is not supported")
+
+    def update(self, df: pd.DataFrame):
+        """
+        The state of detector can be updated gradually.
+        """
+        raise NotImplementedError(f"This type of input is not supported")
+
+
+class NDDetector(Detector):
+    """
+    Normal Distribution Detector
+    This will be used more in offline detector
+    """
+
+    def __init__(self, n=3):
+        """
+        The detection range:
+        - mean ± n * std
+        """
+        self.n = n
+
+    def fit(self, s: pd.Series):
+        self.mean = s.mean()
+        self.std = s.std()
+
+    def check(self, s: pd.Series):
+        return ~s.between(self.mean - self.std * self.n, self.mean + self.std * self.n)
+
+
+class SWNDD(Detector):
+    """(S)liding (W)indow (N)ormal (D)istribition (D)etector"""
+
+    def __init__(self, n=3, **rolling_args):
+        self.rolling_args = rolling_args
+        self.n = n
+
+    def fit(self, s: pd.Series):
+        # TODO: pd.Dataframe is not supported now
+        self.mean = s.rolling(**self.rolling_args).mean()
+        self.std = s.rolling(**self.rolling_args).std()
+
+    def check(self, s: pd.Series):
+        res = ~s.between(self.mean - self.std * self.n, self.mean + self.std * self.n)
+        res = res & ~self.mean.isna() & ~self.std.isna()
+
+        return res.reindex(s.index)
+
+
+# TODO: daily normalization detector.
+
+
+class CountD(Detector):
+    """
+    Count detector
+    """
+
+    # TODO: check if the number of counts is enough
+    # TODO: This is a instance of Count Detector.
+
+
+class ThresholdD(Detector):
+    """
+    Threshold (D)etector
+    """
+
+    def __init__(self, threshold, reverse=False):
+        self.threshold = threshold
+        self.reverse = reverse
+
+    def check(self, s: pd.Series):
+        return (s < self.threshold) if self.reverse else (s > self.threshold)
--- a/qlib/data/monitor/metric.py
+++ b/qlib/data/monitor/metric.py
@@ -0,0 +1,127 @@
+import pandas as pd
+from typing import Union
+from abc import abstractmethod, ABCMeta
+
+
+def format_conv(df: pd.Series, **col_group):
+    # TODO: col_group
+    df = df.copy()  # performance problems here
+    if len(col_group) > 0:
+        col_group_df = pd.DataFrame({name: group.reindex(df.index) for name, group in col_group.items()})
+        col_group_df = col_group_df.reindex(df.index)
+        # merge all the groups into df.index
+        col_group_df = col_group_df.set_index(keys=col_group_df.columns.to_list(), append=True)
+        df.index = col_group_df.index
+
+    ustk_cols = [col for col in df.index.names if col != "datetime"]
+    return df.unstack(ustk_cols)
+
+
+class MetricExt(metaclass=ABCMeta):
+    """Metric Extractor
+    Current design.
+    The input data are assumed like qlib format
+    The extracted information like time-series. Column could be multiple index
+    """
+
+    @abstractmethod
+    def extract(self, df: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+        pass
+
+
+# overall metrics
+class AggMetrics(MetricExt):
+    """
+    TODO: this metric assumes that the daily assumptions(The operation is used on each row)
+    """
+
+    def __init__(self, group=None):
+        if isinstance(group, str):
+            group = [group]
+        self.group = group
+
+    def extract(self, df: pd.DataFrame) -> pd.Series:
+        if self.group is None:
+            return df.apply(self.agg, axis=1)
+        else:
+            return df.groupby(self.group, axis=1).apply(self.agg, axis=1)
+
+    @abstractmethod
+    def agg(self, *args, **kwargs):
+        pass
+
+
+class StdM(AggMetrics):
+    def agg(self, s, *args, **kwargs):
+        return s.std(*args, **kwargs)
+
+
+class MeanM(AggMetrics):
+    def agg(self, s, *args, **kwargs):
+        return s.mean(*args, **kwargs)
+
+
+class SkewM(AggMetrics):
+    def agg(self, s, *args, **kwargs):
+        return s.skew(*args, **kwargs)
+
+
+class KurtM(AggMetrics):
+    def agg(self, s, *args, **kwargs):
+        return pd.DataFrame.kurt(s, *args, **kwargs)
+
+
+# sliding window metrics
+class SWMetrics(MetricExt):
+    """
+    (S)liding (W)indow Metrics
+
+    TODO: testing this class
+    """
+
+    def __init__(self, **rolling_args):
+        self.rolling_args = rolling_args
+
+    def extract(self, df: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
+        if isinstance(pd.Series):
+            return df.rolling(**self.rolling_args).apply(self.agg)
+        elif isinstance(pd.DataFrame):
+            return df.rolling(**self.rolling_args).apply(self.agg, axis=0)
+        else:
+            raise NotImplementedError(f"This type of input is not supported")
+
+    @abstractmethod
+    def agg(self, *args, **kwargs):
+        pass
+
+
+## TODO: more metrics is ignored: mean, std, skew, kurt
+
+
+def calc_corr(df1: pd.DataFrame, df2: pd.DataFrame, mode):
+    corr = {}
+    for (t1, s1), (t2, s2) in zip(df1.iterrows(), df2.iterrows()):
+        assert t1 == t2
+        corr[t1] = s1.corr(s2, method=mode)
+    return pd.Series(corr)
+
+
+class AutoCM(MetricExt):
+    """(A)uto (C)orrelation (M)etrics"""
+
+    def __init__(self, mode="pearson", shift=1):
+        self.mode = mode
+        self.shift = shift
+
+    def extract(self, df: pd.DataFrame):
+        return calc_corr(df, df.shift(self.shift), self.mode)
+
+
+class CorrM(MetricExt):
+    """correlation extractor"""
+
+    def __init__(self, mode="pearson"):
+        self.mode = mode
+
+    def extract(self, df1: pd.DataFrame, df2: pd.DataFrame):
+        return calc_corr(df1, df2, self.mode)
Author	SHA1	Message	Date
Young	77ba7b4e91	Add analyser example and finetune example	2021-06-06 07:51:52 +00:00
Young	7a639eeea7	add IC and rank IC	2021-06-06 07:43:26 +00:00
Young	cddaf90ef5	monitor initial version	2021-06-06 07:43:26 +00:00