Add analyser example and finetune example

2026-07-21 11:17:34 +08:00 · 2021-04-22 05:54:43 +00:00
parent 7a639eeea7
commit 77ba7b4e91
4 changed files with 207 additions and 51 deletions
--- a/examples/data/monitor.py
+++ b/examples/data/monitor.py
@@ -1,46 +1,46 @@
 """
-This script is the demonstrating the implementation of following requirements.
-
+This script demonstrates the implementation of Metric Extractor and Detector.

 NOTE: A lot of details is not considered in this script
 - Corner case that will raise error( std == 0)


-· Transformer:
-	1) Basic statistics on different slices of the DataFrame df:
-		§ The statistics include:
+
+The functions below are used to demonstrate the following examples:
+
+
+· Metric Extractor:
+	case 1) Basic statistics on different slices of the DataFrame df:
+		1) The statistics include:
 			· STD, Mean, Skewnes, Kurtosis
-		§ The above statistics can be calculated on the following data slices:
+		2) The above statistics can be calculated on the following data slices:
 			· df.groupby(['datetime'])
 			· df.groupby(['datetime', 'industry' ])
-			· df.groupby(['instrument', 'factor'])
-			· df.apply("<expresion>").groupby([..]), in which [..] could be any one of the above slicing rules.
-	2) Advanced statistics on different slices of the DataFrame df:
-		§ Auto-correlation:
+                3) The statistics could be calculated on the time dimension for each instrument and factor (the factor can be represented by an expression)
+			· <df implemented by expression>.groupby(['instrument', 'factor'])
+	case 2) Advanced statistics on different slices of the DataFrame df:
+		1) Auto-correlation:
 			· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
-		§ Correlation between factors:
+		2) Correlation between factors:
 			· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :,  j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
-		§ The data slices are the same as those in 1).
-· Monitor:
-	1) Algorithms:
+
+· Detector:  detect the abnormality of the extracted metric;
+	a) Algorithms:
 		§ Basic checks:  NaN.
 		§ Point anomaly detection.
 		§ Segment anomaly detection.
-	2) Scenarios:
+	b) Scenarios:
 		§ Online anomaly detection: monitoring streaming data.
-Offline anomaly detection: verifying whole historical data.
+The usage of the detectors is demonstrated in the `case_1_*` and `case_2_*` functions.


-2021-2-19:
-
-Effectiveness metrics
- Standard metrics:
-    - [X] IC(Information Coefficient)  #case_3_1
-    - [ ] IR(Information Ratio): Informatio Ratio is related to backest
-    - [X] RankIC   #case_3_3
+case 3): Examples to use MetricExt to monitor IC and rank IC
+        1) IC(Information Coefficient)  #case_3_1
+        2) RankIC   #case_3_2
 """

 # AUTO download data
+from typing import List, Union
 from qlib.utils import exists_qlib_data
 from qlib.tests.data import GetData
 from qlib.config import REG_CN
@@ -51,8 +51,6 @@ if not exists_qlib_data(provider_uri):
    GetData().qlib_data(target_dir=provider_uri, region=REG_CN)

 import qlib
-
-qlib.init()
 import pandas as pd
 from qlib.contrib.data.handler import Alpha158
 from qlib.data.dataset.loader import QlibDataLoader
@@ -62,30 +60,51 @@ from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
 from qlib.data import D
 import fire

-
 UNIVERSE = "csi300"
 START_TIME = "20200101"

+# ------------------ a helper function to get data to demonstrate the functionality --------------------

-def get_factor_df(col_idx=0):
+
+def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
+    """
+    a helper function to get data to demonstrate the functionality.
+
+    Parameters
+    ----------
+    col_idx : Union[int, List[int]]
+        column index of the metrics
+    """
    dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
    df = dh.fetch()

-    print(df.head())
+    if verbose:
+        print(df.head())

    # We don't have industries in dataframe, we generate the with fake data
    industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index)

    # select a factor
    factor_df = format_conv(df.iloc[:, col_idx], industry=industry)
-    print(f"Selected metric: {df.columns[col_idx]}")
-
-    print(factor_df)
+    if verbose:
+        print(f"Selected metric: {df.columns[col_idx]}")
+        print(factor_df)
    return factor_df


+def get_target(horizon=5):
+    target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"  # There are lots of targets: return is one of them
+    qdl = QlibDataLoader(config=([target], ["target"]))
+    df = qdl.load(instruments=UNIVERSE, start_time=START_TIME)  # Aligning with factor will improve performance
+    df = format_conv(df["target"])
+    return df
+
+
+# -----------------  Cases to demonstrate the usage of detector and examples ----------------------
+
+
 def case_1_1():
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
    # 1) Extract metrics

    # 1.1) df.groupby(["datetime"])
@@ -101,7 +120,7 @@ def case_1_1():


 def case_1_2():
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
    # 1.2) df.groupby("datetime", "industry")
    mtrc = MeanM(group=["industry"])
    m_multi = mtrc.extract(factor_df)
@@ -116,9 +135,9 @@ def case_1_2():
        print(check_res.value_counts())


-def case_1_3_1_4():
-    # case 1.3 and case 1.4
-    # factor_df = get_factor_df()
+def case_1_3():
+    # case 1.3
+    # factor_df = get_data_df()
    qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
    df = qdl.load(instruments=["SH600519"], start_time=START_TIME)
    df = format_conv(df)
@@ -134,7 +153,7 @@ def case_1_3_1_4():

 def case_2_1():
    # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
    acm = AutoCM()
    mtrc = acm.extract(factor_df)
    print(mtrc)
@@ -147,7 +166,7 @@ def case_2_1():


 def case_2_2():
-    factor_df1, factor_df2 = get_factor_df(0), get_factor_df(1)
+    factor_df1, factor_df2 = get_data_df(0), get_data_df(1)

    cm = CorrM()
    mtrc = cm.extract(factor_df1, factor_df2)
@@ -160,26 +179,18 @@ def case_2_2():
    print(check_res.value_counts())


-def get_target(horizon=5):
-    target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"  # There are lots of targets: return is one of them
-    qdl = QlibDataLoader(config=([target], ["target"]))
-    df = qdl.load(instruments=UNIVERSE, start_time=START_TIME)  # Aligning with factor will improve performance
-    df = format_conv(df["target"])
-    return df
-
-
-def case_3_1_3_3():
-    target, factor = get_target(), get_factor_df(0)
+def case_3_1_3_2():
+    target, factor = get_target(), get_data_df(0)
    ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman")
    ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target)
    print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))


-def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2", "case_3_1_3_3"]):
+def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]):
    """
    run the specific tests

-    python monitor.py case_3_1_3_3
+    python monitor.py case_3_1_3_2

    Parameters
    ----------
@@ -193,4 +204,5 @@ def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2


 if __name__ == "__main__":
+    qlib.init()
    fire.Fire(run)
--- a/examples/data/monitor_analyser_demo.ipynb
+++ b/examples/data/monitor_analyser_demo.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e62a81e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from tqdm.auto import tqdm\n",
+    "%matplotlib inline\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c503217b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.monitor.analyser import Analyser\n",
+    "import qlib\n",
+    "qlib.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c276470",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class SimpleDFA(Analyser):\n",
+    "    \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n",
+    "    def analyse(self, data: pd.DataFrame, *args, **kwargs):\n",
+    "        data.plot(*args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110262e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from monitor import get_data_df, AutoCM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ea38c62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get data\n",
+    "factor_df = get_data_df([1], verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbded6fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metric extractor\n",
+    "acm = AutoCM()\n",
+    "mtrc = acm.extract(factor_df)\n",
+    "print(mtrc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65517c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyser\n",
+    "sa = SimpleDFA()\n",
+    "sa.analyse(mtrc, title='Auto Correlation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dab6fb2e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/qlib/data/monitor/analyser.py
+++ b/qlib/data/monitor/analyser.py
@@ -0,0 +1,14 @@
+from abc import abstractmethod
+
+
+class Analyser:
+    """
+    Analyser is supposed to process the output of MetricExt and produce an analysis result
+    - The results could be a report or plot.
+
+    We suppose the Analyser doesn't need much computing resource (The heavy computation should be done in MetricExt)
+    """
+
+    @abstractmethod
+    def analyse(self, *args, **kwargs):
+        ...
--- a/qlib/data/monitor/metric.py
+++ b/qlib/data/monitor/metric.py
@@ -118,7 +118,7 @@ class AutoCM(MetricExt):


 class CorrM(MetricExt):
-    """correlation extractor """
+    """correlation extractor"""

    def __init__(self, mode="pearson"):
        self.mode = mode