From 77ba7b4e91e5a73ef3d87e3b3b741cd757c41d0e Mon Sep 17 00:00:00 2001
From: Young <afe.young@gmail.com>
Date: Thu, 22 Apr 2021 05:54:43 +0000
Subject: [PATCH] Add analyser example and finetune example

---
 examples/data/monitor.py                  | 112 ++++++++++---------
 examples/data/monitor_analyser_demo.ipynb | 130 ++++++++++++++++++++++
 qlib/data/monitor/analyser.py             |  14 +++
 qlib/data/monitor/metric.py               |   2 +-
 4 files changed, 207 insertions(+), 51 deletions(-)
 create mode 100644 examples/data/monitor_analyser_demo.ipynb
 create mode 100644 qlib/data/monitor/analyser.py
diff --git a/examples/data/monitor.py b/examples/data/monitor.py
index f2f9ea572..21fa5d447 100644
--- a/examples/data/monitor.py
+++ b/examples/data/monitor.py
@@ -1,46 +1,46 @@
 """
-This script is the demonstrating the implementation of following requirements.
-
+This script is the demonstrating the implementation of Metric Extractor and Detector
 
 NOTE: A lot of details is not considered in this script
 - Corner case that will raise error( std == 0)
 
 
-· Transformer:
-	1) Basic statistics on different slices of the DataFrame df:
-		§ The statistics include:
+
+The following functions are used to demonstrate the following examples
+
+
+· Metric Extractor:
+	case 1) Basic statistics on different slices of the DataFrame df:
+		1) The statistics include:
 			· STD, Mean, Skewnes, Kurtosis
-		§ The above statistics can be calculated on the following data slices:
+		2) The above statistics can be calculated on the following data slices:
 			· df.groupby(['datetime'])
 			· df.groupby(['datetime', 'industry' ])
-			· df.groupby(['instrument', 'factor'])
-			· df.apply("<expresion>").groupby([..]), in which [..] could be any one of the above slicing rules.
-	2) Advanced statistics on different slices of the DataFrame df:
-		§ Auto-correlation:
+                3) The statistics could be calculated on the time dimension for each instruments and factor(the factor can be represented by experssion)
+			· <df implemented by expresion>.groupby(['instrument', 'factor'])
+	case 2) Advanced statistics on different slices of the DataFrame df:
+		1) Auto-correlation:
 			· Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
-		§ Correlation between factors:
+		2) Correlation between factors:
 			· For any pair of factors (i, j): calculate corr(df.loc[t, :, i], df.loc[t, :,  j]). The result is a correlation matrix with each element corresponds to a correlation value between a pair of factors.
-		§ The data slices are the same as those in 1).
-· Monitor:
-	1) Algorithms:
+
+· Detector:  detect the abnormality of the extracted metric;
+	a) Algorithms:
 		§ Basic checks:  NaN.
 		§ Point anomaly detection.
 		§ Segment anomaly detection.
-	2) Scenarios:
+	b) Scenarios:
 		§ Online anomaly detection: monitoring streaming data.
-Offline anomaly detection: verifying whole historical data.
+The usage of the detectors are demonstrated in the `case_1_*`and `case_2_*`
 
 
-2021-2-19:
-
-Effectiveness metrics
-- Standard metrics:
-    - [X] IC(Information Coefficient)  #case_3_1
-    - [ ] IR(Information Ratio): Informatio Ratio is related to backest
-    - [X] RankIC   #case_3_3
+case 3): Examples to use MetricExt to monitor IC and rank IC
+        1) IC(Information Coefficient)  #case_3_1
+        2) RankIC   #case_3_2
 """
 
 # AUTO download data
+from typing import List, Union
 from qlib.utils import exists_qlib_data
 from qlib.tests.data import GetData
 from qlib.config import REG_CN
@@ -51,8 +51,6 @@ if not exists_qlib_data(provider_uri):
     GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
 
 import qlib
-
-qlib.init()
 import pandas as pd
 from qlib.contrib.data.handler import Alpha158
 from qlib.data.dataset.loader import QlibDataLoader
@@ -62,30 +60,51 @@ from qlib.data.monitor.detector import NDDetector, SWNDD, ThresholdD
 from qlib.data import D
 import fire
 
-
 UNIVERSE = "csi300"
 START_TIME = "20200101"
 
+# ------------------ a helper function to get data to demonstrate the functionality --------------------
 
-def get_factor_df(col_idx=0):
+
+def get_data_df(col_idx: Union[int, List[int]] = 0, verbose: bool = True):
+    """
+    a helper function to get data to demonstrate the functionality.
+
+    Parameters
+    ----------
+    col_idx : Union[int, List[int]]
+        column index of the metrics
+    """
     dh = Alpha158(instruments=UNIVERSE, infer_processors=[], learn_processors=[], start_time=START_TIME)
     df = dh.fetch()
 
-    print(df.head())
+    if verbose:
+        print(df.head())
 
     # We don't have industries in dataframe, we generate the with fake data
     industry = pd.Series(df.index.get_level_values("instrument").str.slice(stop=2).to_list(), index=df.index)
 
     # select a factor
     factor_df = format_conv(df.iloc[:, col_idx], industry=industry)
-    print(f"Selected metric: {df.columns[col_idx]}")
-
-    print(factor_df)
+    if verbose:
+        print(f"Selected metric: {df.columns[col_idx]}")
+        print(factor_df)
     return factor_df
 
 
+def get_target(horizon=5):
+    target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"  # There are lots of targets: return is one of them
+    qdl = QlibDataLoader(config=([target], ["target"]))
+    df = qdl.load(instruments=UNIVERSE, start_time=START_TIME)  # Aligning with factor will improve performance
+    df = format_conv(df["target"])
+    return df
+
+
+# -----------------  Cases to demonstrate the usage of detector and examples ----------------------
+
+
 def case_1_1():
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
     # 1) Extract metrics
 
     # 1.1) df.groupby(["datetime"])
@@ -101,7 +120,7 @@ def case_1_1():
 
 
 def case_1_2():
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
     # 1.2) df.groupby("datetime", "industry")
     mtrc = MeanM(group=["industry"])
     m_multi = mtrc.extract(factor_df)
@@ -116,9 +135,9 @@ def case_1_2():
         print(check_res.value_counts())
 
 
-def case_1_3_1_4():
-    # case 1.3 and case 1.4
-    # factor_df = get_factor_df()
+def case_1_3():
+    # case 1.3
+    # factor_df = get_data_df()
     qdl = QlibDataLoader(config=(["$close/Ref($close, 1) - 1"], ["return"]))
     df = qdl.load(instruments=["SH600519"], start_time=START_TIME)
     df = format_conv(df)
@@ -134,7 +153,7 @@ def case_1_3_1_4():
 
 def case_2_1():
     # · Calculate corr(df.loc[t, :, :], df.loc[t-w, :, :]), w=1, 2, ….
-    factor_df = get_factor_df()
+    factor_df = get_data_df()
     acm = AutoCM()
     mtrc = acm.extract(factor_df)
     print(mtrc)
@@ -147,7 +166,7 @@ def case_2_1():
 
 
 def case_2_2():
-    factor_df1, factor_df2 = get_factor_df(0), get_factor_df(1)
+    factor_df1, factor_df2 = get_data_df(0), get_data_df(1)
 
     cm = CorrM()
     mtrc = cm.extract(factor_df1, factor_df2)
@@ -160,26 +179,18 @@ def case_2_2():
     print(check_res.value_counts())
 
 
-def get_target(horizon=5):
-    target = f"Ref($close, -{horizon + 1})/Ref($close, -1) - 1"  # There are lots of targets: return is one of them
-    qdl = QlibDataLoader(config=([target], ["target"]))
-    df = qdl.load(instruments=UNIVERSE, start_time=START_TIME)  # Aligning with factor will improve performance
-    df = format_conv(df["target"])
-    return df
-
-
-def case_3_1_3_3():
-    target, factor = get_target(), get_factor_df(0)
+def case_3_1_3_2():
+    target, factor = get_target(), get_data_df(0)
     ic_m, rank_ic_m = CorrM(), CorrM(mode="spearman")
     ic, rank_ic = ic_m.extract(factor, target), rank_ic_m.extract(factor, target)
     print(pd.DataFrame({"ic": ic, "rank_ic": rank_ic}))
 
 
-def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2", "case_3_1_3_3"]):
+def run(test_list=["case_1_1", "case_1_2", "case_1_3", "case_2_1", "case_2_2", "case_3_1_3_2"]):
     """
     run the specific tests
 
-    python monitor.py case_3_1_3_3
+    python monitor.py case_3_1_3_2
 
     Parameters
     ----------
@@ -193,4 +204,5 @@ def run(test_list=["case_1_1", "case_1_2", "case_1_3_1_4", "case_2_1", "case_2_2
 
 
 if __name__ == "__main__":
+    qlib.init()
     fire.Fire(run)
diff --git a/examples/data/monitor_analyser_demo.ipynb b/examples/data/monitor_analyser_demo.ipynb
new file mode 100644
index 000000000..22dd11cec
--- /dev/null
+++ b/examples/data/monitor_analyser_demo.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e62a81e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from tqdm.auto import tqdm\n",
+    "%matplotlib inline\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c503217b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from qlib.data.monitor.analyser import Analyser\n",
+    "import qlib\n",
+    "qlib.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c276470",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class SimpleDFA(Analyser):\n",
+    "    \"\"\"Simple (D)ata(F)rame (A)nalyser\"\"\"\n",
+    "    def analyse(self, data: pd.DataFrame, *args, **kwargs):\n",
+    "        data.plot(*args, **kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110262e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from monitor import get_data_df, AutoCM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ea38c62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get data\n",
+    "factor_df = get_data_df([1], verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbded6fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metric extractor\n",
+    "acm = AutoCM()\n",
+    "mtrc = acm.extract(factor_df)\n",
+    "print(mtrc)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65517c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyser\n",
+    "sa = SimpleDFA()\n",
+    "sa.analyse(mtrc, title='Auto Correlation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dab6fb2e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/qlib/data/monitor/analyser.py b/qlib/data/monitor/analyser.py
new file mode 100644
index 000000000..e2c6fd453
--- /dev/null
+++ b/qlib/data/monitor/analyser.py
@@ -0,0 +1,14 @@
+from abc import abstractmethod
+
+
+class Analyser:
+    """
+    Analyser is supposed to process the output MetricExt and produce a analysis result
+    - The results could be a report or plot.
+
+    We suppose the Analyser doesn't need much computing resource (The heavy computation should be done in MetricExt)
+    """
+
+    @abstractmethod
+    def analyse(self, *args, **kwargs):
+        ...
diff --git a/qlib/data/monitor/metric.py b/qlib/data/monitor/metric.py
index 15538b73f..db531e075 100644
--- a/qlib/data/monitor/metric.py
+++ b/qlib/data/monitor/metric.py
@@ -118,7 +118,7 @@ class AutoCM(MetricExt):
 
 
 class CorrM(MetricExt):
-    """correlation extractor """
+    """correlation extractor"""
 
     def __init__(self, mode="pearson"):
         self.mode = mode