Merge pull request #290 from you-n-g/online_srv

init version of online serving and rolling
2026-07-04 11:30:57 +08:00 · 2021-05-17 17:35:29 +08:00
parent 142a9dca3c 8c3a08b18d
commit d08146c30f
39 changed files with 3903 additions and 125 deletions
--- a/qlib/data/dataset/handler.py
+++ b/qlib/data/dataset/handler.py
@@ -7,7 +7,7 @@ import bisect
 import logging
 import warnings
 from inspect import getfullargspec
-from typing import Union, Tuple, List, Iterator, Optional
+from typing import Callable, Union, Tuple, List, Iterator, Optional

 import pandas as pd
 import numpy as np
@@ -36,7 +36,7 @@ class DataHandler(Serializable):
    The data handler try to maintain a handler with 2 level.
    `datetime` & `instruments`.

-    Any order of the index level can be suported (The order will be implied in the data).
+    Any order of the index level can be supported (The order will be implied in the data).
    The order  <`datetime`, `instruments`> will be used when the dataframe index name is missed.

    Example of the data:
@@ -51,6 +51,9 @@ class DataHandler(Serializable):
                   SH600004    13.313329  11800983.0       13.313329        13.317701    0.183632  0.0042
                   SH600005    37.796539  12231662.0       38.258602        37.919757    0.970325  0.0289

+
+    Tips for improving the performance of datahandler
+    - Fetching data with `col_set=CS_RAW` will return the raw data and may prevent pandas from copying the data when calling `loc`
    """

    def __init__(
@@ -74,7 +77,7 @@ class DataHandler(Serializable):
        data_loader : Union[dict, str, DataLoader]
            data loader to load the data.
        init_data :
-            intialize the original data in the constructor.
+            initialize the original data in the constructor.
        fetch_orig : bool
            Return the original data instead of copy if possible.
        """
@@ -125,7 +128,7 @@ class DataHandler(Serializable):

    def setup_data(self, enable_cache: bool = False):
        """
-        Set Up the data in case of running intialization for multiple time
+        Set Up the data in case of running initialization for multiple times

        It is responsible for maintaining following variable
        1) self._data
@@ -163,6 +166,7 @@ class DataHandler(Serializable):
        level: Union[str, int] = "datetime",
        col_set: Union[str, List[str]] = CS_ALL,
        squeeze: bool = False,
+        proc_func: Callable = None,
    ) -> pd.DataFrame:
        """
        fetch data from underlying data source
@@ -185,6 +189,14 @@ class DataHandler(Serializable):
            - if isinstance(col_set, List[str]):

                select several sets of meaningful columns, the returned data has multiple levels
+        proc_func: Callable
+            - Give a hook for processing data before fetching
+            - An example to explain the necessity of the hook:
+                - A Dataset learned some processors to process data which is related to data segmentation
+                - It will apply them every time when preparing data.
+                - The learned processors require the dataframe to keep the same format when fitting and applying
+                - However, the data format will change according to the parameters.
+                - So the processors should be applied to the underlying data.

        squeeze : bool
            whether squeeze columns and index
@@ -193,8 +205,15 @@ class DataHandler(Serializable):
        -------
        pd.DataFrame.
        """
+        if proc_func is None:
+            df = self._data
+        else:
+            # FIXME: fetching by time first will be more friendly to `proc_func`
+            # Copy in case of `proc_func` changing the data inplace....
+            df = proc_func(fetch_df_by_index(self._data, selector, level, fetch_orig=self.fetch_orig).copy())
+
        # Fetch column  first will be more friendly to SepDataFrame
-        df = self._fetch_df_by_col(self._data, col_set)
+        df = self._fetch_df_by_col(df, col_set)
        df = fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig)
        if squeeze:
            # squeeze columns
@@ -261,6 +280,10 @@ class DataHandler(Serializable):
 class DataHandlerLP(DataHandler):
    """
    DataHandler with **(L)earnable (P)rocessor**
+
+    Tips for improving the performance of the data handler
+    - To reduce the memory cost
+        - `drop_raw=True`: this will modify the data inplace on raw data;
    """

    # data key
@@ -430,7 +453,7 @@ class DataHandlerLP(DataHandler):

    def setup_data(self, init_type: str = IT_FIT_SEQ, **kwargs):
        """
-        Set up the data in case of running intialization for multiple time
+        Set up the data in case of running initialization for multiple times

        Parameters
        ----------
@@ -474,6 +497,7 @@ class DataHandlerLP(DataHandler):
        level: Union[str, int] = "datetime",
        col_set=DataHandler.CS_ALL,
        data_key: str = DK_I,
+        proc_func: Callable = None,
    ) -> pd.DataFrame:
        """
        fetch data from underlying data source
@@ -488,12 +512,18 @@ class DataHandlerLP(DataHandler):
            select a set of meaningful columns.(e.g. features, columns).
        data_key : str
            the data to fetch:  DK_*.
+        proc_func: Callable
+            please refer to the doc of DataHandler.fetch

        Returns
        -------
        pd.DataFrame:
        """
        df = self._get_df_by_key(data_key)
+        if proc_func is not None:
+            # FIXME: fetch by time first will be more friendly to proc_func
+            # Copy in case of `proc_func` changing the data inplace....
+            df = proc_func(fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig).copy())
        # Fetch column  first will be more friendly to SepDataFrame
        df = self._fetch_df_by_col(df, col_set)
        return fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig)