Merge pull request #290 from you-n-g/online_srv

init version of online serving and rolling
2026-07-04 11:30:57 +08:00 · 2021-05-17 17:35:29 +08:00
parent 142a9dca3c 8c3a08b18d
commit d08146c30f
39 changed files with 3903 additions and 125 deletions
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -27,7 +27,7 @@ class Dataset(Serializable):
        - setup data
            - The data related attributes' names should start with '_' so that it will not be saved on disk when serializing.

-        The data could specify the info to caculate the essential data for preparation
+        The data could specify the info to calculate the essential data for preparation
        """
        self.setup_data(**kwargs)
        super().__init__()
@@ -92,7 +92,7 @@ class DatasetH(Dataset):
        handler : Union[dict, DataHandler]
            handler could be:

-            - insntance of `DataHandler`
+            - instance of `DataHandler`

            - config of `DataHandler`.  Please refer to `DataHandler`

@@ -112,8 +112,9 @@ class DatasetH(Dataset):
                        'outsample': ("2017-01-01", "2020-08-01",),
                    }
        """
-        self.handler = init_instance_by_config(handler, accept_types=DataHandler)
+        self.handler: DataHandler = init_instance_by_config(handler, accept_types=DataHandler)
        self.segments = segments.copy()
+        self.fetch_kwargs = {}
        super().__init__(**kwargs)

    def config(self, handler_kwargs: dict = None, **kwargs):
@@ -123,7 +124,7 @@ class DatasetH(Dataset):
        Parameters
        ----------
        handler_kwargs : dict
-            Config of DataHanlder, which could include the following arguments:
+            Config of DataHandler, which could include the following arguments:

            - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'.

@@ -147,11 +148,11 @@ class DatasetH(Dataset):
        Parameters
        ----------
        handler_kwargs : dict
-            init arguments of DataHanlder, which could include the following arguments:
+            init arguments of DataHandler, which could include the following arguments:

            - init_type : Init Type of Handler

-            - enable_cache : wheter to enable cache
+            - enable_cache : whether to enable cache

        """
        super().setup_data(**kwargs)
@@ -171,7 +172,10 @@ class DatasetH(Dataset):
        ----------
        slc : slice
        """
-        return self.handler.fetch(slc, **kwargs)
+        if hasattr(self, "fetch_kwargs"):
+            return self.handler.fetch(slc, **kwargs, **self.fetch_kwargs)
+        else:
+            return self.handler.fetch(slc, **kwargs)

    def prepare(
        self,
@@ -199,6 +203,12 @@ class DatasetH(Dataset):
            The data to fetch:  DK_*
            Default is DK_I, which indicate fetching data for **inference**.

+        kwargs :
+            The parameters that kwargs may contain:
+                flt_col : str
+                    It only exists in TSDatasetH; it can be used to add a column of data (True or False) to filter data.
+                    This parameter is only supported when the dataset is an instance of TSDatasetH.
+
        Returns
        -------
        Union[List[pd.DataFrame], pd.DataFrame]:
@@ -231,7 +241,7 @@ class TSDataSampler:
    (T)ime-(S)eries DataSampler
    This is the result of TSDatasetH

-    It works like `torch.data.utils.Dataset`, it provides a very convient interface for constructing time-series
+    It works like `torch.data.utils.Dataset`, it provides a very convenient interface for constructing time-series
    dataset based on tabular data.

    If user have further requirements for processing data, user could process them based on `TSDataSampler` or create
@@ -243,7 +253,9 @@ class TSDataSampler:

    """

-    def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none"):
+    def __init__(
+        self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None, flt_data=None
+    ):
        """
        Build a dataset which looks like torch.data.utils.Dataset.

@@ -265,6 +277,11 @@ class TSDataSampler:
                ffill with previous sample
            ffill+bfill:
                ffill with previous samples first and fill with later samples second
+        flt_data : pd.Series
+            a column of data(True or False) to filter data.
+            None:
+                keep all data
+
        """
        self.start = start
        self.end = end
@@ -272,23 +289,51 @@ class TSDataSampler:
        self.fillna_type = fillna_type
        assert get_level_index(data, "datetime") == 0
        self.data = lazy_sort_index(data)
-        self.data_arr = np.array(self.data)  # Get index from numpy.array will much faster than DataFrame.values!
-        # NOTE: append last line with full NaN for better performance in `__getitem__`
-        self.data_arr = np.append(self.data_arr, np.full((1, self.data_arr.shape[1]), np.nan), axis=0)
+
+        kwargs = {"object": self.data}
+        if dtype is not None:
+            kwargs["dtype"] = dtype
+
+        self.data_arr = np.array(**kwargs)  # Get index from numpy.array will much faster than DataFrame.values!
+        # NOTE:
+        # - append last line with full NaN for better performance in `__getitem__`
+        # - Keep the same dtype will result in a better performance
+        self.data_arr = np.append(
+            self.data_arr, np.full((1, self.data_arr.shape[1]), np.nan, dtype=self.data_arr.dtype), axis=0
+        )
        self.nan_idx = -1  # The last line is all NaN

        # the data type will be changed
        # The index of usable data is between start_idx and end_idx
-        self.start_idx, self.end_idx = self.data.index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end))
        self.idx_df, self.idx_map = self.build_index(self.data)
+        self.data_index = deepcopy(self.data.index)
+
+        if flt_data is not None:
+            self.flt_data = np.array(flt_data.reindex(self.data_index)).reshape(-1)
+            self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
+            self.data_index = self.data_index[np.where(self.flt_data == True)[0]]
+
+        self.start_idx, self.end_idx = self.data_index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end))
        self.idx_arr = np.array(self.idx_df.values, dtype=np.float64)  # for better performance

+        del self.data  # save memory
+
+    @staticmethod
+    def flt_idx_map(flt_data, idx_map):
+        idx = 0
+        new_idx_map = {}
+        for i, exist in enumerate(flt_data):
+            if exist:
+                new_idx_map[idx] = idx_map[i]
+                idx += 1
+        return new_idx_map
+
    def get_index(self):
        """
        Get the pandas index of the data, it will be useful in following scenarios
        - Special sampler will be used (e.g. user want to sample day by day)
        """
-        return self.data.index[self.start_idx : self.end_idx]
+        return self.data_index[self.start_idx : self.end_idx]

    def config(self, **kwargs):
        # Config the attributes
@@ -432,7 +477,7 @@ class TSDatasetH(DatasetH):
    (T)ime-(S)eries Dataset (H)andler


-    Covnert the tabular data to Time-Series data
+    Convert the tabular data to Time-Series data

    Requirements analysis

@@ -461,7 +506,7 @@ class TSDatasetH(DatasetH):
        cal = sorted(cal)
        self.cal = cal

-    def _prepare_seg(self, slc: slice, **kwargs) -> TSDataSampler:
+    def _prepare_raw_seg(self, slc: slice, **kwargs) -> pd.DataFrame:
        # Dataset decide how to slice data(Get more data for timeseries).
        start, end = slc.start, slc.stop
        start_idx = bisect.bisect_left(self.cal, pd.Timestamp(start))
@@ -470,6 +515,25 @@ class TSDatasetH(DatasetH):

        # TSDatasetH will retrieve more data for complete
        data = super()._prepare_seg(slice(pad_start, end), **kwargs)
+        return data

-        tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len)
+    def _prepare_seg(self, slc: slice, **kwargs) -> TSDataSampler:
+        """
+        Splitting out `_prepare_raw_seg` leaves a hook for preprocessing the raw data before the TSDataSampler is created
+        """
+        dtype = kwargs.pop("dtype", None)
+        start, end = slc.start, slc.stop
+        flt_col = kwargs.pop("flt_col", None)
+        # TSDatasetH will retrieve more data for complete
+        data = self._prepare_raw_seg(slc, **kwargs)
+
+        flt_kwargs = deepcopy(kwargs)
+        if flt_col is not None:
+            flt_kwargs["col_set"] = flt_col
+            flt_data = self._prepare_raw_seg(slc, **flt_kwargs)
+            assert len(flt_data.columns) == 1
+        else:
+            flt_data = None
+
+        tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype, flt_data=flt_data)
        return tsds
--- a/qlib/data/dataset/handler.py
+++ b/qlib/data/dataset/handler.py
@@ -7,7 +7,7 @@ import bisect
 import logging
 import warnings
 from inspect import getfullargspec
-from typing import Union, Tuple, List, Iterator, Optional
+from typing import Callable, Union, Tuple, List, Iterator, Optional

 import pandas as pd
 import numpy as np
@@ -36,7 +36,7 @@ class DataHandler(Serializable):
    The data handler try to maintain a handler with 2 level.
    `datetime` & `instruments`.

-    Any order of the index level can be suported (The order will be implied in the data).
+    Any order of the index level can be supported (The order will be implied in the data).
    The order  <`datetime`, `instruments`> will be used when the dataframe index name is missed.

    Example of the data:
@@ -51,6 +51,9 @@ class DataHandler(Serializable):
                   SH600004    13.313329  11800983.0       13.313329        13.317701    0.183632  0.0042
                   SH600005    37.796539  12231662.0       38.258602        37.919757    0.970325  0.0289

+
+    Tips for improving the performance of datahandler
+    - Fetching data with `col_set=CS_RAW` will return the raw data and may prevent pandas from copying the data when calling `loc`
    """

    def __init__(
@@ -74,7 +77,7 @@ class DataHandler(Serializable):
        data_loader : Union[dict, str, DataLoader]
            data loader to load the data.
        init_data :
-            intialize the original data in the constructor.
+            initialize the original data in the constructor.
        fetch_orig : bool
            Return the original data instead of copy if possible.
        """
@@ -125,7 +128,7 @@ class DataHandler(Serializable):

    def setup_data(self, enable_cache: bool = False):
        """
-        Set Up the data in case of running intialization for multiple time
+        Set Up the data in case of running initialization for multiple time

        It is responsible for maintaining following variable
        1) self._data
@@ -163,6 +166,7 @@ class DataHandler(Serializable):
        level: Union[str, int] = "datetime",
        col_set: Union[str, List[str]] = CS_ALL,
        squeeze: bool = False,
+        proc_func: Callable = None,
    ) -> pd.DataFrame:
        """
        fetch data from underlying data source
@@ -185,6 +189,14 @@ class DataHandler(Serializable):
            - if isinstance(col_set, List[str]):

                select several sets of meaningful columns, the returned data has multiple levels
+        proc_func: Callable
+            - Give a hook for processing data before fetching
+            - An example to explain the necessity of the hook:
+                - A Dataset learned some processors to process data which is related to data segmentation
+                - It will apply them every time when preparing data.
+                - The learned processor requires the dataframe to keep the same format when fitting and applying
+                - However the data format will change according to the parameters.
+                - So the processors should be applied to the underlying data.

        squeeze : bool
            whether squeeze columns and index
@@ -193,8 +205,15 @@ class DataHandler(Serializable):
        -------
        pd.DataFrame.
        """
+        if proc_func is None:
+            df = self._data
+        else:
+            # FIXME: fetching by time first will be more friendly to `proc_func`
+            # Copy in case of `proc_func` changing the data inplace....
+            df = proc_func(fetch_df_by_index(self._data, selector, level, fetch_orig=self.fetch_orig).copy())
+
        # Fetch column  first will be more friendly to SepDataFrame
-        df = self._fetch_df_by_col(self._data, col_set)
+        df = self._fetch_df_by_col(df, col_set)
        df = fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig)
        if squeeze:
            # squeeze columns
@@ -261,6 +280,10 @@ class DataHandler(Serializable):
 class DataHandlerLP(DataHandler):
    """
    DataHandler with **(L)earnable (P)rocessor**
+
+    Tips for improving the performance of the data handler
+    - To reduce the memory cost
+        - `drop_raw=True`: this will modify the data inplace on raw data;
    """

    # data key
@@ -430,7 +453,7 @@ class DataHandlerLP(DataHandler):

    def setup_data(self, init_type: str = IT_FIT_SEQ, **kwargs):
        """
-        Set up the data in case of running intialization for multiple time
+        Set up the data in case of running initialization for multiple time

        Parameters
        ----------
@@ -474,6 +497,7 @@ class DataHandlerLP(DataHandler):
        level: Union[str, int] = "datetime",
        col_set=DataHandler.CS_ALL,
        data_key: str = DK_I,
+        proc_func: Callable = None,
    ) -> pd.DataFrame:
        """
        fetch data from underlying data source
@@ -488,12 +512,18 @@ class DataHandlerLP(DataHandler):
            select a set of meaningful columns.(e.g. features, columns).
        data_key : str
            the data to fetch:  DK_*.
+        proc_func: Callable
+            please refer to the doc of DataHandler.fetch

        Returns
        -------
        pd.DataFrame:
        """
        df = self._get_df_by_key(data_key)
+        if proc_func is not None:
+            # FIXME: fetch by time first will be more friendly to proc_func
+            # Copy in case of `proc_func` changing the data inplace....
+            df = proc_func(fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig).copy())
        # Fetch column  first will be more friendly to SepDataFrame
        df = self._fetch_df_by_col(df, col_set)
        return fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig)
--- a/qlib/data/dataset/loader.py
+++ b/qlib/data/dataset/loader.py
@@ -13,6 +13,7 @@ from qlib.data import D
 from qlib.data import filter as filter_module
 from qlib.data.filter import BaseDFilter
 from qlib.utils import load_dataset, init_instance_by_config
+from qlib.log import get_module_logger


 class DataLoader(abc.ABC):
@@ -224,6 +225,10 @@ class DataLoaderDH(DataLoader):
    DataLoader based on (D)ata (H)andler
    It is designed to load multiple data from data handler
    - If you just want to load data from single datahandler, you can write them in single data handler
+
+    TODO: What makes this module not that easy to use:
+    - For online scenario
+        - The underlying data handler should be configured. But data loader doesn't provide such interface & hook.
    """

    def __init__(self, handler_config: dict, fetch_kwargs: dict = {}, is_group=False):
@@ -265,7 +270,7 @@ class DataLoaderDH(DataLoader):

    def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame:
        if instruments is not None:
-            LOG.warning(f"instruments[{instruments}] is ignored")
+            get_module_logger(self.__class__.__name__).warning(f"instruments[{instruments}] is ignored")

        if self.is_group:
            df = pd.concat(
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT License.

 import abc
+from typing import Union, Text
 import numpy as np
 import pandas as pd
 import copy
@@ -14,7 +15,7 @@ from ...utils.paral import datetime_groupby_apply
 EPS = 1e-12


-def get_group_columns(df: pd.DataFrame, group: str):
+def get_group_columns(df: pd.DataFrame, group: Union[Text, None]):
    """
    get a group of columns from multi-index columns DataFrame