Merge

2026-07-04 11:30:57 +08:00 · 2020-10-29 13:01:42 +08:00
parent 60d0cfcf64 a50c9008b8
commit 490dbd908b
10 changed files with 278 additions and 203 deletions
--- a/qlib/data/dataset/init.py
+++ b/qlib/data/dataset/init.py
@@ -1,8 +1,133 @@
+from ...utils.serial import Serializable
+from typing import Union, List, Tuple
+from ...utils import init_instance_by_config
+from .handler import DataHandler
+import pandas as pd

-class Dataset:
+
+class Dataset(Serializable):
    '''
-    Preparing data for model training.
-    The type of dataset depends on the model. (It could be pd.DataFrame,  pytorch.DataLoader, etc.) 
+    Preparing data for model training and inference.
    '''
-    def generate(self):
+    def __init__(self, *args, **kwargs):
+        '''
+        __init__ is designed to complete the following steps:
+        - set up the data
+            - The names of data-related attributes should start with '_' so that they are not saved to disk when serializing.
+        - initialize the state of the dataset (the info needed to prepare the data)
+            - The names of essential state attributes for preparing data should not start with '_' so that they are saved to disk when serializing.
+
+        The data could specify the info used to calculate the essential data for preparation.
+        '''
+        self.setup_data(*args, **kwargs)
+        super().__init__()
+
+    def setup_data(self, *args, **kwargs):
+        """
+        setup the data
+
+        `setup_data` is split out from `__init__` to support the following workflow:
+        - 1) The user has a Dataset object with learned status on disk
+        - 2) The user loads the Dataset object from disk (note that `__init__` is skipped)
+        - 3) The user calls `setup_data` to load new data
+        - 4) The user prepares data for the model based on the previous status
+        """
        pass
+
+    def prepare(self, *args, **kwargs) -> object:
+        """
+        The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
+        The parameters should specify the scope for the prepared data
+        The method should
+        - process the data
+        - return the processed data
+
+        Returns
+        -------
+        object:
+            return the object
+        """
+        pass
+
+
+class DatasetH(Dataset):
+    '''
+    Dataset with Data(H)andler
+
+    User should try to put the data preprocessing functions into handler.
+    Only following data processing functions should be placed in Dataset
+    - The processing is related to specific model.
+    - The processing is related to data split
+    '''
+    def __init__(self, handler: Union[dict, DataHandler], segments: list):
+        """
+        Parameters
+        ----------
+        handler : Union[dict, DataHandler]
+            handler will be passed into setup_data
+        segments : list
+            segments will be passed into setup_data
+        """
+        super().__init__(handler, segments)
+
+    def setup_data(self, handler: Union[dict, DataHandler], segments: list):
+        """
+        setup the underlying data
+
+        Parameters
+        ----------
+        handler : Union[dict, DataHandler]
+            handler could be
+            1) instance of `DataHandler`
+            2) config of `DataHandler`.  Please refer to `DataHandler`
+        segments : list
+            Describe the options to segment the data, as a mapping from segment name to a (start, end) tuple.
+            Here are some examples
+            1) 'segments': {
+                    'train': ("2008-01-01", "2014-12-31"),
+                    'valid': ("2017-01-01", "2020-08-01",),
+                    'test': ("2015-01-01", "2016-12-31",),
+                }
+            2) 'segments': {
+                    'insample': ("2008-01-01", "2014-12-31"),
+                    'outsample': ("2017-01-01", "2020-08-01",),
+                }
+        """
+        self._handler = init_instance_by_config(handler, accept_types=DataHandler)
+        self._segments = segments
+
+    def prepare(self,
+                segments: Union[List[str], Tuple[str], str, slice],
+                col_set=DataHandler.CS_ALL,
+                **kwargs) -> Union[List[pd.DataFrame], pd.DataFrame]:
+        """
+        prepare the data for learning and inference
+
+        Parameters
+        ----------
+        segments : Union[List[str], Tuple[str], str, slice]
+            Describe the scope of the data to be prepared
+            Here are some examples
+            1) 'train'
+            2) ['train', 'valid']
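+        col_set : Union[str, List[str]]
+            The column set(s) to select; passed through unchanged to the handler's `fetch` (defaults to `DataHandler.CS_ALL`).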
+
+        Returns
+        -------
+        Union[List[pd.DataFrame], pd.DataFrame]:
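+            a list of the handler's `fetch` results (one per segment name) if `segments` is a list or tuple; a single `fetch` result if `segments` is a str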
+
+        Raises
+        ------
+        NotImplementedError:
+            raised if `segments` is not a list, tuple or str (this includes slice)
+        """
+        if isinstance(segments, (list, tuple)):
+            return [
+                self._handler.fetch(slice(*self._segments[seg]), col_set=col_set, **kwargs) for seg in segments
+            ]
+        elif isinstance(segments, str):
+            return self._handler.fetch(slice(*self._segments[segments]), col_set=col_set, **kwargs)
+        else:
+            raise NotImplementedError(f"This type of input is not supported")
--- a/qlib/data/dataset/handler.py
+++ b/qlib/data/dataset/handler.py
@@ -5,7 +5,7 @@
 import abc
 import bisect
 import logging
-from typing import Union, Tuple
+from typing import Union, Tuple, List

 import pandas as pd
 import numpy as np
@@ -15,6 +15,7 @@ from ...data import D
 from ...config import C
 from ...utils import parse_config, transform_end_date, init_instance_by_config
 from ...utils.serial import Serializable
+from .utils import get_level_index
 from pathlib import Path
 from .loader import DataLoader

@@ -82,34 +83,6 @@ class DataHandler(Serializable):
        self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
        # TODO: cache

-    def _get_level_index(self, df: pd.DataFrame, level=Union[str, int]) -> int:
-        """
-
-        get the level index of `df` given `level`
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            data
-        level : Union[str, int]
-            index level
-
-        Returns
-        -------
-        int:
-            The level index in the multiple index
-        """
-        if isinstance(level, str):
-            try:
-                return df.index.names.index(level)
-            except (AttributeError, ValueError):
-                # NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
-                return ('datetime', 'instrument').index(level)
-        elif isinstance(level, int):
-            return level
-        else:
-            raise NotImplementedError(f"This type of input is not supported")
-
    def _fetch_df_by_index(self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]) -> pd.DataFrame:
        """
        fetch data from `data` with `selector` and `level`
@@ -123,11 +96,11 @@ class DataHandler(Serializable):
        """
        # Try to get the right index
        idx_slc = (selector, slice(None, None))
-        if self._get_level_index(df, level) == 1:
+        if get_level_index(df, level) == 1:
            idx_slc = idx_slc[1], idx_slc[0]
        return df.loc(axis=0)[idx_slc]

-    CS_ALL = '_all'
+    CS_ALL = '__all'

    def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame:
        cln = len(df.columns.levels)
@@ -138,7 +111,10 @@ class DataHandler(Serializable):
        else:
            return df.loc(axis=1)[col_set]

-    def fetch(self, selector: Union[pd.Timestamp, slice, str], level: Union[str, int]='datetime', col_set=CS_ALL) -> pd.DataFrame:
+    def fetch(self,
+              selector: Union[pd.Timestamp, slice, str],
+              level: Union[str, int] = 'datetime',
+              col_set: Union[str, List[str]] = CS_ALL) -> pd.DataFrame:
        """
        fetch data from underlying data source

@@ -148,8 +124,11 @@ class DataHandler(Serializable):
            describe how to select data by index
        level : Union[str, int]
            which index level to select the data
-        col_set : str
-            select a set of meaningful columns.(e.g. features, columns)
+        col_set : Union[str, List[str]]
+            if isinstance(col_set, str):
+                select a set of meaningful columns.(e.g. features, columns)
+            if isinstance(col_set, List[str]):
+                select several sets of meaningful columns, the returned data has multiple levels

        Returns
        -------
@@ -195,7 +174,15 @@ class DataHandlerLP(DataHandler):
    # - _proc_learn_df will be processed by infer_processors + learn_processors
    #   - (e.g. _proc_infer_df processed by learn_processors )

-    def __init__(self, instruments, start_time=None, end_time=None, data_loader: Tuple[dict, str, DataLoader]=None, infer_processors=[], learn_processors=[], process_type=PTYPE_A, **kwargs):
+    def __init__(self,
+                 instruments,
+                 start_time=None,
+                 end_time=None,
+                 data_loader: Tuple[dict, str, DataLoader] = None,
+                 infer_processors=[],
+                 learn_processors=[],
+                 process_type=PTYPE_A,
+                 **kwargs):
        """
        Parameters
        ----------
--- a/qlib/data/dataset/utils.py
+++ b/qlib/data/dataset/utils.py
@@ -0,0 +1,32 @@
+from typing import Union
+import pandas as pd
+
+
+def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int:
+    """
+
+    get the level index of `df` given `level`
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        data
+    level : Union[str, int]
+        index level
+
+    Returns
+    -------
+    int:
+        The level index in the multiple index
+    """
+    if isinstance(level, str):
+        try:
+            return df.index.names.index(level)
+        except (AttributeError, ValueError):
+            # NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
+            return ('datetime', 'instrument').index(level)
+    elif isinstance(level, int):
+        return level
+    else:
+        raise NotImplementedError(f"This type of input is not supported")
+