1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-04 11:30:57 +08:00
This commit is contained in:
Jactus
2020-10-29 13:01:42 +08:00
10 changed files with 278 additions and 203 deletions

View File

@@ -1,8 +1,133 @@
from ...utils.serial import Serializable
from typing import Union, List, Tuple
from ...utils import init_instance_by_config
from .handler import DataHandler
import pandas as pd
class Dataset(Serializable):
    """
    Preparing data for model training and inferencing.
    """

    def __init__(self, *args, **kwargs):
        """
        init is designed to finish the following steps:

        - setup data
            - The names of data related attributes should start with '_' so that
              they will not be saved on disk when serializing.

        - initialize the state of the dataset (info to prepare the data)
            - The names of essential state for preparing data should not start
              with '_' so that it can be saved on disk when serializing.

        The data could specify the info to calculate the essential data for preparation.

        All positional and keyword arguments are forwarded unchanged to `setup_data`.
        """
        # The data is set up first, then the base class is initialised
        # (same order as the original implementation).
        self.setup_data(*args, **kwargs)
        super().__init__()

    def setup_data(self, *args, **kwargs):
        """
        Setup the data.

        `setup_data` is split from `__init__` for the following situation:

        - 1) User has a Dataset object with learned status on disk.
        - 2) User loads the Dataset object from the disk (note that the init function is skipped).
        - 3) User calls `setup_data` to load new data.
        - 4) User prepares data for the model based on the previous status.

        The base implementation does nothing; subclasses override it.
        """

    def prepare(self, *args, **kwargs) -> object:
        """
        Prepare the data for the model.

        The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
        The parameters should specify the scope for the prepared data.

        The method should:

        - process the data
        - return the processed data

        The base implementation does nothing and returns None; subclasses override it.

        Returns
        -------
        object:
            return the object
        """
class DatasetH(Dataset):
    """
    Dataset with Data(H)andler

    User should try to put the data preprocessing functions into handler.
    Only the following data processing functions should be placed in Dataset:

    - The processing is related to specific model.
    - The processing is related to data split.
    """

    def __init__(self, handler: Union[dict, DataHandler], segments: dict):
        """
        Parameters
        ----------
        handler : Union[dict, DataHandler]
            handler will be passed into setup_data
        segments : dict
            segments will be passed into setup_data
        """
        super().__init__(handler, segments)

    def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
        """
        Setup the underlying data.

        Parameters
        ----------
        handler : Union[dict, DataHandler]
            handler could be

            1) instance of `DataHandler`

            2) config of `DataHandler`. Please refer to `DataHandler`
        segments : dict
            Describe the options to segment the data, as a mapping from a
            segment name to the bounds unpacked into a `slice`.
            Here are some examples

            1) 'segments': {
                    'train': ("2008-01-01", "2014-12-31"),
                    'valid': ("2017-01-01", "2020-08-01",),
                    'test': ("2015-01-01", "2016-12-31",),
                }
            2) 'segments': {
                    'insample': ("2008-01-01", "2014-12-31"),
                    'outsample': ("2017-01-01", "2020-08-01",),
                }
        """
        # Both attributes start with '_' on purpose: they hold data, not state.
        self._handler = init_instance_by_config(handler, accept_types=DataHandler)
        self._segments = segments

    def prepare(self,
                segments: Union[List[str], Tuple[str], str, slice],
                col_set=DataHandler.CS_ALL,
                **kwargs) -> Union[List[pd.DataFrame], pd.DataFrame]:
        """
        Prepare the data for learning and inference.

        Parameters
        ----------
        segments : Union[List[str], Tuple[str], str, slice]
            Describe the scope of the data to be prepared.
            Here are some examples

            1) 'train'

            2) ['train', 'valid']

            3) slice("2008-01-01", "2014-12-31"), which is handed to the
               handler as the selector without a lookup in the configured segments
        col_set :
            forwarded to the handler's `fetch` as `col_set`;
            defaults to `DataHandler.CS_ALL`
        **kwargs :
            extra keyword arguments forwarded to the handler's `fetch`

        Returns
        -------
        Union[List[pd.DataFrame], pd.DataFrame]:
            a list with one fetch result per name (in the given order) when
            `segments` is a list or tuple, otherwise a single fetch result

        Raises
        ------
        NotImplementedError:
            if `segments` is not a list, tuple, str or slice
        """
        if isinstance(segments, (list, tuple)):
            return [
                self._handler.fetch(slice(*self._segments[seg]), col_set=col_set, **kwargs)
                for seg in segments
            ]
        if isinstance(segments, str):
            return self._handler.fetch(slice(*self._segments[segments]), col_set=col_set, **kwargs)
        if isinstance(segments, slice):
            # The signature advertises `slice`; named segments are turned into
            # slices anyway, so an explicit slice goes down the same path.
            return self._handler.fetch(segments, col_set=col_set, **kwargs)
        raise NotImplementedError("This type of input is not supported")

View File

@@ -5,7 +5,7 @@
import abc
import bisect
import logging
from typing import Union, Tuple
from typing import Union, Tuple, List
import pandas as pd
import numpy as np
@@ -15,6 +15,7 @@ from ...data import D
from ...config import C
from ...utils import parse_config, transform_end_date, init_instance_by_config
from ...utils.serial import Serializable
from .utils import get_level_index
from pathlib import Path
from .loader import DataLoader
@@ -82,34 +83,6 @@ class DataHandler(Serializable):
self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
# TODO: cache
def _get_level_index(self, df: pd.DataFrame, level=Union[str, int]) -> int:
"""
get the level index of `df` given `level`
Parameters
----------
df : pd.DataFrame
data
level : Union[str, int]
index level
Returns
-------
int:
The level index in the multiple index
"""
if isinstance(level, str):
try:
return df.index.names.index(level)
except (AttributeError, ValueError):
# NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
return ('datetime', 'instrument').index(level)
elif isinstance(level, int):
return level
else:
raise NotImplementedError(f"This type of input is not supported")
def _fetch_df_by_index(self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]) -> pd.DataFrame:
"""
fetch data from `data` with `selector` and `level`
@@ -123,11 +96,11 @@ class DataHandler(Serializable):
"""
# Try to get the right index
idx_slc = (selector, slice(None, None))
if self._get_level_index(df, level) == 1:
if get_level_index(df, level) == 1:
idx_slc = idx_slc[1], idx_slc[0]
return df.loc(axis=0)[idx_slc]
CS_ALL = '_all'
CS_ALL = '__all'
def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame:
cln = len(df.columns.levels)
@@ -138,7 +111,10 @@ class DataHandler(Serializable):
else:
return df.loc(axis=1)[col_set]
def fetch(self, selector: Union[pd.Timestamp, slice, str], level: Union[str, int]='datetime', col_set=CS_ALL) -> pd.DataFrame:
def fetch(self,
selector: Union[pd.Timestamp, slice, str],
level: Union[str, int] = 'datetime',
col_set: Union[str, List[str]] = CS_ALL) -> pd.DataFrame:
"""
fetch data from underlying data source
@@ -148,8 +124,11 @@ class DataHandler(Serializable):
describe how to select data by index
level : Union[str, int]
which index level to select the data
col_set : str
select a set of meaningful columns.(e.g. features, columns)
col_set : Union[str, List[str]]
if isinstance(col_set, str):
select a set of meaningful columns.(e.g. features, columns)
if isinstance(col_set, List[str]):
select several sets of meaningful columns, the returned data has multiple levels
Returns
-------
@@ -195,7 +174,15 @@ class DataHandlerLP(DataHandler):
# - _proc_learn_df will be processed by infer_processors + learn_processors
# - (e.g. _proc_infer_df processed by learn_processors )
def __init__(self, instruments, start_time=None, end_time=None, data_loader: Tuple[dict, str, DataLoader]=None, infer_processors=[], learn_processors=[], process_type=PTYPE_A, **kwargs):
def __init__(self,
instruments,
start_time=None,
end_time=None,
data_loader: Tuple[dict, str, DataLoader] = None,
infer_processors=[],
learn_processors=[],
process_type=PTYPE_A,
**kwargs):
"""
Parameters
----------

View File

@@ -0,0 +1,32 @@
from typing import Union
import pandas as pd
def get_level_index(df: pd.DataFrame, level: Union[str, int]) -> int:
    """
    get the level index of `df` given `level`

    Parameters
    ----------
    df : pd.DataFrame
        data
    level : Union[str, int]
        index level, given either by name or by position

    Returns
    -------
    int:
        The level index in the multiple index

    Raises
    ------
    ValueError:
        if `level` is a name found neither in `df.index.names` nor in the
        default ('datetime', 'instrument')
    NotImplementedError:
        if `level` is neither a str nor an int
    """
    if isinstance(level, str):
        try:
            return df.index.names.index(level)
        except (AttributeError, ValueError):
            # NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
            return ('datetime', 'instrument').index(level)
    elif isinstance(level, int):
        # A positional level is returned as given.
        return level
    else:
        raise NotImplementedError("This type of input is not supported")