mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-04 11:30:57 +08:00
Merge
This commit is contained in:
@@ -1,8 +1,133 @@
|
||||
from ...utils.serial import Serializable
|
||||
from typing import Union, List, Tuple
|
||||
from ...utils import init_instance_by_config
|
||||
from .handler import DataHandler
|
||||
import pandas as pd
|
||||
|
||||
class Dataset:
|
||||
|
||||
class Dataset(Serializable):
|
||||
'''
|
||||
Preparing data for model training.
|
||||
The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
|
||||
Preparing data for model training and inferencing.
|
||||
'''
|
||||
def generate(self):
|
||||
def __init__(self, *args, **kwargs):
|
||||
'''
|
||||
init is designed to finish following steps
|
||||
- setup data
|
||||
- The data related attributes' names should start with '_' so that it will not be saved on disk when serializing
|
||||
- initialize the state of the dataset(info to prepare the data)
|
||||
- The name of essential state for preparing data should not start with '_' so that it could be serialized on disk when serializing.
|
||||
|
||||
The data could specify the info to calculate the essential data for preparation
|
||||
'''
|
||||
self.setup_data(*args, **kwargs)
|
||||
super().__init__()
|
||||
|
||||
def setup_data(self, *args, **kwargs):
|
||||
"""
|
||||
setup the data
|
||||
|
||||
We split the setup_data function for following situation
|
||||
- 1) User have a Dataset object with learned status on disk
|
||||
- 2) User load the Dataset object from the disk(Note the init function is skipped)
|
||||
- 3) User call `setup_data` to load new data
|
||||
- 4) User prepare data for model based on previous status
|
||||
"""
|
||||
pass
|
||||
|
||||
def prepare(self, *args, **kwargs) -> object:
|
||||
"""
|
||||
The type of dataset depends on the model. (It could be pd.DataFrame, pytorch.DataLoader, etc.)
|
||||
The parameters should specify the scope for the prepared data
|
||||
The method should
|
||||
- process the data
|
||||
- return the processed data
|
||||
|
||||
Returns
|
||||
-------
|
||||
object:
|
||||
return the object
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class DatasetH(Dataset):
    """
    Dataset with Data(H)andler

    User should try to put the data preprocessing functions into handler.
    Only following data processing functions should be placed in Dataset
    - The processing is related to specific model.
    - The processing is related to data split
    """

    def __init__(self, handler: Union[dict, DataHandler], segments: dict):
        """
        Parameters
        ----------
        handler : Union[dict, DataHandler]
            handler will be passed into setup_data
        segments : dict
            segments will be passed into setup_data
        """
        super().__init__(handler, segments)

    def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
        """
        setup the underlying data

        Parameters
        ----------
        handler : Union[dict, DataHandler]
            handler could be
            1) instance of `DataHandler`
            2) config of `DataHandler`. Please refer to `DataHandler`
        segments : dict
            Describe the options to segment the data.
            It maps a segment name to the arguments of a `slice`
            (typically a `(start, end)` pair).
            Here are some examples
            1) 'segments': {
                    'train': ("2008-01-01", "2014-12-31"),
                    'valid': ("2017-01-01", "2020-08-01",),
                    'test': ("2015-01-01", "2016-12-31",),
                }
            2) 'segments': {
                    'insample': ("2008-01-01", "2014-12-31"),
                    'outsample': ("2017-01-01", "2020-08-01",),
                }
        """
        # Both attributes are '_'-prefixed: following the convention described in
        # `Dataset.__init__`, they are data related and are not meant to be
        # saved on disk when serializing.
        self._handler = init_instance_by_config(handler, accept_types=DataHandler)
        self._segments = segments

    def prepare(self,
                segments: Union[List[str], Tuple[str], str, slice],
                col_set=DataHandler.CS_ALL,
                **kwargs) -> Union[List[pd.DataFrame], pd.DataFrame]:
        """
        prepare the data for learning and inference

        Parameters
        ----------
        segments : Union[List[str], Tuple[str], str, slice]
            Describe the scope of the data to be prepared
            Here are some examples
            1) 'train'
            2) ['train', 'valid']
            NOTE: `slice` appears in the annotation but is not handled yet;
            passing one raises `NotImplementedError`.
        col_set : Union[str, List[str]]
            Forwarded unchanged to `DataHandler.fetch` as `col_set`.
            Defaults to `DataHandler.CS_ALL`.
        **kwargs :
            Extra keyword arguments forwarded unchanged to `DataHandler.fetch`.

        Returns
        -------
        Union[List[pd.DataFrame], pd.DataFrame]:
            - if `segments` is a `str`: the result of a single `fetch` on that segment
            - if `segments` is a list or tuple: a list with one `fetch` result per
              segment name, in the given order

        Raises
        ------
        NotImplementedError:
            If `segments` is neither a `str` nor a list/tuple of names.
        """
        # Each configured segment is expanded into a `slice` and used as the
        # selector of the handler's fetch.
        if isinstance(segments, str):
            return self._handler.fetch(slice(*self._segments[segments]), col_set=col_set, **kwargs)
        if isinstance(segments, (list, tuple)):
            return [
                self._handler.fetch(slice(*self._segments[seg]), col_set=col_set, **kwargs)
                for seg in segments
            ]
        raise NotImplementedError("This type of input is not supported")
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
import abc
|
||||
import bisect
|
||||
import logging
|
||||
from typing import Union, Tuple
|
||||
from typing import Union, Tuple, List
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
@@ -15,6 +15,7 @@ from ...data import D
|
||||
from ...config import C
|
||||
from ...utils import parse_config, transform_end_date, init_instance_by_config
|
||||
from ...utils.serial import Serializable
|
||||
from .utils import get_level_index
|
||||
from pathlib import Path
|
||||
from .loader import DataLoader
|
||||
|
||||
@@ -82,34 +83,6 @@ class DataHandler(Serializable):
|
||||
self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
|
||||
# TODO: cache
|
||||
|
||||
def _get_level_index(self, df: pd.DataFrame, level=Union[str, int]) -> int:
|
||||
"""
|
||||
|
||||
get the level index of `df` given `level`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : pd.DataFrame
|
||||
data
|
||||
level : Union[str, int]
|
||||
index level
|
||||
|
||||
Returns
|
||||
-------
|
||||
int:
|
||||
The level index in the multiple index
|
||||
"""
|
||||
if isinstance(level, str):
|
||||
try:
|
||||
return df.index.names.index(level)
|
||||
except (AttributeError, ValueError):
|
||||
# NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
|
||||
return ('datetime', 'instrument').index(level)
|
||||
elif isinstance(level, int):
|
||||
return level
|
||||
else:
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
|
||||
def _fetch_df_by_index(self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]) -> pd.DataFrame:
|
||||
"""
|
||||
fetch data from `data` with `selector` and `level`
|
||||
@@ -123,11 +96,11 @@ class DataHandler(Serializable):
|
||||
"""
|
||||
# Try to get the right index
|
||||
idx_slc = (selector, slice(None, None))
|
||||
if self._get_level_index(df, level) == 1:
|
||||
if get_level_index(df, level) == 1:
|
||||
idx_slc = idx_slc[1], idx_slc[0]
|
||||
return df.loc(axis=0)[idx_slc]
|
||||
|
||||
CS_ALL = '_all'
|
||||
CS_ALL = '__all'
|
||||
|
||||
def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame:
|
||||
cln = len(df.columns.levels)
|
||||
@@ -138,7 +111,10 @@ class DataHandler(Serializable):
|
||||
else:
|
||||
return df.loc(axis=1)[col_set]
|
||||
|
||||
def fetch(self, selector: Union[pd.Timestamp, slice, str], level: Union[str, int]='datetime', col_set=CS_ALL) -> pd.DataFrame:
|
||||
def fetch(self,
|
||||
selector: Union[pd.Timestamp, slice, str],
|
||||
level: Union[str, int] = 'datetime',
|
||||
col_set: Union[str, List[str]] = CS_ALL) -> pd.DataFrame:
|
||||
"""
|
||||
fetch data from underlying data source
|
||||
|
||||
@@ -148,8 +124,11 @@ class DataHandler(Serializable):
|
||||
describe how to select data by index
|
||||
level : Union[str, int]
|
||||
which index level to select the data
|
||||
col_set : str
|
||||
select a set of meaningful columns.(e.g. features, columns)
|
||||
col_set : Union[str, List[str]]
|
||||
if isinstance(col_set, str):
|
||||
select a set of meaningful columns.(e.g. features, columns)
|
||||
if isinstance(col_set, List[str]):
|
||||
select several sets of meaningful columns, the returned data has multiple levels
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -195,7 +174,15 @@ class DataHandlerLP(DataHandler):
|
||||
# - _proc_learn_df will be processed by infer_processors + learn_processors
|
||||
# - (e.g. _proc_infer_df processed by learn_processors )
|
||||
|
||||
def __init__(self, instruments, start_time=None, end_time=None, data_loader: Tuple[dict, str, DataLoader]=None, infer_processors=[], learn_processors=[], process_type=PTYPE_A, **kwargs):
|
||||
def __init__(self,
|
||||
instruments,
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
data_loader: Tuple[dict, str, DataLoader] = None,
|
||||
infer_processors=[],
|
||||
learn_processors=[],
|
||||
process_type=PTYPE_A,
|
||||
**kwargs):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
|
||||
32
qlib/data/dataset/utils.py
Normal file
32
qlib/data/dataset/utils.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from typing import Union
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_level_index(df: pd.DataFrame, level: Union[str, int]) -> int:
    """
    get the level index of `df` given `level`

    Parameters
    ----------
    df : pd.DataFrame
        data
    level : Union[str, int]
        index level, given either by name (`str`) or by position (`int`)

    Returns
    -------
    int:
        The level index in the multiple index

    Raises
    ------
    ValueError:
        If `level` is a name found neither in `df.index.names` nor in the
        default ('datetime', 'instrument') order.
    NotImplementedError:
        If `level` is neither a `str` nor an `int`.
    """
    if isinstance(level, str):
        try:
            return df.index.names.index(level)
        except (AttributeError, ValueError):
            # NOTE: If level index is not given in the data, the default level index will be ('datetime', 'instrument')
            return ("datetime", "instrument").index(level)
    if isinstance(level, int):
        # A positional level is already the answer.
        return level
    raise NotImplementedError("This type of input is not supported")
|
||||
|
||||
Reference in New Issue
Block a user