1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 02:50:58 +08:00

Update docs and fix tabnet

This commit is contained in:
Jactus
2020-11-26 00:55:26 +08:00
parent 5be847909f
commit 87cee85cea
27 changed files with 624 additions and 495 deletions

View File

@@ -129,13 +129,13 @@ class Expression(abc.ABC):
Parameters
----------
instrument : str
instrument code
instrument code.
start_index : str
feature start index [in calendar]
feature start index [in calendar].
end_index : str
feature end index [in calendar]
feature end index [in calendar].
freq : str
feature frequency
feature frequency.
Returns
----------

View File

@@ -76,8 +76,8 @@ class MemCache(object):
Parameters
----------
mem_cache_size_limit: cache max size
limit_type: length or sizeof; length(call fun: len), size(call fun: sys.getsizeof)
mem_cache_size_limit: cache max size.
limit_type: length or sizeof; length(call fun: len), size(call fun: sys.getsizeof).
"""
if limit_type not in ["length", "sizeof"]:
raise ValueError(f"limit_type must be length or sizeof, your limit_type is {limit_type}")
@@ -118,9 +118,9 @@ class MemCacheExpire:
def set_cache(mem_cache, key, value):
"""set cache
:param mem_cache: MemCache attribute('c'/'i'/'f')
:param key: cache key
:param value: cache value
:param mem_cache: MemCache attribute('c'/'i'/'f').
:param key: cache key.
:param value: cache value.
"""
mem_cache[key] = value, time.time()
@@ -128,9 +128,9 @@ class MemCacheExpire:
def get_cache(mem_cache, key):
"""get mem cache
:param mem_cache: MemCache attribute('c'/'i'/'f')
:param key: cache key
:return: cache value; if cache not exist, return None
:param mem_cache: MemCache attribute('c'/'i'/'f').
:param key: cache key.
:return: cache value; if cache not exist, return None.
"""
value = None
expire = False
@@ -275,12 +275,12 @@ class ExpressionCache(BaseProviderCache):
Parameters
----------
cache_uri : str
the complete uri of expression cache file (include dir path)
the complete uri of expression cache file (include dir path).
Returns
-------
int
0(successful update)/ 1(no need to update)/ 2(update failure)
0(successful update)/ 1(no need to update)/ 2(update failure).
"""
raise NotImplementedError("Implement this method if you want to make expression cache up to date")
@@ -348,7 +348,7 @@ class DatasetCache(BaseProviderCache):
Parameters
----------
cache_uri : str
the complete uri of dataset cache file (include dir path)
the complete uri of dataset cache file (include dir path).
Returns
-------
@@ -361,9 +361,9 @@ class DatasetCache(BaseProviderCache):
def cache_to_origin_data(data, fields):
"""cache data to origin data
:param data: pd.DataFrame, cache data
:param fields: feature fields
:return: pd.DataFrame
:param data: pd.DataFrame, cache data.
:param fields: feature fields.
:return: pd.DataFrame.
"""
not_space_fields = remove_fields_space(fields)
data = data.loc[:, not_space_fields]
@@ -583,7 +583,7 @@ class DiskDatasetCache(DatasetCache):
:param cache_path:
:param start_time:
:param end_time:
:param fields: The fields order of the dataset cache is sorted. So rearrange the columns to make it consistent
:param fields: The fields order of the dataset cache is sorted. So rearrange the columns to make it consistent.
:return:
"""
@@ -771,12 +771,12 @@ class DiskDatasetCache(DatasetCache):
- This is a hdf file sorted by datetime
:param cache_path: The path to store the cache
:param instruments: The instruments to store the cache
:param fields: The fields to store the cache
:param freq: The freq to store the cache
:param cache_path: The path to store the cache.
:param instruments: The instruments to store the cache.
:param fields: The fields to store the cache.
:param freq: The freq to store the cache.
:return type pd.DataFrame; The fields of the returned DataFrame are consistent with the parameters of the function
:return type pd.DataFrame; The fields of the returned DataFrame are consistent with the parameters of the function.
"""
# get calendar
from .data import Cal

View File

@@ -51,13 +51,13 @@ class Client(object):
Parameters
----------
request_type : str
type of proposed request, 'calendar'/'instrument'/'feature'
type of proposed request, 'calendar'/'instrument'/'feature'.
request_content : dict
records the information of the request
records the information of the request.
msg_proc_func : func
the function to process the message when receiving response, should have arg `*args`
the function to process the message when receiving response, should have arg `*args`.
msg_queue: Queue
The queue to pass the message after callback
The queue to pass the message after callback.
"""
head_info = {"version": qlib.__version__}

View File

@@ -41,13 +41,13 @@ class CalendarProvider(abc.ABC):
Parameters
----------
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency, available: year/quarter/month/week/day
time frequency, available: year/quarter/month/week/day.
future : bool
whether including future trading day
whether including future trading day.
Returns
----------
@@ -62,24 +62,24 @@ class CalendarProvider(abc.ABC):
Parameters
----------
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency, available: year/quarter/month/week/day
time frequency, available: year/quarter/month/week/day.
future : bool
whether including future trading day
whether including future trading day.
Returns
-------
pd.Timestamp
the real start time
the real start time.
pd.Timestamp
the real end time
the real end time.
int
the index of start time
the index of start time.
int
the index of end time
the index of end time.
"""
start_time = pd.Timestamp(start_time)
end_time = pd.Timestamp(end_time)
@@ -103,16 +103,16 @@ class CalendarProvider(abc.ABC):
Parameters
----------
freq : str
frequency of read calendar file
frequency of read calendar file.
future : bool
whether including future trading day
whether including future trading day.
Returns
-------
list
list of timestamps
list of timestamps.
dict
dict composed by timestamp as key and index as value for fast search
dict composed by timestamp as key and index as value for fast search.
"""
flag = f"{freq}_future_{future}"
if flag in H["c"]:
@@ -141,14 +141,14 @@ class InstrumentProvider(abc.ABC):
Parameters
----------
market : str
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500.
filter_pipe : list
the list of dynamic filters
the list of dynamic filters.
Returns
----------
dict
dict of stockpool config
dict of stockpool config.
{`market`=>base market name, `filter_pipe`=>list of filters}
example :
@@ -182,13 +182,13 @@ class InstrumentProvider(abc.ABC):
Parameters
----------
instruments : dict
stockpool config
stockpool config.
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
as_list : bool
return instruments as list or dict
return instruments as list or dict.
Returns
-------
@@ -243,15 +243,15 @@ class FeatureProvider(abc.ABC):
Parameters
----------
instrument : str
a certain instrument
a certain instrument.
field : str
a certain field of feature
a certain field of feature.
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency, available: year/quarter/month/week/day
time frequency, available: year/quarter/month/week/day.
Returns
-------
@@ -294,15 +294,15 @@ class ExpressionProvider(abc.ABC):
Parameters
----------
instrument : str
a certain instrument
a certain instrument.
field : str
a certain field of feature
a certain field of feature.
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency, available: year/quarter/month/week/day
time frequency, available: year/quarter/month/week/day.
Returns
-------
@@ -325,20 +325,20 @@ class DatasetProvider(abc.ABC):
Parameters
----------
instruments : list or dict
list/dict of instruments or dict of stockpool config
list/dict of instruments or dict of stockpool config.
fields : list
list of feature instances
list of feature instances.
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency
time frequency.
Returns
----------
pd.DataFrame
a pandas dataframe with <instrument, datetime> index
a pandas dataframe with <instrument, datetime> index.
"""
raise NotImplementedError("Subclass of DatasetProvider must implement `Dataset` method")
@@ -357,17 +357,17 @@ class DatasetProvider(abc.ABC):
Parameters
----------
instruments : list or dict
list/dict of instruments or dict of stockpool config
list/dict of instruments or dict of stockpool config.
fields : list
list of feature instances
list of feature instances.
start_time : str
start of the time range
start of the time range.
end_time : str
end of the time range
end of the time range.
freq : str
time frequency
time frequency.
disk_cache : int
whether to skip(0)/use(1)/replace(2) disk_cache
whether to skip(0)/use(1)/replace(2) disk_cache.
"""
return DiskDatasetCache._uri(instruments, fields, start_time, end_time, freq, disk_cache)
@@ -526,7 +526,7 @@ class LocalCalendarProvider(CalendarProvider):
Parameters
----------
freq : str
frequency of read calendar file
frequency of read calendar file.
Returns
----------

View File

@@ -17,7 +17,7 @@ class Dataset(Serializable):
init is designed to finish following steps:
- setup data
- The data related attributes' names should start with '_' so that it will not be saved on disk when serializing
- The data related attributes' names should start with '_' so that it will not be saved on disk when serializing.
- initialize the state of the dataset(info to prepare the data)
- The name of essential state for preparing data should not start with '_' so that it could be serialized on disk when serializing.
@@ -29,17 +29,17 @@ class Dataset(Serializable):
def setup_data(self, *args, **kwargs):
"""
setup the data
Setup the data.
We split the setup_data function for following situation:
- The user has a Dataset object with learned status on disk
- The user has a Dataset object with learned status on disk.
- The user loads the Dataset object from the disk (note: the init function is skipped)
- The user loads the Dataset object from the disk (note: the init function is skipped).
- The user calls `setup_data` to load new data
- The user calls `setup_data` to load new data.
- The user prepares data for the model based on the previous status
- The user prepares data for the model based on the previous status.
"""
pass
@@ -66,9 +66,10 @@ class DatasetH(Dataset):
User should try to put the data preprocessing functions into handler.
Only following data processing functions should be placed in Dataset:
- The processing is related to specific model.
- The processing is related to data split
- The processing is related to data split.
"""
def __init__(self, handler: Union[dict, DataHandler], segments: list):
@@ -76,15 +77,15 @@ class DatasetH(Dataset):
Parameters
----------
handler : Union[dict, DataHandler]
handler will be passed into setup_data
handler will be passed into setup_data.
segments : list
segments will be passed into setup_data
segments will be passed into setup_data.
"""
super().__init__(handler, segments)
def setup_data(self, handler: Union[dict, DataHandler], segments: list):
"""
setup the underlying data
Setup the underlying data.
Parameters
----------
@@ -121,7 +122,7 @@ class DatasetH(Dataset):
**kwargs,
) -> Union[List[pd.DataFrame], pd.DataFrame]:
"""
prepare the data for learning and inference
Prepare the data for learning and inference.
Parameters
----------
@@ -132,11 +133,12 @@ class DatasetH(Dataset):
- 'train'
- ['train', 'valid']
col_set : str
The col_set will be passed to self._handler when fetching data
data_key: str
The col_set will be passed to self._handler when fetching data.
data_key : str
The data to fetch: DK_*
Default is DK_I, which indicates fetching data for **inference**
Default is DK_I, which indicates fetching data for **inference**.
Returns
-------

View File

@@ -29,7 +29,7 @@ class DataHandler(Serializable):
"""
The steps to using a handler
1. initialized data handler (call by `init`).
2. use the data
2. use the data.
The data handler tries to maintain a handler with 2 levels.
@@ -65,17 +65,17 @@ class DataHandler(Serializable):
Parameters
----------
instruments :
The stock list to retrieve
The stock list to retrieve.
start_time :
start_time of the original data
start_time of the original data.
end_time :
end_time of the original data
end_time of the original data.
data_loader : Tuple[dict, str, DataLoader]
data loader to load the data
data loader to load the data.
init_data :
initialize the original data in the constructor
initialize the original data in the constructor.
fetch_orig : bool
Return the original data instead of copy if possible
Return the original data instead of copy if possible.
"""
# Set logger
self.logger = get_module_logger("DataHandler")
@@ -219,9 +219,9 @@ class DataHandler(Serializable):
get an iterator of sliced data with given periods
Args:
periods (int): number of periods
min_periods (int): minimum periods for sliced dataframe
kwargs (dict): will be passed to `self.fetch`
periods (int): number of periods.
min_periods (int): minimum periods for sliced dataframe.
kwargs (dict): will be passed to `self.fetch`.
"""
trading_dates = self._data.index.unique(level="datetime")
if min_periods is None:
@@ -377,7 +377,7 @@ class DataHandlerLP(DataHandler):
Parameters
----------
init_type : str
The type `IT_*` listed above
The type `IT_*` listed above.
enable_cache : bool
default value is false:
@@ -419,13 +419,13 @@ class DataHandlerLP(DataHandler):
Parameters
----------
selector : Union[pd.Timestamp, slice, str]
describe how to select data by index
describe how to select data by index.
level : Union[str, int]
which index level to select the data
which index level to select the data.
col_set : str
select a set of meaningful columns.(e.g. features, columns)
data_key: str
The data to fetch: DK_*
select a set of meaningful columns (e.g. features, columns).
data_key : str
the data to fetch: DK_*.
Returns
-------
@@ -443,9 +443,9 @@ class DataHandlerLP(DataHandler):
Parameters
----------
col_set : str
select a set of meaningful columns.(e.g. features, columns)
data_key: str
The data to fetch: DK_*
select a set of meaningful columns (e.g. features, columns).
data_key : str
the data to fetch: DK_*.
Returns
-------

View File

@@ -100,16 +100,16 @@ class DLWParser(DataLoader):
Parameters
----------
instruments :
the instruments
the instruments.
exprs : list
The expressions to describe the content of the data
the expressions to describe the content of the data.
names : list
The name of the data
the name of the data.
Returns
-------
pd.DataFrame:
the queried dataframe
the queried dataframe.
"""
pass

View File

@@ -21,7 +21,7 @@ def get_group_columns(df: pd.DataFrame, group: str):
Parameters
----------
df : pd.DataFrame
with multi-level columns
with multi-level columns.
group : str
the name of the feature group, i.e. the first level value of the group index.
"""
@@ -56,7 +56,7 @@ class Processor(Serializable):
Parameters
----------
df : pd.DataFrame
The raw_df of handler or result from previous processor
The raw_df of handler or result from previous processor.
"""
pass
@@ -68,7 +68,7 @@ class Processor(Serializable):
Returns
-------
bool:
if it is usable for inference
if it is usable for inference.
"""
return True

View File

@@ -32,7 +32,7 @@ class BaseDFilter(abc.ABC):
Parameters
----------
config : dict
dict of config parameters
dict of config parameters.
"""
raise NotImplementedError("Subclass of BaseDFilter must reimplement `from_config` method")
@@ -43,7 +43,7 @@ class BaseDFilter(abc.ABC):
Returns
----------
dict
return the dict of config parameters
return the dict of config parameters.
"""
raise NotImplementedError("Subclass of BaseDFilter must reimplement `to_config` method")
@@ -69,9 +69,9 @@ class SeriesDFilter(BaseDFilter):
Parameters
----------
fstart_time: str
the time for the filter rule to start filtering the instruments
the time for the filter rule to start filtering the instruments.
fend_time: str
the time for the filter rule to stop filtering the instruments
the time for the filter rule to stop filtering the instruments.
"""
super(SeriesDFilter, self).__init__()
self.filter_start_time = pd.Timestamp(fstart_time) if fstart_time else None
@@ -83,12 +83,12 @@ class SeriesDFilter(BaseDFilter):
Parameters
----------
instruments: dict
the dict of instruments in the form {instrument_name => list of timestamp tuple}
the dict of instruments in the form {instrument_name => list of timestamp tuple}.
Returns
----------
pd.Timestamp, pd.Timestamp
the lower time bound and upper time bound of all the instruments
the lower time bound and upper time bound of all the instruments.
"""
trange = Cal.calendar(freq=self.filter_freq)
ubound, lbound = trange[0], trange[-1]
@@ -105,14 +105,14 @@ class SeriesDFilter(BaseDFilter):
Parameters
----------
time_range : D.calendar
the time range of the instruments
the time range of the instruments.
target_timestamp : list
the list of tuple (timestamp, timestamp)
the list of tuple (timestamp, timestamp).
Returns
----------
pd.Series
the series of bool value for an instrument
the series of bool value for an instrument.
"""
# Construct a whole dict of {date => bool}
timestamp_series = {timestamp: False for timestamp in time_range}
@@ -124,19 +124,19 @@ class SeriesDFilter(BaseDFilter):
return timestamp_series
def _filterSeries(self, timestamp_series, filter_series):
"""Filter the timestamp series with filter series by using element-wise AND operation of the two series
"""Filter the timestamp series with filter series by using element-wise AND operation of the two series.
Parameters
----------
timestamp_series : pd.Series
the series of bool value indicating existing time
the series of bool value indicating existing time.
filter_series : pd.Series
the series of bool value indicating filter feature
the series of bool value indicating filter feature.
Returns
----------
pd.Series
the series of bool value indicating whether the date satisfies the filter condition and exists in target timestamp
the series of bool value indicating whether the date satisfies the filter condition and exists in target timestamp.
"""
fstart, fend = list(filter_series.keys())[0], list(filter_series.keys())[-1]
filter_series = filter_series.astype("bool") # Make sure the filter_series is boolean
@@ -144,17 +144,17 @@ class SeriesDFilter(BaseDFilter):
return timestamp_series
def _toTimestamp(self, timestamp_series):
"""Convert the timestamp series to a list of tuple (timestamp, timestamp) indicating a continuous range of TRUE
"""Convert the timestamp series to a list of tuple (timestamp, timestamp) indicating a continuous range of TRUE.
Parameters
----------
timestamp_series: pd.Series
the series of bool value after being filtered
the series of bool value after being filtered.
Returns
----------
list
the list of tuple (timestamp, timestamp)
the list of tuple (timestamp, timestamp).
"""
# sort the timestamp_series according to the timestamps
timestamp_series.sort_index()
@@ -194,18 +194,18 @@ class SeriesDFilter(BaseDFilter):
Parameters
----------
instruments : dict
the dict of instruments to be filtered
the dict of instruments to be filtered.
fstart : pd.Timestamp
start time of filter
start time of filter.
fend : pd.Timestamp
end time of filter
end time of filter.
.. note:: fstart/fend indicates the intersection of instruments start/end time and filter start/end time
.. note:: fstart/fend indicates the intersection of instruments start/end time and filter start/end time.
Returns
----------
pd.DataFrame
a series of {pd.Timestamp => bool}
a series of {pd.Timestamp => bool}.
"""
raise NotImplementedError("Subclass of SeriesDFilter must reimplement `getFilterSeries` method")
@@ -215,16 +215,16 @@ class SeriesDFilter(BaseDFilter):
Parameters
----------
instruments: dict
input instruments to be filtered
input instruments to be filtered.
start_time: str
start of the time range
start of the time range.
end_time: str
end of the time range
end of the time range.
Returns
----------
dict
filtered instruments, same structure as input instruments
filtered instruments, same structure as input instruments.
"""
lbound, ubound = self._getTimeBound(instruments)
start_time = pd.Timestamp(start_time or lbound)
@@ -272,7 +272,7 @@ class NameDFilter(SeriesDFilter):
params:
------
name_rule_re: str
regular expression for the name rule
regular expression for the name rule.
"""
super(NameDFilter, self).__init__(fstart_time, fend_time)
self.name_rule_re = name_rule_re
@@ -325,13 +325,13 @@ class ExpressionDFilter(SeriesDFilter):
params:
------
fstart_time: str
filter the feature starting from this time
filter the feature starting from this time.
fend_time: str
filter the feature ending by this time
filter the feature ending by this time.
rule_expression: str
an input expression for the rule
an input expression for the rule.
keep: bool
whether to keep the instruments of which features don't exist in the filter time span
whether to keep the instruments of which features don't exist in the filter time span.
"""
super(ExpressionDFilter, self).__init__(fstart_time, fend_time)
self.rule_expression = rule_expression