mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-06 04:20:57 +08:00
Support Point-in-time Data Operation (#343)
* add period ops class * black format * add pit data read * fix bug in period ops * update ops runnable * update PIT test example * black format * update PIT test * update tets_PIT * update code format * add check_feature_exist * black format * optimize the PIT Algorithm * fix bug * update example * update test_PIT name * add pit collector * black format * fix bugs * fix try * fix bug & add dump_pit.py * Successfully run and understand PIT * Add some docs and remove a bug * mv crypto collector * black format * Run succesfully after merging master * Pass test and fix code * remove useless PIT code * fix PYlint * Rename Co-authored-by: Young <afe.young@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from .data import (
|
||||
LocalCalendarProvider,
|
||||
LocalInstrumentProvider,
|
||||
LocalFeatureProvider,
|
||||
LocalPITProvider,
|
||||
LocalExpressionProvider,
|
||||
LocalDatasetProvider,
|
||||
ClientCalendarProvider,
|
||||
|
||||
@@ -6,12 +6,20 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import abc
|
||||
|
||||
import pandas as pd
|
||||
from ..log import get_module_logger
|
||||
|
||||
|
||||
class Expression(abc.ABC):
|
||||
"""Expression base class"""
|
||||
"""
|
||||
Expression base class
|
||||
|
||||
Expression is designed to handle the calculation of data with the format below
|
||||
data with two dimension for each instrument,
|
||||
- feature
|
||||
- time: it could be observation time or period time.
|
||||
- period time is designed for Point-in-time database. For example, the period time may be 2014Q4; its value can be observed multiple times (different values may be observed at different times due to amendments).
|
||||
"""
|
||||
|
||||
def __str__(self):
|
||||
return type(self).__name__
|
||||
@@ -124,8 +132,18 @@ class Expression(abc.ABC):
|
||||
|
||||
return Or(other, self)
|
||||
|
||||
def load(self, instrument, start_index, end_index, freq):
|
||||
def load(self, instrument, start_index, end_index, *args):
|
||||
"""load feature
|
||||
This function is responsible for loading feature/expression based on the expression engine.
|
||||
|
||||
The concrete implementation will be separated into two parts
|
||||
1) caching data, handle errors.
|
||||
- This part is shared by all the expressions and implemented in Expression
|
||||
2) processing and calculating data based on the specific expression.
|
||||
- This part is different in each expression and implemented in each expression
|
||||
|
||||
Expression Engine is shared by different data.
|
||||
Different data will have different extra information for `args`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -135,8 +153,15 @@ class Expression(abc.ABC):
|
||||
feature start index [in calendar].
|
||||
end_index : str
|
||||
feature end index [in calendar].
|
||||
freq : str
|
||||
feature frequency.
|
||||
|
||||
*args may contain the following information:
|
||||
1) if it is used in basic expression engine data, it contains following arguments
|
||||
freq : str
|
||||
feature frequency.
|
||||
|
||||
2) if it is used in PIT data, it contains following arguments
|
||||
cur_pit:
|
||||
it is designed for the point-in-time data.
|
||||
|
||||
Returns
|
||||
----------
|
||||
@@ -146,26 +171,26 @@ class Expression(abc.ABC):
|
||||
from .cache import H # pylint: disable=C0415
|
||||
|
||||
# cache
|
||||
args = str(self), instrument, start_index, end_index, freq
|
||||
if args in H["f"]:
|
||||
return H["f"][args]
|
||||
cache_key = str(self), instrument, start_index, end_index, *args
|
||||
if cache_key in H["f"]:
|
||||
return H["f"][cache_key]
|
||||
if start_index is not None and end_index is not None and start_index > end_index:
|
||||
raise ValueError("Invalid index range: {} {}".format(start_index, end_index))
|
||||
try:
|
||||
series = self._load_internal(instrument, start_index, end_index, freq)
|
||||
series = self._load_internal(instrument, start_index, end_index, *args)
|
||||
except Exception as e:
|
||||
get_module_logger("data").debug(
|
||||
f"Loading data error: instrument={instrument}, expression={str(self)}, "
|
||||
f"start_index={start_index}, end_index={end_index}, freq={freq}. "
|
||||
f"start_index={start_index}, end_index={end_index}, args={args}. "
|
||||
f"error info: {str(e)}"
|
||||
)
|
||||
raise
|
||||
series.name = str(self)
|
||||
H["f"][args] = series
|
||||
H["f"][cache_key] = series
|
||||
return series
|
||||
|
||||
@abc.abstractmethod
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
def _load_internal(self, instrument, start_index, end_index, *args) -> pd.Series:
|
||||
raise NotImplementedError("This function must be implemented in your newly defined feature")
|
||||
|
||||
@abc.abstractmethod
|
||||
@@ -225,6 +250,16 @@ class Feature(Expression):
|
||||
return 0, 0
|
||||
|
||||
|
||||
class PFeature(Feature):
    """Period (point-in-time) feature.

    Its string form is the field name prefixed with ``$$``, and its data is
    fetched through the PIT provider (``PITD``) instead of the regular
    feature provider.
    """

    def __str__(self):
        # Period fields are spelled with a leading "$$".
        prefix = "$$"
        return prefix + self._name

    def _load_internal(self, instrument, start_index, end_index, cur_time):
        """Load the period series for `instrument` as observed at `cur_time`.

        The arguments are forwarded unchanged to ``PITD.period_feature``
        together with this feature's string form as the field name.
        """
        # Kept at function scope: `.data` imports `.ops`, which imports this module.
        from .data import PITD  # pylint: disable=C0415

        field_name = str(self)
        return PITD.period_feature(instrument, field_name, start_index, end_index, cur_time)
|
||||
|
||||
|
||||
class ExpressionOps(Expression):
|
||||
"""Operator Expression
|
||||
|
||||
|
||||
@@ -34,6 +34,8 @@ from ..utils import (
|
||||
code_to_fname,
|
||||
set_log_with_config,
|
||||
time_to_slc_point,
|
||||
read_period_data,
|
||||
get_period_list,
|
||||
)
|
||||
from ..utils.paral import ParallelExt
|
||||
from .ops import Operators # pylint: disable=W0611
|
||||
@@ -331,6 +333,38 @@ class FeatureProvider(abc.ABC):
|
||||
raise NotImplementedError("Subclass of FeatureProvider must implement `feature` method")
|
||||
|
||||
|
||||
class PITProvider(abc.ABC):
    """Abstract interface for providers of point-in-time (PIT) period data."""

    @abc.abstractmethod
    def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
        """
        Get the historical period data series between `start_index` and `end_index`.

        Parameters
        ----------
        instrument
            the instrument whose period data is queried.

        field
            the period field to query.

        start_index: int
            start_index is a relative index to the latest period to cur_time.

        end_index: int
            end_index is a relative index to the latest period to cur_time.
            In most cases, start_index and end_index will be non-positive values.
            For example, if start_index == -3, end_index == 0 and the current period index is cur_idx,
            then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved.

        cur_time: pd.Timestamp
            the observation time; the indices above are relative to the latest period to this time.

        Returns
        -------
        pd.Series
            The index will be integers to indicate the periods of the data.
            TODO: add a typical example.

        Raises
        ------
        FileNotFoundError
            This exception will be raised if the queried data do not exist.
        """
        # Plain string: the message has no placeholders, so no f-string is needed.
        raise NotImplementedError("Please implement the `period_feature` method")
|
||||
|
||||
|
||||
class ExpressionProvider(abc.ABC):
|
||||
"""Expression provider class
|
||||
|
||||
@@ -694,6 +728,89 @@ class LocalFeatureProvider(FeatureProvider, ProviderBackendMixin):
|
||||
return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1]
|
||||
|
||||
|
||||
class LocalPITProvider(PITProvider):
    """PIT provider reading period data from local ``financial/<instrument>/<field>.index|.data`` files."""

    # TODO: Add PIT backend file storage
    # NOTE: This class is not multi-threading-safe!!!!

    def period_feature(self, instrument, field, start_index, end_index, cur_time):
        """Return the period values of `field` for `instrument` as observed at `cur_time`.

        Parameters
        ----------
        instrument
            instrument code; it is converted with `code_to_fname` and lower-cased to locate the files.
        field
            period field name; its first two characters are dropped (the ``$$`` prefix) and
            the remainder must end with ``_q`` or ``_a``.
        start_index : int
            index relative to the latest period known at `cur_time`.
        end_index : int
            index relative to the latest period known at `cur_time`; must be <= 0.
        cur_time : pd.Timestamp
            the observation time.

        Returns
        -------
        pd.Series
            values indexed by period; an empty series (with the configured value dtype)
            when no record is dated on or before `cur_time`.

        Raises
        ------
        ValueError
            if `cur_time` is not a ``pd.Timestamp`` or the field suffix is invalid.
        FileNotFoundError
            if the index file or the data file of the field does not exist.
        """
        if not isinstance(cur_time, pd.Timestamp):
            raise ValueError(
                f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
            )

        assert end_index <= 0  # PIT don't support querying future data

        value_dtype = C.pit_record_type["value"]
        record_dtype = [
            ("date", C.pit_record_type["date"]),
            ("period", C.pit_record_type["period"]),
            ("value", value_dtype),
            ("_next", C.pit_record_type["index"]),
        ]

        # Drop the two leading characters (the "$$" prefix of a period field name).
        field = str(field).lower()[2:]
        instrument = code_to_fname(instrument)

        if not field.endswith(("_q", "_a")):
            raise ValueError("period field must ends with '_q' or '_a'")
        quarterly = field.endswith("_q")

        field_dir = C.dpm.get_data_uri() / "financial" / instrument.lower()
        index_path = field_dir / f"{field}.index"
        data_path = field_dir / f"{field}.data"
        if not (index_path.exists() and data_path.exists()):
            raise FileNotFoundError(
                f"No PIT file is found for instrument='{instrument}', field='{field}': "
                f"expected both {index_path} and {data_path}"
            )

        # NOTE: The most significant performance loss is here.
        # A former per-period index "acceleration" was deliberately dropped to keep the logic simple:
        #   - it made the parameters of the interface complicated;
        #   - it did not perform optimally anyway (handling all the pieces together may be faster);
        #   - with a careful design, the data could be scanned only once to get its historical evolution.
        # The intended replacement is a cache for the index file.
        data = np.fromfile(data_path, dtype=record_dtype)

        # Find all revisions dated on or before `cur_time`
        # (NOTE(review): `searchsorted` assumes the records are sorted by date - confirm).
        cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day)
        loc = np.searchsorted(data["date"], cur_time_int, side="right")
        if loc <= 0:
            # Nothing observed yet: keep the dtype consistent with the non-empty result.
            return pd.Series(dtype=value_dtype)

        known_periods = data["period"][:loc]
        last_period = known_periods.max()  # the latest period observed so far
        first_period = known_periods.min()

        period_list = get_period_list(first_period, last_period, quarterly)
        # Map the relative [start_index, end_index] window onto the list, whose last item is the current period.
        n_periods = len(period_list)
        period_list = period_list[max(0, n_periods + start_index - 1) : n_periods + end_index]

        value = np.full((len(period_list),), np.nan, dtype=value_dtype)
        for i, period in enumerate(period_list):
            # The second returned item (a period index) was only used by the removed acceleration.
            value[i], _ = read_period_data(index_path, data_path, period, cur_time_int, quarterly)

        # NOTE: the index is period_list; so it may result in unexpected values (e.g. nan)
        # when calculating between different features and only part of the financial indicators is published.
        return pd.Series(value, index=period_list, dtype=value_dtype)
|
||||
|
||||
|
||||
class LocalExpressionProvider(ExpressionProvider):
|
||||
"""Local expression data provider class
|
||||
|
||||
@@ -1003,6 +1120,8 @@ class ClientDatasetProvider(DatasetProvider):
|
||||
|
||||
class BaseProvider:
|
||||
"""Local provider class
|
||||
It is a set of interface that allow users to access data.
|
||||
PITD is not exposed publicly to users, so it is not included in the interface.
|
||||
|
||||
To keep compatible with old qlib provider.
|
||||
"""
|
||||
@@ -1126,6 +1245,7 @@ if sys.version_info >= (3, 9):
|
||||
CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
|
||||
InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
|
||||
FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
|
||||
PITProviderWrapper = Annotated[PITProvider, Wrapper]
|
||||
ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
|
||||
DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
|
||||
BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
|
||||
@@ -1133,6 +1253,7 @@ else:
|
||||
CalendarProviderWrapper = CalendarProvider
|
||||
InstrumentProviderWrapper = InstrumentProvider
|
||||
FeatureProviderWrapper = FeatureProvider
|
||||
PITProviderWrapper = PITProvider
|
||||
ExpressionProviderWrapper = ExpressionProvider
|
||||
DatasetProviderWrapper = DatasetProvider
|
||||
BaseProviderWrapper = BaseProvider
|
||||
@@ -1140,6 +1261,7 @@ else:
|
||||
Cal: CalendarProviderWrapper = Wrapper()
|
||||
Inst: InstrumentProviderWrapper = Wrapper()
|
||||
FeatureD: FeatureProviderWrapper = Wrapper()
|
||||
PITD: PITProviderWrapper = Wrapper()
|
||||
ExpressionD: ExpressionProviderWrapper = Wrapper()
|
||||
DatasetD: DatasetProviderWrapper = Wrapper()
|
||||
D: BaseProviderWrapper = Wrapper()
|
||||
@@ -1165,6 +1287,11 @@ def register_all_wrappers(C):
|
||||
register_wrapper(FeatureD, feature_provider, "qlib.data")
|
||||
logger.debug(f"registering FeatureD {C.feature_provider}")
|
||||
|
||||
if getattr(C, "pit_provider", None) is not None:
|
||||
pit_provider = init_instance_by_config(C.pit_provider, module)
|
||||
register_wrapper(PITD, pit_provider, "qlib.data")
|
||||
logger.debug(f"registering PITD {C.pit_provider}")
|
||||
|
||||
if getattr(C, "expression_provider", None) is not None:
|
||||
# This provider is unnecessary in client provider
|
||||
_eprovider = init_instance_by_config(C.expression_provider, module)
|
||||
|
||||
141
qlib/data/ops.py
141
qlib/data/ops.py
@@ -10,9 +10,7 @@ import pandas as pd
|
||||
|
||||
from typing import Union, List, Type
|
||||
from scipy.stats import percentileofscore
|
||||
|
||||
from .base import Expression, ExpressionOps, Feature
|
||||
|
||||
from .base import Expression, ExpressionOps, Feature, PFeature
|
||||
from ..log import get_module_logger
|
||||
from ..utils import get_callable_kwargs
|
||||
|
||||
@@ -84,8 +82,8 @@ class NpElemOperator(ElemOperator):
|
||||
self.func = func
|
||||
super(NpElemOperator, self).__init__(feature)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
return getattr(np, self.func)(series)
|
||||
|
||||
|
||||
@@ -124,11 +122,11 @@ class Sign(NpElemOperator):
|
||||
def __init__(self, feature):
|
||||
super(Sign, self).__init__(feature, "sign")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
"""
|
||||
To avoid error raised by bool type input, we transform the data into float32.
|
||||
"""
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# TODO: More precision types should be configurable
|
||||
series = series.astype(np.float32)
|
||||
return getattr(np, self.func)(series)
|
||||
@@ -173,8 +171,8 @@ class Power(NpElemOperator):
|
||||
def __str__(self):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature, self.exponent)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
return getattr(np, self.func)(series, self.exponent)
|
||||
|
||||
|
||||
@@ -201,8 +199,8 @@ class Mask(NpElemOperator):
|
||||
def __str__(self):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower())
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
return self.feature.load(self.instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
return self.feature.load(self.instrument, start_index, end_index, *args)
|
||||
|
||||
|
||||
class Not(NpElemOperator):
|
||||
@@ -252,24 +250,24 @@ class PairOperator(ExpressionOps):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
if isinstance(self.feature_left, Expression):
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
left_br = self.feature_left.get_longest_back_rolling()
|
||||
else:
|
||||
left_br = 0
|
||||
|
||||
if isinstance(self.feature_right, Expression):
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
right_br = self.feature_right.get_longest_back_rolling()
|
||||
else:
|
||||
right_br = 0
|
||||
return max(left_br, right_br)
|
||||
|
||||
def get_extended_window_size(self):
|
||||
if isinstance(self.feature_left, Expression):
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
ll, lr = self.feature_left.get_extended_window_size()
|
||||
else:
|
||||
ll, lr = 0, 0
|
||||
|
||||
if isinstance(self.feature_right, Expression):
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
rl, rr = self.feature_right.get_extended_window_size()
|
||||
else:
|
||||
rl, rr = 0, 0
|
||||
@@ -298,16 +296,16 @@ class NpPairOperator(PairOperator):
|
||||
self.func = func
|
||||
super(NpPairOperator, self).__init__(feature_left, feature_right)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
assert any(
|
||||
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
|
||||
[isinstance(self.feature_left, (Expression,)), self.feature_right, Expression]
|
||||
), "at least one of two inputs is Expression instance"
|
||||
if isinstance(self.feature_left, Expression):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_left = self.feature_left # numeric value
|
||||
if isinstance(self.feature_right, Expression):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_right = self.feature_right
|
||||
check_length = isinstance(series_left, (np.ndarray, pd.Series)) and isinstance(
|
||||
@@ -637,48 +635,48 @@ class If(ExpressionOps):
|
||||
def __str__(self):
|
||||
return "If({},{},{})".format(self.condition, self.feature_left, self.feature_right)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series_cond = self.condition.load(instrument, start_index, end_index, freq)
|
||||
if isinstance(self.feature_left, Expression):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series_cond = self.condition.load(instrument, start_index, end_index, *args)
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_left = self.feature_left
|
||||
if isinstance(self.feature_right, Expression):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_right = self.feature_right
|
||||
series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index)
|
||||
return series
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
if isinstance(self.feature_left, Expression):
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
left_br = self.feature_left.get_longest_back_rolling()
|
||||
else:
|
||||
left_br = 0
|
||||
|
||||
if isinstance(self.feature_right, Expression):
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
right_br = self.feature_right.get_longest_back_rolling()
|
||||
else:
|
||||
right_br = 0
|
||||
|
||||
if isinstance(self.condition, Expression):
|
||||
if isinstance(self.condition, (Expression,)):
|
||||
c_br = self.condition.get_longest_back_rolling()
|
||||
else:
|
||||
c_br = 0
|
||||
return max(left_br, right_br, c_br)
|
||||
|
||||
def get_extended_window_size(self):
|
||||
if isinstance(self.feature_left, Expression):
|
||||
if isinstance(self.feature_left, (Expression,)):
|
||||
ll, lr = self.feature_left.get_extended_window_size()
|
||||
else:
|
||||
ll, lr = 0, 0
|
||||
|
||||
if isinstance(self.feature_right, Expression):
|
||||
if isinstance(self.feature_right, (Expression,)):
|
||||
rl, rr = self.feature_right.get_extended_window_size()
|
||||
else:
|
||||
rl, rr = 0, 0
|
||||
|
||||
if isinstance(self.condition, Expression):
|
||||
if isinstance(self.condition, (Expression,)):
|
||||
cl, cr = self.condition.get_extended_window_size()
|
||||
else:
|
||||
cl, cr = 0, 0
|
||||
@@ -719,8 +717,8 @@ class Rolling(ExpressionOps):
|
||||
def __str__(self):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature, self.N)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# NOTE: remove all null check,
|
||||
# now it's user's responsibility to decide whether use features in null days
|
||||
# isnull = series.isnull() # NOTE: isnull = NaN, inf is not null
|
||||
@@ -777,8 +775,8 @@ class Ref(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Ref, self).__init__(feature, N, "ref")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# N = 0, return first day
|
||||
if series.empty:
|
||||
return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049
|
||||
@@ -967,8 +965,8 @@ class IdxMax(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(IdxMax, self).__init__(feature, N, "idxmax")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True)
|
||||
else:
|
||||
@@ -1015,8 +1013,8 @@ class IdxMin(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(IdxMin, self).__init__(feature, N, "idxmin")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True)
|
||||
else:
|
||||
@@ -1047,8 +1045,8 @@ class Quantile(Rolling):
|
||||
def __str__(self):
|
||||
return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = series.expanding(min_periods=1).quantile(self.qscore)
|
||||
else:
|
||||
@@ -1095,8 +1093,8 @@ class Mad(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Mad, self).__init__(feature, N, "mad")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# TODO: implement in Cython
|
||||
|
||||
def mad(x):
|
||||
@@ -1129,8 +1127,8 @@ class Rank(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Rank, self).__init__(feature, N, "rank")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# TODO: implement in Cython
|
||||
|
||||
def rank(x):
|
||||
@@ -1187,8 +1185,8 @@ class Delta(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Delta, self).__init__(feature, N, "delta")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = series - series.iloc[0]
|
||||
else:
|
||||
@@ -1225,8 +1223,8 @@ class Slope(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Slope, self).__init__(feature, N, "slope")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = pd.Series(expanding_slope(series.values), index=series.index)
|
||||
else:
|
||||
@@ -1253,8 +1251,8 @@ class Rsquare(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Rsquare, self).__init__(feature, N, "rsquare")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
_series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
_series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = pd.Series(expanding_rsquare(_series.values), index=_series.index)
|
||||
else:
|
||||
@@ -1282,8 +1280,8 @@ class Resi(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(Resi, self).__init__(feature, N, "resi")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
if self.N == 0:
|
||||
series = pd.Series(expanding_resi(series.values), index=series.index)
|
||||
else:
|
||||
@@ -1310,8 +1308,8 @@ class WMA(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(WMA, self).__init__(feature, N, "wma")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
# TODO: implement in Cython
|
||||
|
||||
def weighted_mean(x):
|
||||
@@ -1345,8 +1343,8 @@ class EMA(Rolling):
|
||||
def __init__(self, feature, N):
|
||||
super(EMA, self).__init__(feature, N, "ema")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
|
||||
def exp_weighted_mean(x):
|
||||
a = 1 - 2 / (1 + len(x))
|
||||
@@ -1392,17 +1390,17 @@ class PairRolling(ExpressionOps):
|
||||
def __str__(self):
|
||||
return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
assert any(
|
||||
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
|
||||
), "at least one of two inputs is Expression instance"
|
||||
|
||||
if isinstance(self.feature_left, Expression):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_left = self.feature_left # numeric value
|
||||
if isinstance(self.feature_right, Expression):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
|
||||
else:
|
||||
series_right = self.feature_right
|
||||
|
||||
@@ -1465,12 +1463,12 @@ class Corr(PairRolling):
|
||||
def __init__(self, feature_left, feature_right, N):
|
||||
super(Corr, self).__init__(feature_left, feature_right, N, "corr")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, *args)
|
||||
|
||||
# NOTE: Load uses MemCache, so calling load again will not cause performance degradation
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
|
||||
res.loc[
|
||||
np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
|
||||
| np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
|
||||
@@ -1529,8 +1527,8 @@ class TResample(ElemOperator):
|
||||
def __str__(self):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature, self.freq)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
def _load_internal(self, instrument, start_index, end_index, *args):
|
||||
series = self.feature.load(instrument, start_index, end_index, *args)
|
||||
|
||||
if series.empty:
|
||||
return series
|
||||
@@ -1590,6 +1588,7 @@ OpsList = [
|
||||
IdxMin,
|
||||
If,
|
||||
Feature,
|
||||
PFeature,
|
||||
] + [TResample]
|
||||
|
||||
|
||||
@@ -1622,7 +1621,7 @@ class OpsWrapper:
|
||||
else:
|
||||
_ops_class = _operator
|
||||
|
||||
if not issubclass(_ops_class, Expression):
|
||||
if not issubclass(_ops_class, (Expression,)):
|
||||
raise TypeError("operator must be subclass of ExpressionOps, not {}".format(_ops_class))
|
||||
|
||||
if _ops_class.__name__ in self._ops:
|
||||
@@ -1644,8 +1643,10 @@ def register_all_ops(C):
|
||||
"""register all operator"""
|
||||
logger = get_module_logger("ops")
|
||||
|
||||
from qlib.data.pit import P # pylint: disable=C0415
|
||||
|
||||
Operators.reset()
|
||||
Operators.register(OpsList)
|
||||
Operators.register(OpsList + [P])
|
||||
|
||||
if getattr(C, "custom_ops", None) is not None:
|
||||
Operators.register(C.custom_ops)
|
||||
|
||||
57
qlib/data/pit.py
Normal file
57
qlib/data/pit.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Qlib follows the logic below to support point-in-time database
|
||||
|
||||
For each stock, the format of its data is <observe_time, feature>. Expression Engine support calculation on such format of data
|
||||
|
||||
To calculate the feature value f_t at a specific observe time t, data with format <period_time, feature> will be used.
|
||||
For example, the average earning of last 4 quarters (period_time) on 20190719 (observe_time)
|
||||
|
||||
The calculation of both <period_time, feature> and <observe_time, feature> data rely on expression engine. It consists of 2 phases.
|
||||
1) calculate <period_time, feature> at each observation time t and it will be collapsed into a point (just like a normal feature)
|
||||
2) concatenate all the collapsed data, we will get data with format <observe_time, feature>.
|
||||
Qlib will use the operator `P` to perform the collapse.
|
||||
"""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from qlib.data.ops import ElemOperator
|
||||
from qlib.log import get_module_logger
|
||||
from .data import Cal
|
||||
|
||||
|
||||
class P(ElemOperator):
    """Collapse a period (point-in-time) expression into a normal observe-time series.

    For every calendar step in the requested range, the wrapped expression is
    evaluated on period data as observed at that time, and the last element of
    the result is kept.
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Evaluate the wrapped period expression at each calendar step.

        Parameters
        ----------
        instrument
            instrument passed through to the wrapped expression.
        start_index, end_index : int
            inclusive calendar index range to compute.
        freq
            frequency used to fetch the calendar.

        Returns
        -------
        pd.Series
            float32 series indexed by ``RangeIndex(start_index, end_index + 1)``;
            an empty float32 series if the period data files are not found.

        Raises
        ------
        ValueError
            if the wrapped expression refers to future periods.
        """
        _calendar = Cal.calendar(freq=freq)
        resample_data = np.empty(end_index - start_index + 1, dtype="float32")

        # To load expression accurately, more historical data are required.
        # The window size takes no per-date argument, so it is queried and validated once.
        start_ws, end_ws = self.feature.get_extended_window_size()
        if end_ws > 0:
            raise ValueError(
                "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported)"
            )

        for cur_index in range(start_index, end_index + 1):
            cur_time = _calendar[cur_index]
            # The calculated value will always be the last element, so the end offset is zero.
            try:
                s = self.feature.load(instrument, -start_ws, 0, cur_time)
            except FileNotFoundError:
                get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
                return pd.Series(dtype="float32", name=str(self))
            resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan

        resample_series = pd.Series(
            resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)
        )
        return resample_series

    def get_longest_back_rolling(self):
        # The period data will collapse as a normal feature. So no extending and looking back
        return 0

    def get_extended_window_size(self):
        # The period data will collapse as a normal feature. So no extending and looking back
        return 0, 0
|
||||
Reference in New Issue
Block a user