1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-06 04:20:57 +08:00

Support Point-in-time Data Operation (#343)

* add period ops class

* black format

* add pit data read

* fix bug in period ops

* update ops runnable

* update PIT test example

* black format

* update PIT test

* update test_PIT

* update code format

* add check_feature_exist

* black format

* optimize the PIT Algorithm

* fix bug

* update example

* update test_PIT name

* add pit collector

* black format

* fix bugs

* fix try

* fix bug & add dump_pit.py

* Successfully run and understand PIT

* Add some docs and remove a bug

* mv crypto collector

* black format

* Run successfully after merging master

* Pass test and fix code

* remove useless PIT code

* fix PYlint

* Rename

Co-authored-by: Young <afe.young@gmail.com>
This commit is contained in:
bxdd
2022-03-10 14:27:52 +08:00
committed by GitHub
parent 3a911bc09b
commit faa99f30fa
19 changed files with 1459 additions and 141 deletions

View File

@@ -15,6 +15,7 @@ from .data import (
LocalCalendarProvider,
LocalInstrumentProvider,
LocalFeatureProvider,
LocalPITProvider,
LocalExpressionProvider,
LocalDatasetProvider,
ClientCalendarProvider,

View File

@@ -6,12 +6,20 @@ from __future__ import division
from __future__ import print_function
import abc
import pandas as pd
from ..log import get_module_logger
class Expression(abc.ABC):
"""Expression base class"""
"""
Expression base class
Expression is designed to handle the calculation of data with the format below
data with two dimension for each instrument,
- feature
- time: it could be observation time or period time.
- period time is designed for the Point-in-time database. For example, the period time may be 2014Q4; its value can be observed multiple times (different values may be observed at different times due to amendments).
"""
def __str__(self):
return type(self).__name__
@@ -124,8 +132,18 @@ class Expression(abc.ABC):
return Or(other, self)
def load(self, instrument, start_index, end_index, freq):
def load(self, instrument, start_index, end_index, *args):
"""load feature
This function is responsible for loading feature/expression based on the expression engine.
The concrete implementation is separated into two parts
1) caching data, handle errors.
- This part is shared by all the expressions and implemented in Expression
2) processing and calculating data based on the specific expression.
- This part is different in each expression and implemented in each expression
The Expression Engine is shared by different kinds of data.
Different kinds of data carry different extra information in `args`.
Parameters
----------
@@ -135,8 +153,15 @@ class Expression(abc.ABC):
feature start index [in calendar].
end_index : str
feature end index [in calendar].
freq : str
feature frequency.
*args may contain the following information:
1) if it is used with basic expression engine data, it contains the following arguments
freq : str
feature frequency.
2) if it is used with PIT data, it contains the following arguments
cur_pit:
it is designed for the point-in-time data.
Returns
----------
@@ -146,26 +171,26 @@ class Expression(abc.ABC):
from .cache import H # pylint: disable=C0415
# cache
args = str(self), instrument, start_index, end_index, freq
if args in H["f"]:
return H["f"][args]
cache_key = str(self), instrument, start_index, end_index, *args
if cache_key in H["f"]:
return H["f"][cache_key]
if start_index is not None and end_index is not None and start_index > end_index:
raise ValueError("Invalid index range: {} {}".format(start_index, end_index))
try:
series = self._load_internal(instrument, start_index, end_index, freq)
series = self._load_internal(instrument, start_index, end_index, *args)
except Exception as e:
get_module_logger("data").debug(
f"Loading data error: instrument={instrument}, expression={str(self)}, "
f"start_index={start_index}, end_index={end_index}, freq={freq}. "
f"start_index={start_index}, end_index={end_index}, args={args}. "
f"error info: {str(e)}"
)
raise
series.name = str(self)
H["f"][args] = series
H["f"][cache_key] = series
return series
@abc.abstractmethod
def _load_internal(self, instrument, start_index, end_index, freq):
def _load_internal(self, instrument, start_index, end_index, *args) -> pd.Series:
raise NotImplementedError("This function must be implemented in your newly defined feature")
@abc.abstractmethod
@@ -225,6 +250,16 @@ class Feature(Expression):
return 0, 0
class PFeature(Feature):
    """Period (point-in-time) feature whose string form carries a ``$$`` prefix."""

    def __str__(self):
        # Two dollar signs are prepended to the feature name.
        prefix = "$$"
        return prefix + self._name

    def _load_internal(self, instrument, start_index, end_index, cur_time):
        """Delegate loading to ``PITD.period_feature`` using this feature's string form as the field."""
        # Imported inside the method, exactly as the original does.
        from .data import PITD  # pylint: disable=C0415

        field_name = str(self)
        return PITD.period_feature(instrument, field_name, start_index, end_index, cur_time)
class ExpressionOps(Expression):
"""Operator Expression

View File

@@ -34,6 +34,8 @@ from ..utils import (
code_to_fname,
set_log_with_config,
time_to_slc_point,
read_period_data,
get_period_list,
)
from ..utils.paral import ParallelExt
from .ops import Operators # pylint: disable=W0611
@@ -331,6 +333,38 @@ class FeatureProvider(abc.ABC):
raise NotImplementedError("Subclass of FeatureProvider must implement `feature` method")
class PITProvider(abc.ABC):
    """Abstract interface for providers of point-in-time (period) data."""

    @abc.abstractmethod
    def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
        """
        Retrieve the series of historical period data lying between `start_index` and `end_index`.

        Parameters
        ----------
        start_index: int
            index relative to the latest period as of `cur_time`
        end_index: int
            index relative to the latest period as of `cur_time`;
            in most cases both `start_index` and `end_index` are non-positive.

            Example: with start_index == -3, end_index == 0 and the current period index being cur_idx,
            the data in [start_index + cur_idx, end_index + cur_idx] is retrieved.

        Returns
        -------
        pd.Series
            The index consists of integers identifying the periods of the data.
            A typical example:
                TODO

        Raises
        ------
        FileNotFoundError
            Raised when the queried data does not exist.
        """
        message = "Please implement the `period_feature` method"
        raise NotImplementedError(message)
class ExpressionProvider(abc.ABC):
"""Expression provider class
@@ -694,6 +728,89 @@ class LocalFeatureProvider(FeatureProvider, ProviderBackendMixin):
return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1]
class LocalPITProvider(PITProvider):
    """PIT provider backed by per-instrument ``<field>.index`` / ``<field>.data`` files on local storage."""

    # TODO: Add PIT backend file storage
    # NOTE: This class is not multi-threading-safe!!!!

    def period_feature(self, instrument, field, start_index, end_index, cur_time):
        """
        Load the values of a period field as they were known at `cur_time`.

        Parameters
        ----------
        instrument
            Instrument code; it is converted with `code_to_fname` and lower-cased to locate the data directory.
        field
            Period field name carrying a two-character prefix (e.g. '$$roewa_q'). After the prefix is
            removed and the name lower-cased it must end with '_q' (quarterly) or '_a'
            (presumably annual -- confirm against the dump format).
        start_index : int
            Offset of the first requested period relative to the latest period visible at `cur_time`.
        end_index : int
            Offset of the last requested period relative to the latest visible period; must be <= 0.
        cur_time : pd.Timestamp
            Observation time used to select which records are visible.

        Returns
        -------
        pd.Series
            Values indexed by period. Empty when no record is located at or before `cur_time`.

        Raises
        ------
        ValueError
            If `cur_time` is not a `pd.Timestamp`, if `end_index` is positive, or if the field suffix
            is neither '_q' nor '_a'.
        FileNotFoundError
            If the index file or the data file of the field does not exist.
        """
        if not isinstance(cur_time, pd.Timestamp):
            raise ValueError(
                f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
            )

        # PIT doesn't support querying future data. An explicit check is used instead of `assert`
        # so that the validation is still performed when Python runs with -O.
        if end_index > 0:
            raise ValueError(f"PIT data does not support querying future periods, got end_index={end_index}")

        value_dtype = C.pit_record_type["value"]
        record_dtype = [
            ("date", C.pit_record_type["date"]),
            ("period", C.pit_record_type["period"]),
            ("value", value_dtype),
            ("_next", C.pit_record_type["index"]),
        ]

        # Drop the two-character prefix (e.g. "$$") and normalise the case.
        field = str(field).lower()[2:]
        instrument = code_to_fname(instrument)

        if not field.endswith(("_q", "_a")):
            raise ValueError("period field must ends with '_q' or '_a'")
        quarterly = field.endswith("_q")

        field_dir = C.dpm.get_data_uri() / "financial" / instrument.lower()
        index_path = field_dir / f"{field}.index"
        data_path = field_dir / f"{field}.data"
        if not (index_path.exists() and data_path.exists()):
            raise FileNotFoundError(
                f"No PIT data found for field '{field}' of instrument '{instrument}': "
                f"expected both {index_path} and {data_path}"
            )

        # NOTE: The most significant performance loss is here (the whole data file is read on each call).
        # A previous per-period index acceleration was deliberately dropped because it complicated the
        # interface without reaching optimal performance; a cache for the index file is the intended
        # replacement.
        data = np.fromfile(data_path, dtype=record_dtype)

        # Find all revision records at or before `cur_time`; the date is encoded as the integer YYYYMMDD.
        # NOTE(review): `searchsorted` assumes the records are stored in ascending date order -- confirm.
        cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day)
        loc = np.searchsorted(data["date"], cur_time_int, side="right")
        if loc <= 0:
            # Nothing has been published yet; keep the dtype consistent with the non-empty result.
            return pd.Series(dtype=value_dtype)

        visible_periods = data["period"][:loc]
        last_period = visible_periods.max()  # the latest period visible at `cur_time`
        first_period = visible_periods.min()
        period_list = get_period_list(first_period, last_period, quarterly)
        # Offsets are relative to the latest period, whose position is len(period_list) - 1.
        n_periods = len(period_list)
        period_list = period_list[max(0, n_periods + start_index - 1) : n_periods + end_index]

        value = np.full((len(period_list),), np.nan, dtype=value_dtype)
        for i, period in enumerate(period_list):
            # The second element returned by `read_period_data` is not needed here.
            value[i], _ = read_period_data(index_path, data_path, period, cur_time_int, quarterly)

        # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
        # when calculation between different features and only part of its financial indicator is published
        return pd.Series(value, index=period_list, dtype=value_dtype)
class LocalExpressionProvider(ExpressionProvider):
"""Local expression data provider class
@@ -1003,6 +1120,8 @@ class ClientDatasetProvider(DatasetProvider):
class BaseProvider:
"""Local provider class
It is a set of interface that allow users to access data.
PITD is not exposed publicly to users, so it is not included in the interface.
To keep compatible with old qlib provider.
"""
@@ -1126,6 +1245,7 @@ if sys.version_info >= (3, 9):
CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
PITProviderWrapper = Annotated[PITProvider, Wrapper]
ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
@@ -1133,6 +1253,7 @@ else:
CalendarProviderWrapper = CalendarProvider
InstrumentProviderWrapper = InstrumentProvider
FeatureProviderWrapper = FeatureProvider
PITProviderWrapper = PITProvider
ExpressionProviderWrapper = ExpressionProvider
DatasetProviderWrapper = DatasetProvider
BaseProviderWrapper = BaseProvider
@@ -1140,6 +1261,7 @@ else:
Cal: CalendarProviderWrapper = Wrapper()
Inst: InstrumentProviderWrapper = Wrapper()
FeatureD: FeatureProviderWrapper = Wrapper()
PITD: PITProviderWrapper = Wrapper()
ExpressionD: ExpressionProviderWrapper = Wrapper()
DatasetD: DatasetProviderWrapper = Wrapper()
D: BaseProviderWrapper = Wrapper()
@@ -1165,6 +1287,11 @@ def register_all_wrappers(C):
register_wrapper(FeatureD, feature_provider, "qlib.data")
logger.debug(f"registering FeatureD {C.feature_provider}")
if getattr(C, "pit_provider", None) is not None:
pit_provider = init_instance_by_config(C.pit_provider, module)
register_wrapper(PITD, pit_provider, "qlib.data")
logger.debug(f"registering PITD {C.pit_provider}")
if getattr(C, "expression_provider", None) is not None:
# This provider is unnecessary in client provider
_eprovider = init_instance_by_config(C.expression_provider, module)

View File

@@ -10,9 +10,7 @@ import pandas as pd
from typing import Union, List, Type
from scipy.stats import percentileofscore
from .base import Expression, ExpressionOps, Feature
from .base import Expression, ExpressionOps, Feature, PFeature
from ..log import get_module_logger
from ..utils import get_callable_kwargs
@@ -84,8 +82,8 @@ class NpElemOperator(ElemOperator):
self.func = func
super(NpElemOperator, self).__init__(feature)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
return getattr(np, self.func)(series)
@@ -124,11 +122,11 @@ class Sign(NpElemOperator):
def __init__(self, feature):
super(Sign, self).__init__(feature, "sign")
def _load_internal(self, instrument, start_index, end_index, freq):
def _load_internal(self, instrument, start_index, end_index, *args):
"""
To avoid error raised by bool type input, we transform the data into float32.
"""
series = self.feature.load(instrument, start_index, end_index, freq)
series = self.feature.load(instrument, start_index, end_index, *args)
# TODO: More precision types should be configurable
series = series.astype(np.float32)
return getattr(np, self.func)(series)
@@ -173,8 +171,8 @@ class Power(NpElemOperator):
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature, self.exponent)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
return getattr(np, self.func)(series, self.exponent)
@@ -201,8 +199,8 @@ class Mask(NpElemOperator):
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower())
def _load_internal(self, instrument, start_index, end_index, freq):
return self.feature.load(self.instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
return self.feature.load(self.instrument, start_index, end_index, *args)
class Not(NpElemOperator):
@@ -252,24 +250,24 @@ class PairOperator(ExpressionOps):
return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)
def get_longest_back_rolling(self):
if isinstance(self.feature_left, Expression):
if isinstance(self.feature_left, (Expression,)):
left_br = self.feature_left.get_longest_back_rolling()
else:
left_br = 0
if isinstance(self.feature_right, Expression):
if isinstance(self.feature_right, (Expression,)):
right_br = self.feature_right.get_longest_back_rolling()
else:
right_br = 0
return max(left_br, right_br)
def get_extended_window_size(self):
if isinstance(self.feature_left, Expression):
if isinstance(self.feature_left, (Expression,)):
ll, lr = self.feature_left.get_extended_window_size()
else:
ll, lr = 0, 0
if isinstance(self.feature_right, Expression):
if isinstance(self.feature_right, (Expression,)):
rl, rr = self.feature_right.get_extended_window_size()
else:
rl, rr = 0, 0
@@ -298,16 +296,16 @@ class NpPairOperator(PairOperator):
self.func = func
super(NpPairOperator, self).__init__(feature_left, feature_right)
def _load_internal(self, instrument, start_index, end_index, freq):
def _load_internal(self, instrument, start_index, end_index, *args):
assert any(
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
[isinstance(self.feature_left, (Expression,)), self.feature_right, Expression]
), "at least one of two inputs is Expression instance"
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
if isinstance(self.feature_left, (Expression,)):
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
else:
series_left = self.feature_left # numeric value
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
if isinstance(self.feature_right, (Expression,)):
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
else:
series_right = self.feature_right
check_length = isinstance(series_left, (np.ndarray, pd.Series)) and isinstance(
@@ -637,48 +635,48 @@ class If(ExpressionOps):
def __str__(self):
return "If({},{},{})".format(self.condition, self.feature_left, self.feature_right)
def _load_internal(self, instrument, start_index, end_index, freq):
series_cond = self.condition.load(instrument, start_index, end_index, freq)
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series_cond = self.condition.load(instrument, start_index, end_index, *args)
if isinstance(self.feature_left, (Expression,)):
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
else:
series_left = self.feature_left
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
if isinstance(self.feature_right, (Expression,)):
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
else:
series_right = self.feature_right
series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index)
return series
def get_longest_back_rolling(self):
if isinstance(self.feature_left, Expression):
if isinstance(self.feature_left, (Expression,)):
left_br = self.feature_left.get_longest_back_rolling()
else:
left_br = 0
if isinstance(self.feature_right, Expression):
if isinstance(self.feature_right, (Expression,)):
right_br = self.feature_right.get_longest_back_rolling()
else:
right_br = 0
if isinstance(self.condition, Expression):
if isinstance(self.condition, (Expression,)):
c_br = self.condition.get_longest_back_rolling()
else:
c_br = 0
return max(left_br, right_br, c_br)
def get_extended_window_size(self):
if isinstance(self.feature_left, Expression):
if isinstance(self.feature_left, (Expression,)):
ll, lr = self.feature_left.get_extended_window_size()
else:
ll, lr = 0, 0
if isinstance(self.feature_right, Expression):
if isinstance(self.feature_right, (Expression,)):
rl, rr = self.feature_right.get_extended_window_size()
else:
rl, rr = 0, 0
if isinstance(self.condition, Expression):
if isinstance(self.condition, (Expression,)):
cl, cr = self.condition.get_extended_window_size()
else:
cl, cr = 0, 0
@@ -719,8 +717,8 @@ class Rolling(ExpressionOps):
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature, self.N)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
# NOTE: remove all null check,
# now it's user's responsibility to decide whether use features in null days
# isnull = series.isnull() # NOTE: isnull = NaN, inf is not null
@@ -777,8 +775,8 @@ class Ref(Rolling):
def __init__(self, feature, N):
super(Ref, self).__init__(feature, N, "ref")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
# N = 0, return first day
if series.empty:
return series # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049
@@ -967,8 +965,8 @@ class IdxMax(Rolling):
def __init__(self, feature, N):
super(IdxMax, self).__init__(feature, N, "idxmax")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True)
else:
@@ -1015,8 +1013,8 @@ class IdxMin(Rolling):
def __init__(self, feature, N):
super(IdxMin, self).__init__(feature, N, "idxmin")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True)
else:
@@ -1047,8 +1045,8 @@ class Quantile(Rolling):
def __str__(self):
return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = series.expanding(min_periods=1).quantile(self.qscore)
else:
@@ -1095,8 +1093,8 @@ class Mad(Rolling):
def __init__(self, feature, N):
super(Mad, self).__init__(feature, N, "mad")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
# TODO: implement in Cython
def mad(x):
@@ -1129,8 +1127,8 @@ class Rank(Rolling):
def __init__(self, feature, N):
super(Rank, self).__init__(feature, N, "rank")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
# TODO: implement in Cython
def rank(x):
@@ -1187,8 +1185,8 @@ class Delta(Rolling):
def __init__(self, feature, N):
super(Delta, self).__init__(feature, N, "delta")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = series - series.iloc[0]
else:
@@ -1225,8 +1223,8 @@ class Slope(Rolling):
def __init__(self, feature, N):
super(Slope, self).__init__(feature, N, "slope")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = pd.Series(expanding_slope(series.values), index=series.index)
else:
@@ -1253,8 +1251,8 @@ class Rsquare(Rolling):
def __init__(self, feature, N):
super(Rsquare, self).__init__(feature, N, "rsquare")
def _load_internal(self, instrument, start_index, end_index, freq):
_series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
_series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = pd.Series(expanding_rsquare(_series.values), index=_series.index)
else:
@@ -1282,8 +1280,8 @@ class Resi(Rolling):
def __init__(self, feature, N):
super(Resi, self).__init__(feature, N, "resi")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if self.N == 0:
series = pd.Series(expanding_resi(series.values), index=series.index)
else:
@@ -1310,8 +1308,8 @@ class WMA(Rolling):
def __init__(self, feature, N):
super(WMA, self).__init__(feature, N, "wma")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
# TODO: implement in Cython
def weighted_mean(x):
@@ -1345,8 +1343,8 @@ class EMA(Rolling):
def __init__(self, feature, N):
super(EMA, self).__init__(feature, N, "ema")
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
def exp_weighted_mean(x):
a = 1 - 2 / (1 + len(x))
@@ -1392,17 +1390,17 @@ class PairRolling(ExpressionOps):
def __str__(self):
return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N)
def _load_internal(self, instrument, start_index, end_index, freq):
def _load_internal(self, instrument, start_index, end_index, *args):
assert any(
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
), "at least one of two inputs is Expression instance"
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
else:
series_left = self.feature_left # numeric value
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
else:
series_right = self.feature_right
@@ -1465,12 +1463,12 @@ class Corr(PairRolling):
def __init__(self, feature_left, feature_right, N):
super(Corr, self).__init__(feature_left, feature_right, N, "corr")
def _load_internal(self, instrument, start_index, end_index, freq):
res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, *args)
# NOTE: Load uses MemCache, so calling load again will not cause performance degradation
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
series_left = self.feature_left.load(instrument, start_index, end_index, *args)
series_right = self.feature_right.load(instrument, start_index, end_index, *args)
res.loc[
np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
| np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
@@ -1529,8 +1527,8 @@ class TResample(ElemOperator):
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature, self.freq)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
def _load_internal(self, instrument, start_index, end_index, *args):
series = self.feature.load(instrument, start_index, end_index, *args)
if series.empty:
return series
@@ -1590,6 +1588,7 @@ OpsList = [
IdxMin,
If,
Feature,
PFeature,
] + [TResample]
@@ -1622,7 +1621,7 @@ class OpsWrapper:
else:
_ops_class = _operator
if not issubclass(_ops_class, Expression):
if not issubclass(_ops_class, (Expression,)):
raise TypeError("operator must be subclass of ExpressionOps, not {}".format(_ops_class))
if _ops_class.__name__ in self._ops:
@@ -1644,8 +1643,10 @@ def register_all_ops(C):
"""register all operator"""
logger = get_module_logger("ops")
from qlib.data.pit import P # pylint: disable=C0415
Operators.reset()
Operators.register(OpsList)
Operators.register(OpsList + [P])
if getattr(C, "custom_ops", None) is not None:
Operators.register(C.custom_ops)

57
qlib/data/pit.py Normal file
View File

@@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Qlib follows the logic below to support a point-in-time database.
For each stock, the format of its data is <observe_time, feature>. The Expression Engine supports calculation on data in this format.
To calculate the feature value f_t at a specific observe time t, data with the format <period_time, feature> will be used.
For example, the average earnings of the last 4 quarters (period_time) on 20190719 (observe_time).
The calculation of both <period_time, feature> and <observe_time, feature> data relies on the expression engine. It consists of 2 phases.
1) calculate <period_time, feature> at each observation time t, and it will be collapsed into a point (just like a normal feature)
2) concatenate all the collapsed data, and we will get data with the format <observe_time, feature>.
Qlib will use the operator `P` to perform the collapse.
"""
import numpy as np
import pandas as pd
from qlib.data.ops import ElemOperator
from qlib.log import get_module_logger
from .data import Cal
class P(ElemOperator):
    """Collapse period data into an ordinary series indexed by calendar position.

    For every calendar index in the requested range, the wrapped expression is loaded with the
    calendar time at that index and the last element of the result becomes the value for that index.
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        """
        Evaluate the wrapped period expression at each calendar step in [start_index, end_index].

        Returns
        -------
        pd.Series
            float32 series indexed by ``RangeIndex(start_index, end_index + 1)``. An empty float32
            series is returned if the underlying period data files are not found.

        Raises
        ------
        ValueError
            If the wrapped expression refers to future periods.
        """
        _calendar = Cal.calendar(freq=freq)

        # To load expression accurately, more historical data are required.
        # The window depends only on the expression, so it is computed (and validated) once
        # instead of at every calendar step.
        start_ws, end_ws = self.feature.get_extended_window_size()
        if end_ws > 0:
            raise ValueError(
                "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported)"
            )

        resample_data = np.empty(end_index - start_index + 1, dtype="float32")
        for offset, cur_index in enumerate(range(start_index, end_index + 1)):
            cur_time = _calendar[cur_index]
            # The calculated value will always the last element, so the end_offset is zero.
            try:
                s = self.feature.load(instrument, -start_ws, 0, cur_time)
            except FileNotFoundError:
                get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
                return pd.Series(dtype="float32", name=str(self))
            resample_data[offset] = s.iloc[-1] if len(s) > 0 else np.nan

        return pd.Series(
            resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)
        )

    def get_longest_back_rolling(self):
        # The period data will collapse as a normal feature. So no extending and looking back
        return 0

    def get_extended_window_size(self):
        # The period data will collapse as a normal feature. So no extending and looking back
        return 0, 0