mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
* Add PRef operator (#988) * Fix type annotations * Add test_pref_operator test case field * Add note to PITProvider * Add period parameter comment
This commit is contained in:
@@ -162,6 +162,9 @@ class Expression(abc.ABC):
|
||||
2) if is used in PIT data, it contains following arguments
|
||||
cur_pit:
|
||||
it is designed for the point-in-time data.
|
||||
period: int
|
||||
This is used for query specific period.
|
||||
The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)
|
||||
|
||||
Returns
|
||||
----------
|
||||
@@ -254,10 +257,10 @@ class PFeature(Feature):
|
||||
def __str__(self):
|
||||
return "$$" + self._name
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, cur_time):
|
||||
def _load_internal(self, instrument, start_index, end_index, cur_time, period=None):
|
||||
from .data import PITD # pylint: disable=C0415
|
||||
|
||||
return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time)
|
||||
return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period)
|
||||
|
||||
|
||||
class ExpressionOps(Expression):
|
||||
|
||||
@@ -12,7 +12,7 @@ import queue
|
||||
import bisect
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from typing import List, Union
|
||||
from typing import List, Union, Optional
|
||||
|
||||
# For supporting multiprocessing in outer code, joblib is used
|
||||
from joblib import delayed
|
||||
@@ -335,7 +335,15 @@ class FeatureProvider(abc.ABC):
|
||||
|
||||
class PITProvider(abc.ABC):
|
||||
@abc.abstractmethod
|
||||
def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
|
||||
def period_feature(
|
||||
self,
|
||||
instrument,
|
||||
field,
|
||||
start_index: int,
|
||||
end_index: int,
|
||||
cur_time: pd.Timestamp,
|
||||
period: Optional[int] = None,
|
||||
) -> pd.Series:
|
||||
"""
|
||||
get the historical periods data series between `start_index` and `end_index`
|
||||
|
||||
@@ -350,6 +358,11 @@ class PITProvider(abc.ABC):
|
||||
For example, start_index == -3 end_index == 0 and current period index is cur_idx,
|
||||
then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved.
|
||||
|
||||
period: int
|
||||
This is used for query specific period.
|
||||
The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020)
|
||||
NOTE: `period` will override `start_index` and `end_index`
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.Series
|
||||
@@ -732,7 +745,7 @@ class LocalPITProvider(PITProvider):
|
||||
# TODO: Add PIT backend file storage
|
||||
# NOTE: This class is not multi-threading-safe!!!!
|
||||
|
||||
def period_feature(self, instrument, field, start_index, end_index, cur_time):
|
||||
def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None):
|
||||
if not isinstance(cur_time, pd.Timestamp):
|
||||
raise ValueError(
|
||||
f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
|
||||
@@ -771,8 +784,8 @@ class LocalPITProvider(PITProvider):
|
||||
if not (index_path.exists() and data_path.exists()):
|
||||
raise FileNotFoundError("No file is found. Raise exception and ")
|
||||
# NOTE: The most significant performance loss is here.
|
||||
# Does the accelration that makes the program complicated really matters?
|
||||
# - It make parameters parameters of the interface complicate
|
||||
# Does the acceleration that makes the program complicated really matters?
|
||||
# - It makes parameters of the interface complicate
|
||||
# - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance)
|
||||
# - If we design it carefully, we can go through for only once to get the historical evolution of the data.
|
||||
# So I decide to deprecated previous implementation and keep the logic of the program simple
|
||||
@@ -786,14 +799,20 @@ class LocalPITProvider(PITProvider):
|
||||
return pd.Series()
|
||||
last_period = data["period"][:loc].max() # return the latest quarter
|
||||
first_period = data["period"][:loc].min()
|
||||
|
||||
period_list = get_period_list(first_period, last_period, quarterly)
|
||||
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
|
||||
if period is not None:
|
||||
# NOTE: `period` has higher priority than `start_index` & `end_index`
|
||||
if period not in period_list:
|
||||
return pd.Series()
|
||||
else:
|
||||
period_list = [period]
|
||||
else:
|
||||
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
|
||||
value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE)
|
||||
for i, period in enumerate(period_list):
|
||||
for i, p in enumerate(period_list):
|
||||
# last_period_index = self.period_index[field].get(period) # For acceleration
|
||||
value[i], now_period_index = read_period_data(
|
||||
index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration
|
||||
index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration
|
||||
)
|
||||
# self.period_index[field].update({period: now_period_index}) # For acceleration
|
||||
# NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
|
||||
|
||||
@@ -1643,10 +1643,10 @@ def register_all_ops(C):
|
||||
"""register all operator"""
|
||||
logger = get_module_logger("ops")
|
||||
|
||||
from qlib.data.pit import P # pylint: disable=C0415
|
||||
from qlib.data.pit import P, PRef # pylint: disable=C0415
|
||||
|
||||
Operators.reset()
|
||||
Operators.register(OpsList + [P])
|
||||
Operators.register(OpsList + [P, PRef])
|
||||
|
||||
if getattr(C, "custom_ops", None) is not None:
|
||||
Operators.register(C.custom_ops)
|
||||
|
||||
@@ -37,7 +37,7 @@ class P(ElemOperator):
|
||||
|
||||
# The calculated value will always the last element, so the end_offset is zero.
|
||||
try:
|
||||
s = self.feature.load(instrument, -start_ws, 0, cur_time)
|
||||
s = self._load_feature(instrument, -start_ws, 0, cur_time)
|
||||
resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
|
||||
except FileNotFoundError:
|
||||
get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
|
||||
@@ -48,6 +48,9 @@ class P(ElemOperator):
|
||||
)
|
||||
return resample_series
|
||||
|
||||
def _load_feature(self, instrument, start_index, end_index, cur_time):
|
||||
return self.feature.load(instrument, start_index, end_index, cur_time)
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
# The period data will collapse as a normal feature. So no extending and looking back
|
||||
return 0
|
||||
@@ -55,3 +58,15 @@ class P(ElemOperator):
|
||||
def get_extended_window_size(self):
|
||||
# The period data will collapse as a normal feature. So no extending and looking back
|
||||
return 0, 0
|
||||
|
||||
|
||||
class PRef(P):
|
||||
def __init__(self, feature, period):
|
||||
super().__init__(feature)
|
||||
self.period = period
|
||||
|
||||
def __str__(self):
|
||||
return f"{super().__str__()}[{self.period}]"
|
||||
|
||||
def _load_feature(self, instrument, start_index, end_index, cur_time):
|
||||
return self.feature.load(instrument, start_index, end_index, cur_time, self.period)
|
||||
|
||||
@@ -92,7 +92,7 @@ class TestPIT(unittest.TestCase):
|
||||
"P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
|
||||
]
|
||||
instruments = ["sh600519"]
|
||||
data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day")
|
||||
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
|
||||
expect = """
|
||||
P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
|
||||
instrument datetime
|
||||
@@ -189,6 +189,52 @@ class TestPIT(unittest.TestCase):
|
||||
fields += ["P(Sum($$yoyni_q, 4))"]
|
||||
fields += ["$close", "P($$roewa_q) * $close"]
|
||||
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
|
||||
except_data = """
|
||||
P($$roewa_q) P($$yoyni_q) P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1) P(Sum($$yoyni_q, 4)) $close P($$roewa_q) * $close
|
||||
instrument datetime
|
||||
sh600519 2019-01-02 0.255220 0.243892 1.484224 1.661578 63.595333 16.230801
|
||||
2019-01-03 0.255220 0.243892 1.484224 1.661578 62.641907 15.987467
|
||||
2019-01-04 0.255220 0.243892 1.484224 1.661578 63.915985 16.312637
|
||||
2019-01-07 0.255220 0.243892 1.484224 1.661578 64.286530 16.407207
|
||||
2019-01-08 0.255220 0.243892 1.484224 1.661578 64.212196 16.388237
|
||||
... ... ... ... ... ... ...
|
||||
2019-12-25 0.255819 0.219821 0.677052 1.081693 122.150467 31.248409
|
||||
2019-12-26 0.255819 0.219821 0.677052 1.081693 122.301315 31.286999
|
||||
2019-12-27 0.255819 0.219821 0.677052 1.081693 125.307404 32.056015
|
||||
2019-12-30 0.255819 0.219821 0.677052 1.081693 127.763992 32.684456
|
||||
2019-12-31 0.255819 0.219821 0.677052 1.081693 127.462303 32.607277
|
||||
|
||||
[244 rows x 6 columns]
|
||||
"""
|
||||
self.check_same(data, except_data)
|
||||
|
||||
def test_pref_operator(self):
|
||||
instruments = ["sh600519"]
|
||||
fields = [
|
||||
"PRef($$roewa_q, 201902)",
|
||||
"PRef($$yoyni_q, 201801)",
|
||||
"P($$roewa_q)",
|
||||
"P($$roewa_q) / PRef($$roewa_q, 201801)",
|
||||
]
|
||||
data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
|
||||
except_data = """
|
||||
PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) P($$roewa_q) / PRef($$roewa_q, 201801)
|
||||
instrument datetime
|
||||
sh600519 2018-05-02 NaN 0.395075 0.088887 1.000000
|
||||
2018-05-03 NaN 0.395075 0.088887 1.000000
|
||||
2018-05-04 NaN 0.395075 0.088887 1.000000
|
||||
2018-05-07 NaN 0.395075 0.088887 1.000000
|
||||
2018-05-08 NaN 0.395075 0.088887 1.000000
|
||||
... ... ... ... ...
|
||||
2019-07-15 0.000000 0.395075 0.000000 0.000000
|
||||
2019-07-16 0.000000 0.395075 0.000000 0.000000
|
||||
2019-07-17 0.000000 0.395075 0.000000 0.000000
|
||||
2019-07-18 0.175322 0.395075 0.175322 1.972414
|
||||
2019-07-19 0.175322 0.395075 0.175322 1.972414
|
||||
|
||||
[299 rows x 4 columns]
|
||||
"""
|
||||
self.check_same(data, except_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user