diff --git a/qlib/data/base.py b/qlib/data/base.py index 1a3fd1b0e..953c22253 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -162,6 +162,9 @@ class Expression(abc.ABC): 2) if is used in PIT data, it contains following arguments cur_pit: it is designed for the point-in-time data. + period: int + This is used for query specific period. + The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020) Returns ---------- @@ -254,10 +257,10 @@ class PFeature(Feature): def __str__(self): return "$$" + self._name - def _load_internal(self, instrument, start_index, end_index, cur_time): + def _load_internal(self, instrument, start_index, end_index, cur_time, period=None): from .data import PITD # pylint: disable=C0415 - return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time) + return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period) class ExpressionOps(Expression): diff --git a/qlib/data/data.py b/qlib/data/data.py index bb2a2acdb..ee82a2e2e 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -12,7 +12,7 @@ import queue import bisect import numpy as np import pandas as pd -from typing import List, Union +from typing import List, Union, Optional # For supporting multiprocessing in outer code, joblib is used from joblib import delayed @@ -335,7 +335,15 @@ class FeatureProvider(abc.ABC): class PITProvider(abc.ABC): @abc.abstractmethod - def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series: + def period_feature( + self, + instrument, + field, + start_index: int, + end_index: int, + cur_time: pd.Timestamp, + period: Optional[int] = None, + ) -> pd.Series: """ get the historical periods data series between `start_index` and `end_index` @@ -350,6 +358,11 @@ class PITProvider(abc.ABC): For example, start_index == -3 end_index == 0 and current period index is cur_idx, then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved. + period: int + This is used for query specific period. + The period is represented with int in Qlib. (e.g. 202001 may represent the first quarter in 2020) + NOTE: `period` will override `start_index` and `end_index` + Returns ------- pd.Series @@ -732,7 +745,7 @@ class LocalPITProvider(PITProvider): # TODO: Add PIT backend file storage # NOTE: This class is not multi-threading-safe!!!! - def period_feature(self, instrument, field, start_index, end_index, cur_time): + def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None): if not isinstance(cur_time, pd.Timestamp): raise ValueError( f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')" @@ -771,8 +784,8 @@ class LocalPITProvider(PITProvider): if not (index_path.exists() and data_path.exists()): raise FileNotFoundError("No file is found. Raise exception and ") # NOTE: The most significant performance loss is here. - # Does the accelration that makes the program complicated really matters? - # - It make parameters parameters of the interface complicate + # Does the acceleration that makes the program complicated really matters? + # - It makes parameters of the interface complicate # - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance) # - If we design it carefully, we can go through for only once to get the historical evolution of the data. # So I decide to deprecated previous implementation and keep the logic of the program simple @@ -786,14 +799,20 @@ class LocalPITProvider(PITProvider): return pd.Series() last_period = data["period"][:loc].max() # return the latest quarter first_period = data["period"][:loc].min() - period_list = get_period_list(first_period, last_period, quarterly) - period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] + if period is not None: + # NOTE: `period` has higher priority than `start_index` & `end_index` + if period not in period_list: + return pd.Series() + else: + period_list = [period] + else: + period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index] value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE) - for i, period in enumerate(period_list): + for i, p in enumerate(period_list): # last_period_index = self.period_index[field].get(period) # For acceleration value[i], now_period_index = read_period_data( - index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration + index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration ) # self.period_index[field].update({period: now_period_index}) # For acceleration # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan) diff --git a/qlib/data/ops.py b/qlib/data/ops.py index bdc032c03..2b742bebe 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -1643,10 +1643,10 @@ def register_all_ops(C): """register all operator""" logger = get_module_logger("ops") - from qlib.data.pit import P # pylint: disable=C0415 + from qlib.data.pit import P, PRef # pylint: disable=C0415 Operators.reset() - Operators.register(OpsList + [P]) + Operators.register(OpsList + [P, PRef]) if getattr(C, "custom_ops", None) is not None: Operators.register(C.custom_ops) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index ebe01eaf2..093b98cab 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -37,7 +37,7 @@ class P(ElemOperator): # The calculated value will always the last element, so the end_offset is zero. try: - s = self.feature.load(instrument, -start_ws, 0, cur_time) + s = self._load_feature(instrument, -start_ws, 0, cur_time) resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan except FileNotFoundError: get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") @@ -48,6 +48,9 @@ class P(ElemOperator): ) return resample_series + def _load_feature(self, instrument, start_index, end_index, cur_time): + return self.feature.load(instrument, start_index, end_index, cur_time) + def get_longest_back_rolling(self): # The period data will collapse as a normal feature. So no extending and looking back return 0 @@ -55,3 +58,15 @@ class P(ElemOperator): def get_extended_window_size(self): # The period data will collapse as a normal feature. So no extending and looking back return 0, 0 + + +class PRef(P): + def __init__(self, feature, period): + super().__init__(feature) + self.period = period + + def __str__(self): + return f"{super().__str__()}[{self.period}]" + + def _load_feature(self, instrument, start_index, end_index, cur_time): + return self.feature.load(instrument, start_index, end_index, cur_time, self.period) diff --git a/scripts/data_collector/pit/test_pit.py b/scripts/data_collector/pit/test_pit.py index 4dedd85cf..71d94ee8b 100644 --- a/scripts/data_collector/pit/test_pit.py +++ b/scripts/data_collector/pit/test_pit.py @@ -92,7 +92,7 @@ class TestPIT(unittest.TestCase): "P((Ref($$roewa_q, 1) +$$roewa_q) / 2)", ] instruments = ["sh600519"] - data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day") + data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day") expect = """ P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2) instrument datetime @@ -189,6 +189,52 @@ class TestPIT(unittest.TestCase): fields += ["P(Sum($$yoyni_q, 4))"] fields += ["$close", "P($$roewa_q) * $close"] data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day") + except_data = """ + P($$roewa_q) P($$yoyni_q) P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1) P(Sum($$yoyni_q, 4)) $close P($$roewa_q) * $close + instrument datetime + sh600519 2019-01-02 0.255220 0.243892 1.484224 1.661578 63.595333 16.230801 + 2019-01-03 0.255220 0.243892 1.484224 1.661578 62.641907 15.987467 + 2019-01-04 0.255220 0.243892 1.484224 1.661578 63.915985 16.312637 + 2019-01-07 0.255220 0.243892 1.484224 1.661578 64.286530 16.407207 + 2019-01-08 0.255220 0.243892 1.484224 1.661578 64.212196 16.388237 + ... ... ... ... ... ... ... + 2019-12-25 0.255819 0.219821 0.677052 1.081693 122.150467 31.248409 + 2019-12-26 0.255819 0.219821 0.677052 1.081693 122.301315 31.286999 + 2019-12-27 0.255819 0.219821 0.677052 1.081693 125.307404 32.056015 + 2019-12-30 0.255819 0.219821 0.677052 1.081693 127.763992 32.684456 + 2019-12-31 0.255819 0.219821 0.677052 1.081693 127.462303 32.607277 + + [244 rows x 6 columns] + """ + self.check_same(data, except_data) + + def test_pref_operator(self): + instruments = ["sh600519"] + fields = [ + "PRef($$roewa_q, 201902)", + "PRef($$yoyni_q, 201801)", + "P($$roewa_q)", + "P($$roewa_q) / PRef($$roewa_q, 201801)", + ] + data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day") + except_data = """ + PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q) P($$roewa_q) / PRef($$roewa_q, 201801) + instrument datetime + sh600519 2018-05-02 NaN 0.395075 0.088887 1.000000 + 2018-05-03 NaN 0.395075 0.088887 1.000000 + 2018-05-04 NaN 0.395075 0.088887 1.000000 + 2018-05-07 NaN 0.395075 0.088887 1.000000 + 2018-05-08 NaN 0.395075 0.088887 1.000000 + ... ... ... ... ... + 2019-07-15 0.000000 0.395075 0.000000 0.000000 + 2019-07-16 0.000000 0.395075 0.000000 0.000000 + 2019-07-17 0.000000 0.395075 0.000000 0.000000 + 2019-07-18 0.175322 0.395075 0.175322 1.972414 + 2019-07-19 0.175322 0.395075 0.175322 1.972414 + + [299 rows x 4 columns] + """ + self.check_same(data, except_data) if __name__ == "__main__":