Support Point-in-time Data Operation (#343)

* add period ops class * black format * add pit data read * fix bug in period ops * update ops runnable * update PIT test example * black format * update PIT test * update tets_PIT * update code format * add check_feature_exist * black format * optimize the PIT Algorithm * fix bug * update example * update test_PIT name * add pit collector * black format * fix bugs * fix try * fix bug & add dump_pit.py * Successfully run and understand PIT * Add some docs and remove a bug * mv crypto collector * black format * Run succesfully after merging master * Pass test and fix code * remove useless PIT code * fix PYlint * Rename Co-authored-by: Young <afe.young@gmail.com>
2026-07-06 04:20:57 +08:00 · 2022-03-10 14:27:52 +08:00
parent 3a911bc09b
commit faa99f30fa
19 changed files with 1459 additions and 141 deletions
--- a/qlib/data/init.py
+++ b/qlib/data/init.py
@@ -15,6 +15,7 @@ from .data import (
    LocalCalendarProvider,
    LocalInstrumentProvider,
    LocalFeatureProvider,
+    LocalPITProvider,
    LocalExpressionProvider,
    LocalDatasetProvider,
    ClientCalendarProvider,
--- a/qlib/data/base.py
+++ b/qlib/data/base.py
@@ -6,12 +6,20 @@ from __future__ import division
 from __future__ import print_function

 import abc
-
+import pandas as pd
 from ..log import get_module_logger


 class Expression(abc.ABC):
-    """Expression base class"""
+    """
+    Expression base class
+
+    Expression is designed to handle the calculation of data with the format below
+    data with two dimension for each instrument,
+    - feature
+    - time:  it  could be observation time or period time.
+        - period time is designed for Point-in-time database.  For example, the period time may be 2014Q4, and its value can be observed multiple times (different values may be observed at different times due to amendments).
+    """

    def __str__(self):
        return type(self).__name__
@@ -124,8 +132,18 @@ class Expression(abc.ABC):

        return Or(other, self)

-    def load(self, instrument, start_index, end_index, freq):
+    def load(self, instrument, start_index, end_index, *args):
        """load  feature
+        This function is responsible for loading feature/expression based on the expression engine.
+
+        The concrete implementation is separated into two parts
+        1) caching data and handling errors.
+            - This part is shared by all the expressions and implemented in Expression
+        2) processing and calculating data based on the specific expression.
+            - This part is different in each expression and implemented in each expression
+
+        The Expression Engine is shared by different kinds of data.
+        Different data will have different extra information in `args`.

        Parameters
        ----------
@@ -135,8 +153,15 @@ class Expression(abc.ABC):
            feature start index [in calendar].
        end_index : str
            feature end  index  [in calendar].
-        freq : str
-            feature frequency.
+
+        *args may contain the following information:
+        1) if it is used in the basic expression engine data, it contains the following arguments
+            freq : str
+                feature frequency.
+
+        2) if it is used in PIT data, it contains the following arguments
+            cur_time:
+                the observation time; it is designed for the point-in-time data.

        Returns
        ----------
@@ -146,26 +171,26 @@ class Expression(abc.ABC):
        from .cache import H  # pylint: disable=C0415

        # cache
-        args = str(self), instrument, start_index, end_index, freq
-        if args in H["f"]:
-            return H["f"][args]
+        cache_key = str(self), instrument, start_index, end_index, *args
+        if cache_key in H["f"]:
+            return H["f"][cache_key]
        if start_index is not None and end_index is not None and start_index > end_index:
            raise ValueError("Invalid index range: {} {}".format(start_index, end_index))
        try:
-            series = self._load_internal(instrument, start_index, end_index, freq)
+            series = self._load_internal(instrument, start_index, end_index, *args)
        except Exception as e:
            get_module_logger("data").debug(
                f"Loading data error: instrument={instrument}, expression={str(self)}, "
-                f"start_index={start_index}, end_index={end_index}, freq={freq}. "
+                f"start_index={start_index}, end_index={end_index}, args={args}. "
                f"error info: {str(e)}"
            )
            raise
        series.name = str(self)
-        H["f"][args] = series
+        H["f"][cache_key] = series
        return series

    @abc.abstractmethod
-    def _load_internal(self, instrument, start_index, end_index, freq):
+    def _load_internal(self, instrument, start_index, end_index, *args) -> pd.Series:
        raise NotImplementedError("This function must be implemented in your newly defined feature")

    @abc.abstractmethod
@@ -225,6 +250,16 @@ class Feature(Expression):
        return 0, 0


+class PFeature(Feature):
+    def __str__(self):
+        return "$$" + self._name
+
+    def _load_internal(self, instrument, start_index, end_index, cur_time):
+        from .data import PITD  # pylint: disable=C0415
+
+        return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time)
+
+
 class ExpressionOps(Expression):
    """Operator Expression

--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -34,6 +34,8 @@ from ..utils import (
    code_to_fname,
    set_log_with_config,
    time_to_slc_point,
+    read_period_data,
+    get_period_list,
 )
 from ..utils.paral import ParallelExt
 from .ops import Operators  # pylint: disable=W0611
@@ -331,6 +333,38 @@ class FeatureProvider(abc.ABC):
        raise NotImplementedError("Subclass of FeatureProvider must implement `feature` method")


+class PITProvider(abc.ABC):
+    @abc.abstractmethod
+    def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
+        """
+        get the historical periods data series between `start_index` and `end_index`
+
+        Parameters
+        ----------
+        start_index: int
+            start_index is a relative index to the latest period to cur_time
+
+        end_index: int
+            end_index is a relative index to the latest period to cur_time
+            in most cases, the start_index and end_index will be non-positive values
+            For example, start_index == -3 end_index == 0 and current period index is cur_idx,
+            then the data between [start_index + cur_idx, end_index + cur_idx] will be retrieved.
+
+        Returns
+        -------
+        pd.Series
+            The index will be integers to indicate the periods of the data
+            A typical example will be
+            TODO
+
+        Raises
+        ------
+        FileNotFoundError
+            This exception will be raised if the queried data do not exist.
+        """
+        raise NotImplementedError(f"Please implement the `period_feature` method")
+
+
 class ExpressionProvider(abc.ABC):
    """Expression provider class

@@ -694,6 +728,89 @@ class LocalFeatureProvider(FeatureProvider, ProviderBackendMixin):
        return self.backend_obj(instrument=instrument, field=field, freq=freq)[start_index : end_index + 1]


+class LocalPITProvider(PITProvider):
+    # TODO: Add PIT backend file storage
+    # NOTE: This class is not multi-threading-safe!!!!
+
+    def period_feature(self, instrument, field, start_index, end_index, cur_time):
+        if not isinstance(cur_time, pd.Timestamp):
+            raise ValueError(
+                f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
+            )
+
+        assert end_index <= 0  # PIT don't support querying future data
+
+        DATA_RECORDS = [
+            ("date", C.pit_record_type["date"]),
+            ("period", C.pit_record_type["period"]),
+            ("value", C.pit_record_type["value"]),
+            ("_next", C.pit_record_type["index"]),
+        ]
+        VALUE_DTYPE = C.pit_record_type["value"]
+
+        field = str(field).lower()[2:]
+        instrument = code_to_fname(instrument)
+
+        # {For acceleration
+        # start_index, end_index, cur_index = kwargs["info"]
+        # if cur_index == start_index:
+        #     if not hasattr(self, "all_fields"):
+        #         self.all_fields = []
+        #     self.all_fields.append(field)
+        #     if not hasattr(self, "period_index"):
+        #         self.period_index = {}
+        #     if field not in self.period_index:
+        #         self.period_index[field] = {}
+        # For acceleration}
+
+        if not field.endswith("_q") and not field.endswith("_a"):
+            raise ValueError("period field must ends with '_q' or '_a'")
+        quarterly = field.endswith("_q")
+        index_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.index"
+        data_path = C.dpm.get_data_uri() / "financial" / instrument.lower() / f"{field}.data"
+        if not (index_path.exists() and data_path.exists()):
+            raise FileNotFoundError("No file is found. Raise exception and  ")
+        # NOTE: The most significant performance loss is here.
+        # Does the acceleration that makes the program complicated really matter?
+        # - It makes the parameters of the interface complicated
+        # - It does not perform in the optimal way (if we put all the pieces together, we may achieve higher performance)
+        #    - If we design it carefully, we can go through the data only once to get its historical evolution.
+        # So I decided to deprecate the previous implementation and keep the logic of the program simple.
+        # Instead, I'll add a cache for the index file.
+        data = np.fromfile(data_path, dtype=DATA_RECORDS)
+
+        # find all revision periods before `cur_time`
+        cur_time_int = int(cur_time.year) * 10000 + int(cur_time.month) * 100 + int(cur_time.day)
+        loc = np.searchsorted(data["date"], cur_time_int, side="right")
+        if loc <= 0:
+            return pd.Series()
+        last_period = data["period"][:loc].max()  # return the latest quarter
+        first_period = data["period"][:loc].min()
+
+        period_list = get_period_list(first_period, last_period, quarterly)
+        period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
+        value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE)
+        for i, period in enumerate(period_list):
+            # last_period_index = self.period_index[field].get(period)  # For acceleration
+            value[i], now_period_index = read_period_data(
+                index_path, data_path, period, cur_time_int, quarterly  # , last_period_index  # For acceleration
+            )
+            # self.period_index[field].update({period: now_period_index})  # For acceleration
+        # NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
+        # when calculation between different features and only part of its financial indicator is published
+        series = pd.Series(value, index=period_list, dtype=VALUE_DTYPE)
+
+        # {For acceleration
+        # if cur_index == end_index:
+        #     self.all_fields.remove(field)
+        #     if not len(self.all_fields):
+        #         del self.all_fields
+        #         del self.period_index
+        # For acceleration}
+
+        return series
+
+
 class LocalExpressionProvider(ExpressionProvider):
    """Local expression data provider class

@@ -1003,6 +1120,8 @@ class ClientDatasetProvider(DatasetProvider):

 class BaseProvider:
    """Local provider class
+    It is a set of interfaces that allows users to access data.
+    PITD is not exposed publicly to users, so it is not included in the interface.

    To keep compatible with old qlib provider.
    """
@@ -1126,6 +1245,7 @@ if sys.version_info >= (3, 9):
    CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
    InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
    FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
+    PITProviderWrapper = Annotated[PITProvider, Wrapper]
    ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
    DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
    BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
@@ -1133,6 +1253,7 @@ else:
    CalendarProviderWrapper = CalendarProvider
    InstrumentProviderWrapper = InstrumentProvider
    FeatureProviderWrapper = FeatureProvider
+    PITProviderWrapper = PITProvider
    ExpressionProviderWrapper = ExpressionProvider
    DatasetProviderWrapper = DatasetProvider
    BaseProviderWrapper = BaseProvider
@@ -1140,6 +1261,7 @@ else:
 Cal: CalendarProviderWrapper = Wrapper()
 Inst: InstrumentProviderWrapper = Wrapper()
 FeatureD: FeatureProviderWrapper = Wrapper()
+PITD: PITProviderWrapper = Wrapper()
 ExpressionD: ExpressionProviderWrapper = Wrapper()
 DatasetD: DatasetProviderWrapper = Wrapper()
 D: BaseProviderWrapper = Wrapper()
@@ -1165,6 +1287,11 @@ def register_all_wrappers(C):
        register_wrapper(FeatureD, feature_provider, "qlib.data")
        logger.debug(f"registering FeatureD {C.feature_provider}")

+    if getattr(C, "pit_provider", None) is not None:
+        pit_provider = init_instance_by_config(C.pit_provider, module)
+        register_wrapper(PITD, pit_provider, "qlib.data")
+        logger.debug(f"registering PITD {C.pit_provider}")
+
    if getattr(C, "expression_provider", None) is not None:
        # This provider is unnecessary in client provider
        _eprovider = init_instance_by_config(C.expression_provider, module)
--- a/qlib/data/ops.py
+++ b/qlib/data/ops.py
@@ -10,9 +10,7 @@ import pandas as pd

 from typing import Union, List, Type
 from scipy.stats import percentileofscore
-
-from .base import Expression, ExpressionOps, Feature
-
+from .base import Expression, ExpressionOps, Feature, PFeature
 from ..log import get_module_logger
 from ..utils import get_callable_kwargs

@@ -84,8 +82,8 @@ class NpElemOperator(ElemOperator):
        self.func = func
        super(NpElemOperator, self).__init__(feature)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        return getattr(np, self.func)(series)


@@ -124,11 +122,11 @@ class Sign(NpElemOperator):
    def __init__(self, feature):
        super(Sign, self).__init__(feature, "sign")

-    def _load_internal(self, instrument, start_index, end_index, freq):
+    def _load_internal(self, instrument, start_index, end_index, *args):
        """
        To avoid error raised by bool type input, we transform the data into float32.
        """
-        series = self.feature.load(instrument, start_index, end_index, freq)
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # TODO:  More precision types should be configurable
        series = series.astype(np.float32)
        return getattr(np, self.func)(series)
@@ -173,8 +171,8 @@ class Power(NpElemOperator):
    def __str__(self):
        return "{}({},{})".format(type(self).__name__, self.feature, self.exponent)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        return getattr(np, self.func)(series, self.exponent)


@@ -201,8 +199,8 @@ class Mask(NpElemOperator):
    def __str__(self):
        return "{}({},{})".format(type(self).__name__, self.feature, self.instrument.lower())

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        return self.feature.load(self.instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        return self.feature.load(self.instrument, start_index, end_index, *args)


 class Not(NpElemOperator):
@@ -252,24 +250,24 @@ class PairOperator(ExpressionOps):
        return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)

    def get_longest_back_rolling(self):
-        if isinstance(self.feature_left, Expression):
+        if isinstance(self.feature_left, (Expression,)):
            left_br = self.feature_left.get_longest_back_rolling()
        else:
            left_br = 0

-        if isinstance(self.feature_right, Expression):
+        if isinstance(self.feature_right, (Expression,)):
            right_br = self.feature_right.get_longest_back_rolling()
        else:
            right_br = 0
        return max(left_br, right_br)

    def get_extended_window_size(self):
-        if isinstance(self.feature_left, Expression):
+        if isinstance(self.feature_left, (Expression,)):
            ll, lr = self.feature_left.get_extended_window_size()
        else:
            ll, lr = 0, 0

-        if isinstance(self.feature_right, Expression):
+        if isinstance(self.feature_right, (Expression,)):
            rl, rr = self.feature_right.get_extended_window_size()
        else:
            rl, rr = 0, 0
@@ -298,16 +296,16 @@ class NpPairOperator(PairOperator):
        self.func = func
        super(NpPairOperator, self).__init__(feature_left, feature_right)

-    def _load_internal(self, instrument, start_index, end_index, freq):
+    def _load_internal(self, instrument, start_index, end_index, *args):
        assert any(
-            [isinstance(self.feature_left, Expression), self.feature_right, Expression]
+            [isinstance(self.feature_left, (Expression,)), self.feature_right, Expression]
        ), "at least one of two inputs is Expression instance"
-        if isinstance(self.feature_left, Expression):
-            series_left = self.feature_left.load(instrument, start_index, end_index, freq)
+        if isinstance(self.feature_left, (Expression,)):
+            series_left = self.feature_left.load(instrument, start_index, end_index, *args)
        else:
            series_left = self.feature_left  # numeric value
-        if isinstance(self.feature_right, Expression):
-            series_right = self.feature_right.load(instrument, start_index, end_index, freq)
+        if isinstance(self.feature_right, (Expression,)):
+            series_right = self.feature_right.load(instrument, start_index, end_index, *args)
        else:
            series_right = self.feature_right
        check_length = isinstance(series_left, (np.ndarray, pd.Series)) and isinstance(
@@ -637,48 +635,48 @@ class If(ExpressionOps):
    def __str__(self):
        return "If({},{},{})".format(self.condition, self.feature_left, self.feature_right)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series_cond = self.condition.load(instrument, start_index, end_index, freq)
-        if isinstance(self.feature_left, Expression):
-            series_left = self.feature_left.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series_cond = self.condition.load(instrument, start_index, end_index, *args)
+        if isinstance(self.feature_left, (Expression,)):
+            series_left = self.feature_left.load(instrument, start_index, end_index, *args)
        else:
            series_left = self.feature_left
-        if isinstance(self.feature_right, Expression):
-            series_right = self.feature_right.load(instrument, start_index, end_index, freq)
+        if isinstance(self.feature_right, (Expression,)):
+            series_right = self.feature_right.load(instrument, start_index, end_index, *args)
        else:
            series_right = self.feature_right
        series = pd.Series(np.where(series_cond, series_left, series_right), index=series_cond.index)
        return series

    def get_longest_back_rolling(self):
-        if isinstance(self.feature_left, Expression):
+        if isinstance(self.feature_left, (Expression,)):
            left_br = self.feature_left.get_longest_back_rolling()
        else:
            left_br = 0

-        if isinstance(self.feature_right, Expression):
+        if isinstance(self.feature_right, (Expression,)):
            right_br = self.feature_right.get_longest_back_rolling()
        else:
            right_br = 0

-        if isinstance(self.condition, Expression):
+        if isinstance(self.condition, (Expression,)):
            c_br = self.condition.get_longest_back_rolling()
        else:
            c_br = 0
        return max(left_br, right_br, c_br)

    def get_extended_window_size(self):
-        if isinstance(self.feature_left, Expression):
+        if isinstance(self.feature_left, (Expression,)):
            ll, lr = self.feature_left.get_extended_window_size()
        else:
            ll, lr = 0, 0

-        if isinstance(self.feature_right, Expression):
+        if isinstance(self.feature_right, (Expression,)):
            rl, rr = self.feature_right.get_extended_window_size()
        else:
            rl, rr = 0, 0

-        if isinstance(self.condition, Expression):
+        if isinstance(self.condition, (Expression,)):
            cl, cr = self.condition.get_extended_window_size()
        else:
            cl, cr = 0, 0
@@ -719,8 +717,8 @@ class Rolling(ExpressionOps):
    def __str__(self):
        return "{}({},{})".format(type(self).__name__, self.feature, self.N)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # NOTE: remove all null check,
        # now it's user's responsibility to decide whether use features in null days
        # isnull = series.isnull() # NOTE: isnull = NaN, inf is not null
@@ -777,8 +775,8 @@ class Ref(Rolling):
    def __init__(self, feature, N):
        super(Ref, self).__init__(feature, N, "ref")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # N = 0, return first day
        if series.empty:
            return series  # Pandas bug, see: https://github.com/pandas-dev/pandas/issues/21049
@@ -967,8 +965,8 @@ class IdxMax(Rolling):
    def __init__(self, feature, N):
        super(IdxMax, self).__init__(feature, N, "idxmax")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = series.expanding(min_periods=1).apply(lambda x: x.argmax() + 1, raw=True)
        else:
@@ -1015,8 +1013,8 @@ class IdxMin(Rolling):
    def __init__(self, feature, N):
        super(IdxMin, self).__init__(feature, N, "idxmin")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = series.expanding(min_periods=1).apply(lambda x: x.argmin() + 1, raw=True)
        else:
@@ -1047,8 +1045,8 @@ class Quantile(Rolling):
    def __str__(self):
        return "{}({},{},{})".format(type(self).__name__, self.feature, self.N, self.qscore)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = series.expanding(min_periods=1).quantile(self.qscore)
        else:
@@ -1095,8 +1093,8 @@ class Mad(Rolling):
    def __init__(self, feature, N):
        super(Mad, self).__init__(feature, N, "mad")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # TODO: implement in Cython

        def mad(x):
@@ -1129,8 +1127,8 @@ class Rank(Rolling):
    def __init__(self, feature, N):
        super(Rank, self).__init__(feature, N, "rank")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # TODO: implement in Cython

        def rank(x):
@@ -1187,8 +1185,8 @@ class Delta(Rolling):
    def __init__(self, feature, N):
        super(Delta, self).__init__(feature, N, "delta")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = series - series.iloc[0]
        else:
@@ -1225,8 +1223,8 @@ class Slope(Rolling):
    def __init__(self, feature, N):
        super(Slope, self).__init__(feature, N, "slope")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = pd.Series(expanding_slope(series.values), index=series.index)
        else:
@@ -1253,8 +1251,8 @@ class Rsquare(Rolling):
    def __init__(self, feature, N):
        super(Rsquare, self).__init__(feature, N, "rsquare")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        _series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        _series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = pd.Series(expanding_rsquare(_series.values), index=_series.index)
        else:
@@ -1282,8 +1280,8 @@ class Resi(Rolling):
    def __init__(self, feature, N):
        super(Resi, self).__init__(feature, N, "resi")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        if self.N == 0:
            series = pd.Series(expanding_resi(series.values), index=series.index)
        else:
@@ -1310,8 +1308,8 @@ class WMA(Rolling):
    def __init__(self, feature, N):
        super(WMA, self).__init__(feature, N, "wma")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)
        # TODO: implement in Cython

        def weighted_mean(x):
@@ -1345,8 +1343,8 @@ class EMA(Rolling):
    def __init__(self, feature, N):
        super(EMA, self).__init__(feature, N, "ema")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)

        def exp_weighted_mean(x):
            a = 1 - 2 / (1 + len(x))
@@ -1392,17 +1390,17 @@ class PairRolling(ExpressionOps):
    def __str__(self):
        return "{}({},{},{})".format(type(self).__name__, self.feature_left, self.feature_right, self.N)

-    def _load_internal(self, instrument, start_index, end_index, freq):
+    def _load_internal(self, instrument, start_index, end_index, *args):
        assert any(
            [isinstance(self.feature_left, Expression), self.feature_right, Expression]
        ), "at least one of two inputs is Expression instance"

        if isinstance(self.feature_left, Expression):
-            series_left = self.feature_left.load(instrument, start_index, end_index, freq)
+            series_left = self.feature_left.load(instrument, start_index, end_index, *args)
        else:
            series_left = self.feature_left  # numeric value
        if isinstance(self.feature_right, Expression):
-            series_right = self.feature_right.load(instrument, start_index, end_index, freq)
+            series_right = self.feature_right.load(instrument, start_index, end_index, *args)
        else:
            series_right = self.feature_right

@@ -1465,12 +1463,12 @@ class Corr(PairRolling):
    def __init__(self, feature_left, feature_right, N):
        super(Corr, self).__init__(feature_left, feature_right, N, "corr")

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        res: pd.Series = super(Corr, self)._load_internal(instrument, start_index, end_index, *args)

        # NOTE: Load uses MemCache, so calling load again will not cause performance degradation
-        series_left = self.feature_left.load(instrument, start_index, end_index, freq)
-        series_right = self.feature_right.load(instrument, start_index, end_index, freq)
+        series_left = self.feature_left.load(instrument, start_index, end_index, *args)
+        series_right = self.feature_right.load(instrument, start_index, end_index, *args)
        res.loc[
            np.isclose(series_left.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
            | np.isclose(series_right.rolling(self.N, min_periods=1).std(), 0, atol=2e-05)
@@ -1529,8 +1527,8 @@ class TResample(ElemOperator):
    def __str__(self):
        return "{}({},{})".format(type(self).__name__, self.feature, self.freq)

-    def _load_internal(self, instrument, start_index, end_index, freq):
-        series = self.feature.load(instrument, start_index, end_index, freq)
+    def _load_internal(self, instrument, start_index, end_index, *args):
+        series = self.feature.load(instrument, start_index, end_index, *args)

        if series.empty:
            return series
@@ -1590,6 +1588,7 @@ OpsList = [
    IdxMin,
    If,
    Feature,
+    PFeature,
 ] + [TResample]


@@ -1622,7 +1621,7 @@ class OpsWrapper:
            else:
                _ops_class = _operator

-            if not issubclass(_ops_class, Expression):
+            if not issubclass(_ops_class, (Expression,)):
                raise TypeError("operator must be subclass of ExpressionOps, not {}".format(_ops_class))

            if _ops_class.__name__ in self._ops:
@@ -1644,8 +1643,10 @@ def register_all_ops(C):
    """register all operator"""
    logger = get_module_logger("ops")

+    from qlib.data.pit import P  # pylint: disable=C0415
+
    Operators.reset()
-    Operators.register(OpsList)
+    Operators.register(OpsList + [P])

    if getattr(C, "custom_ops", None) is not None:
        Operators.register(C.custom_ops)
--- a/qlib/data/pit.py
+++ b/qlib/data/pit.py
@@ -0,0 +1,57 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+Qlib follows the logic below to support a point-in-time database
+
+For each stock, the format of its data is <observe_time, feature>. The Expression Engine supports calculation on data in this format
+
+To calculate the feature value f_t at a specific observe time t, data with format <period_time, feature> will be used.
+For example, the average earning of the last 4 quarters (period_time) on 20190719 (observe_time)
+
+The calculation of both <period_time, feature> and <observe_time, feature> data relies on the expression engine. It consists of 2 phases.
+1) calculate <period_time, feature> at each observation time t; the result will be collapsed into a point (just like a normal feature)
+2) concatenate all the collapsed data, and we will get data with format <observe_time, feature>.
+Qlib will use the operator `P` to perform the collapse.
+"""
+import numpy as np
+import pandas as pd
+from qlib.data.ops import ElemOperator
+from qlib.log import get_module_logger
+from .data import Cal
+
+
+class P(ElemOperator):
+    def _load_internal(self, instrument, start_index, end_index, freq):
+
+        _calendar = Cal.calendar(freq=freq)
+        resample_data = np.empty(end_index - start_index + 1, dtype="float32")
+
+        for cur_index in range(start_index, end_index + 1):
+            cur_time = _calendar[cur_index]
+            # To load expression accurately, more historical data are required
+            start_ws, end_ws = self.feature.get_extended_window_size()
+            if end_ws > 0:
+                raise ValueError(
+                    "PIT database does not support referring to future period (e.g. expressions like `Ref('$$roewa_q', -1)` are not supported"
+                )
+
+            # The calculated value will always be the last element, so the end_offset is zero.
+            try:
+                s = self.feature.load(instrument, -start_ws, 0, cur_time)
+                resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
+            except FileNotFoundError:
+                get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
+                return pd.Series(dtype="float32", name=str(self))
+
+        resample_series = pd.Series(
+            resample_data, index=pd.RangeIndex(start_index, end_index + 1), dtype="float32", name=str(self)
+        )
+        return resample_series
+
+    def get_longest_back_rolling(self):
+        # The period data will collapse as a normal feature. So no extending and looking back
+        return 0
+
+    def get_extended_window_size(self):
+        # The period data will collapse as a normal feature. So no extending and looking back
+        return 0, 0