add docs & fix reinit of dataset

2026-07-05 03:50:57 +08:00 · 2021-02-03 08:57:31 +00:00
parent c71b645777
commit 4ed8b8e233
4 changed files with 96 additions and 16 deletions
--- a/examples/highfreq/README.md
+++ b/examples/highfreq/README.md
@@ -0,0 +1,28 @@
+# High-Frequency Dataset
+
+This dataset is an example for RL high-frequency trading.
+
+## Get High-Frequency Data
+
+Get high-frequency data by running the following command:
+```bash
+    python workflow.py get_data
+```
+
+## Dump & Reload & Reinitialize the Dataset
+
+
+The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in `workflow.py`. `DatasetH` is a subclass of `qlib.utils.serial.Serializable`, which supports being dumped to or loaded from disk in `pickle` format.
+
+### About Reinitialization
+
+After reloading a `Dataset` from disk, `Qlib` also supports reinitializing the dataset. This means that users can reset some of the configuration of the `Dataset` or `DataHandler`, such as `instruments`, `start_time`, `end_time` and `segments`, etc.
+
+The example is given in `workflow.py`; users can run the code as follows.
+
+### Run the Code
+
+Run the example by running the following command:
+```bash
+    python workflow.py dump_and_load_dataset
+```
--- a/examples/highfreq/init.py
+++ b/examples/highfreq/init.py
--- a/examples/highfreq/workflow.py
+++ b/examples/highfreq/workflow.py
@@ -9,7 +9,7 @@ import qlib
 import pickle
 import numpy as np
 import pandas as pd
-from qlib.config import HIGH_FREQ_CONFIG
+from qlib.config import REG_CN, HIGH_FREQ_CONFIG
 from qlib.contrib.model.gbdt import LGBModel
 from qlib.contrib.data.handler import Alpha158
 from qlib.contrib.strategy.strategy import TopkDropoutStrategy
@@ -26,7 +26,6 @@ from qlib.tests.data import GetData

 from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull

-
 class HighfreqWorkflow(object):

    SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}
@@ -123,8 +122,7 @@ class HighfreqWorkflow(object):
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
        print(backtest_train, backtest_test)

-        del xtrain, xtest
-        del backtest_train, backtest_test
+        return

    def dump_and_load_dataset(self):
        """dump and load dataset state on disk"""
@@ -146,18 +144,39 @@ class HighfreqWorkflow(object):
            dataset_backtest = pickle.load(file_dataset_backtest)

        self._prepare_calender_cache()
-        ##=============reload_dataset=============
-        dataset.init(init_type=DataHandlerLP.IT_LS)
-        dataset_backtest.init()
+        ##=============reinit dataset=============
+        dataset.init(
+            handler_kwargs = {
+                "init_type" : DataHandlerLP.IT_LS,
+                "start_time" : "2021-01-19 00:00:00",
+                "end_time" : "2021-01-25 16:00:00",
+            },
+            segment_kwargs = {
+                "test": (
+                    "2021-01-19 00:00:00", 
+                    "2021-01-25 16:00:00",
+                ),
+            }
+        )
+        dataset_backtest.init(
+            handler_kwargs = {
+                "start_time" : "2021-01-19 00:00:00",
+                "end_time" : "2021-01-25 16:00:00",
+            },
+            segment_kwargs = {
+                "test": (
+                    "2021-01-19 00:00:00", 
+                    "2021-01-25 16:00:00",
+                ),
+            }
+        )

        ##=============get data=============
-        xtrain, xtest = dataset.prepare(["train", "test"])
-        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
+        xtest = dataset.prepare(["test"])
+        backtest_test = dataset_backtest.prepare(["test"])

-        print(xtrain, xtest)
-        print(backtest_train, backtest_test)
-        del xtrain, xtest
-        del backtest_train, backtest_test
+        print(xtest, backtest_test)
+        return


 if __name__ == "__main__":
--- a/qlib/data/dataset/init.py
+++ b/qlib/data/dataset/init.py
@@ -87,9 +87,42 @@ class DatasetH(Dataset):
        """
        super().__init__(handler, segments)

-    def init(self, **kwargs):
-        """Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
-        self.handler.init(**kwargs)
+    def init(self, handler_kwargs: dict = None, segment_kwargs: dict = None):
+        """
+        Initialize the DatasetH
+
+        Parameters
+        ----------
+        handler_kwargs : dict
+            Config of DataHandler, which could include the following arguments:
+
+            - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'.
+
+            - arguments of DataHandler.init, such as 'enable_cache', etc.
+
+        segment_kwargs : dict
+            Config of segments which is same as 'segments' in DatasetH.setup_data
+
+        """
+        if handler_kwargs:
+            if not isinstance(handler_kwargs, dict):
+                raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}")
+            kwargs_init = {}
+            kwargs_conf_data = {}
+            conf_data_arg = {"instruments", "start_time", "end_time"}
+            for k, v in handler_kwargs.items():
+                if k in conf_data_arg:
+                    kwargs_conf_data[k] = v
+                else:
+                    kwargs_init[k] = v
+
+            self.handler.conf_data(**kwargs_conf_data)
+            self.handler.init(**kwargs_init)
+
+        if segment_kwargs:
+            if not isinstance(segment_kwargs, dict):
+                raise TypeError(f"param segment_kwargs must be type dict, not {type(segment_kwargs)}")
+            self.segments = segment_kwargs.copy()

    def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
        """