From 4ed8b8e233435c641fa584fd86e43ef89f5b8e7b Mon Sep 17 00:00:00 2001 From: bxdd Date: Wed, 3 Feb 2021 08:57:31 +0000 Subject: [PATCH] add docs & fix reinit of datatset --- examples/highfreq/README.md | 28 ++++++++++++++++++++++ examples/highfreq/__init__.py | 0 examples/highfreq/workflow.py | 45 +++++++++++++++++++++++++---------- qlib/data/dataset/__init__.py | 39 +++++++++++++++++++++++++++--- 4 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 examples/highfreq/README.md delete mode 100644 examples/highfreq/__init__.py diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md new file mode 100644 index 000000000..56981abcc --- /dev/null +++ b/examples/highfreq/README.md @@ -0,0 +1,28 @@ +# High-Frequency Dataset + +This dataset is an example for RL high-frequency trading. + +## Get High-Frequency Data + +Get high-frequency data by running the following command: +```bash + python workflow.py get_data +``` + +## Dump & Reload & Reinitialize the Dataset + + +The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in `workflow.py`. `DatasetH` is a subclass of `qlib.utils.serial.Serializable`, which supports being dumped to or loaded from disk in `pickle` format. + +### About Reinitialization + +After reloading `Dataset` from disk, `Qlib` also supports reinitializing the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segments`, etc. + +The example is given in `workflow.py`; users can run the code as follows. 
+ +### Run the Code + +Run the example by running the following command: +```bash + python workflow.py dump_and_load_dataset +``` \ No newline at end of file diff --git a/examples/highfreq/__init__.py b/examples/highfreq/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index 6649079d8..dac27082f 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -9,7 +9,7 @@ import qlib import pickle import numpy as np import pandas as pd -from qlib.config import HIGH_FREQ_CONFIG +from qlib.config import REG_CN, HIGH_FREQ_CONFIG from qlib.contrib.model.gbdt import LGBModel from qlib.contrib.data.handler import Alpha158 from qlib.contrib.strategy.strategy import TopkDropoutStrategy @@ -26,7 +26,6 @@ from qlib.tests.data import GetData from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull - class HighfreqWorkflow(object): SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None} @@ -123,8 +122,7 @@ class HighfreqWorkflow(object): backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"]) print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + return def dump_and_load_dataset(self): """dump and load dataset state on disk""" @@ -146,18 +144,39 @@ class HighfreqWorkflow(object): dataset_backtest = pickle.load(file_dataset_backtest) self._prepare_calender_cache() - ##=============reload_dataset============= - dataset.init(init_type=DataHandlerLP.IT_LS) - dataset_backtest.init() + ##=============reinit dataset============= + dataset.init( + handler_kwargs = { + "init_type" : DataHandlerLP.IT_LS, + "start_time" : "2021-01-19 00:00:00", + "end_time" : "2021-01-25 16:00:00", + }, + segment_kwargs = { + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + } + ) + dataset_backtest.init( + handler_kwargs = { + 
"start_time" : "2021-01-19 00:00:00", + "end_time" : "2021-01-25 16:00:00", + }, + segment_kwargs = { + "test": ( + "2021-01-19 00:00:00", + "2021-01-25 16:00:00", + ), + } + ) ##=============get data============= - xtrain, xtest = dataset.prepare(["train", "test"]) - backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"]) + xtest = dataset.prepare(["test"]) + backtest_test = dataset_backtest.prepare(["test"]) - print(xtrain, xtest) - print(backtest_train, backtest_test) - del xtrain, xtest - del backtest_train, backtest_test + print(xtest, backtest_test) + return if __name__ == "__main__": diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 6b98baf8f..e2606ec0f 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -87,9 +87,42 @@ class DatasetH(Dataset): """ super().__init__(handler, segments) - def init(self, **kwargs): - """Initialize the DatasetH, Only parameters belonging to handler.init will be passed in""" - self.handler.init(**kwargs) + def init(self, handler_kwargs:dict = None, segment_kwargs:dict = None): + """ + Initialize the DatasetH + + Parameters + ---------- + handler_kwargs : dict + Config of DataHanlder, which could include the following arguments: + + - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'. + + - arguments of DataHandler.init, such as 'enable_cache', etc. 
+ + segment_kwargs : dict + Config of segments which is the same as 'segments' in DatasetH.setup_data + + """ + if handler_kwargs: + if not isinstance(handler_kwargs, dict): + raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}") + kwargs_init = {} + kwargs_conf_data = {} + conf_data_arg = {"instruments", "start_time", "end_time"} + for k, v in handler_kwargs.items(): + if k in conf_data_arg: + kwargs_conf_data.update({k:v}) + else: + kwargs_init.update({k:v}) + + self.handler.conf_data(**kwargs_conf_data) + self.handler.init(**kwargs_init) + + if segment_kwargs: + if not isinstance(segment_kwargs, dict): + raise TypeError(f"param segment_kwargs must be type dict, not {type(segment_kwargs)}") + self.segments = segment_kwargs.copy() def setup_data(self, handler: Union[dict, DataHandler], segments: dict): """