1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-05 03:50:57 +08:00

add docs & fix reinit of datatset

This commit is contained in:
bxdd
2021-02-03 08:57:31 +00:00
committed by you-n-g
parent c71b645777
commit 4ed8b8e233
4 changed files with 96 additions and 16 deletions

View File

@@ -0,0 +1,28 @@
# High-Frequency Dataset
This dataset is an example for RL high frequency trading.
## Get High-Frequency Data
Get high-frequency data by running the following command:
```bash
python workflow.py get_data
```
## Dump & Reload & Reinitialize the Dataset
The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in the `workflow.py`. `DatatsetH` is the subclass of `qlib.utils.serial.Serializable`, which supports being dumped in or loaded from disk in `pickle` format.
### About Reinitialization
After reloading `Dataset` from disk, `Qlib` also support reinitialize the dataset. It means that users can reset some config of `Dataset` or `DataHandler` such as `instruments`, `start_time`, `end_time` and `segmens`, etc.
The example is given in `workflow.py`, users can run the code as follows.
### Run the Code
Run the example by running the following command:
```bash
python workflow.py dump_and_load_dataset
```

View File

@@ -9,7 +9,7 @@ import qlib
import pickle
import numpy as np
import pandas as pd
from qlib.config import HIGH_FREQ_CONFIG
from qlib.config import REG_CN, HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
@@ -26,7 +26,6 @@ from qlib.tests.data import GetData
from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull
class HighfreqWorkflow(object):
SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}
@@ -123,8 +122,7 @@ class HighfreqWorkflow(object):
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
print(backtest_train, backtest_test)
del xtrain, xtest
del backtest_train, backtest_test
return
def dump_and_load_dataset(self):
"""dump and load dataset state on disk"""
@@ -146,18 +144,39 @@ class HighfreqWorkflow(object):
dataset_backtest = pickle.load(file_dataset_backtest)
self._prepare_calender_cache()
##=============reload_dataset=============
dataset.init(init_type=DataHandlerLP.IT_LS)
dataset_backtest.init()
##=============reinit dataset=============
dataset.init(
handler_kwargs = {
"init_type" : DataHandlerLP.IT_LS,
"start_time" : "2021-01-19 00:00:00",
"end_time" : "2021-01-25 16:00:00",
},
segment_kwargs = {
"test": (
"2021-01-19 00:00:00",
"2021-01-25 16:00:00",
),
}
)
dataset_backtest.init(
handler_kwargs = {
"start_time" : "2021-01-19 00:00:00",
"end_time" : "2021-01-25 16:00:00",
},
segment_kwargs = {
"test": (
"2021-01-19 00:00:00",
"2021-01-25 16:00:00",
),
}
)
##=============get data=============
xtrain, xtest = dataset.prepare(["train", "test"])
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
xtest = dataset.prepare(["test"])
backtest_test = dataset_backtest.prepare(["test"])
print(xtrain, xtest)
print(backtest_train, backtest_test)
del xtrain, xtest
del backtest_train, backtest_test
print(xtest, backtest_test)
return
if __name__ == "__main__":

View File

@@ -87,9 +87,42 @@ class DatasetH(Dataset):
"""
super().__init__(handler, segments)
def init(self, **kwargs):
"""Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
self.handler.init(**kwargs)
def init(self, handler_kwargs:dict = None, segment_kwargs:dict = None):
"""
Initialize the DatasetH
Parameters
----------
handler_kwargs : dict
Config of DataHanlder, which could include the following arguments:
- arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'.
- arguments of DataHandler.init, such as 'enable_cache', etc.
segment_kwargs : dict
Config of segments which is same as 'segments' in DatasetH.setup_data
"""
if handler_kwargs:
if not isinstance(handler_kwargs, dict):
raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}")
kwargs_init = {}
kwargs_conf_data = {}
conf_data_arg = {"instruments", "start_time", "end_time"}
for k, v in handler_kwargs.items():
if k in conf_data_arg:
kwargs_conf_data.update({k:v})
else:
kwargs_init.update({k:v})
self.handler.conf_data(**kwargs_conf_data)
self.handler.init(**kwargs_init)
if segment_kwargs:
if not isinstance(segment_kwargs, dict):
raise TypeError(f"param handler_kwargs must be type dict, not {type(segment_kwargs)}")
self.segments = segment_kwargs.copy()
def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
"""