1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-05 12:00:58 +08:00

add highfreq example

This commit is contained in:
bxdd
2021-01-25 17:58:45 +00:00
parent 3f9f295a87
commit ffedb6382f
10 changed files with 585 additions and 29 deletions

View File

View File

@@ -0,0 +1,220 @@
from qlib.data.dataset.handler import DataHandler, DataHandlerLP
from qlib.data.dataset.processor import Processor
from qlib.utils import get_cls_kwargs
from qlib.log import TimeInspector
class HighFreqHandler(DataHandlerLP):
    """Handler producing normalised 1-minute price/volume expressions.

    The handler loads 13 expression columns through ``QlibDataLoader``
    (see :meth:`get_feature_config`) and forwards the user supplied
    processors to ``DataHandlerLP`` after injecting the fit window into
    each processor config.

    Parameters
    ----------
    instruments : str
        instrument pool forwarded to ``DataHandlerLP``.
    start_time, end_time :
        data range forwarded to ``DataHandlerLP``.
    freq : str
        data frequency forwarded to ``DataHandlerLP``.
    infer_processors, learn_processors : list, optional
        processor configs. ``None`` (the default) means "no processors".
        Every dict entry gets ``fit_start_time`` / ``fit_end_time`` written
        into its ``"kwargs"``; non-dict entries are passed through as-is.
    fit_start_time, fit_end_time :
        fit window injected into the processor configs.
    drop_raw : bool
        forwarded to ``DataHandlerLP``.
    """

    def __init__(
        self,
        instruments="csi500",
        start_time=None,
        end_time=None,
        freq="1min",
        infer_processors=None,
        learn_processors=None,
        fit_start_time=None,
        fit_end_time=None,
        drop_raw=True,
    ):
        def check_transform_proc(proc_l):
            """Return a copy of ``proc_l`` with the fit window injected."""
            new_l = []
            for p in proc_l or []:
                if isinstance(p, dict):
                    # Build fresh dicts so the caller's config is not mutated;
                    # the handler's fit window overrides any value already set.
                    kwargs = dict(p.get("kwargs") or {})
                    kwargs.update(
                        {
                            "fit_start_time": fit_start_time,
                            "fit_end_time": fit_end_time,
                        }
                    )
                    p = dict(p, kwargs=kwargs)
                new_l.append(p)
            return new_l

        # The processors passed by the caller must be forwarded (with the fit
        # window injected), not replaced by empty lists.
        infer_processors = check_transform_proc(infer_processors)
        learn_processors = check_transform_proc(learn_processors)

        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": self.get_feature_config(),
                "swap_level": False,
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            freq=freq,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            drop_raw=drop_raw,
        )

    def get_feature_config(self):
        """Build the expression strings and column names for the loader.

        Returns
        -------
        tuple of (list of str, list of str)
            ``(fields, names)`` in this fixed order: the five price columns
            (``$open, $high, $low, $close, $vwap``), the same five shifted by
            ``Ref(..., 240)`` (``*_1``), ``$volume``, ``$volume_1`` and
            ``date``. The ``HighFreqNorm`` processor of this example indexes
            columns by position, so the order must not change.
        """
        template_if = "If(IsNull({1}), {0}, {1})"
        template_paused = "Select(Eq($paused, 0.0), {0})"
        template_fillnan = "FFillNan({0})"
        # vwap falls back to the forward-filled close when it is null, +-inf,
        # zero, or outside [0.999 * low, 1.001 * high].
        template_vwap = "If(IsNull({1}), {0}, If(Or(Or(Or(Eq({1}, np.inf), Eq({1}, -np.inf)), Eq({1}, 0)), Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2})))), {0}, {1}))"
        # volume becomes 0 when it is null or when vwap is outside the same band.
        template_volume = "If(IsNull({1}), 0, If(Or(Gt({2}, Mul(1.001, {4})), Lt({2}, Mul(0.999, {3}))), 0, {1}))"

        def paused(field):
            # Keep only the rows where `$paused` equals 0.0.
            return template_paused.format(field)

        close = template_fillnan.format(paused("$close"))

        # Cleaned price expressions, in output column order.
        cleaned_prices = [
            template_if.format(close, paused("$open")),
            template_if.format(close, paused("$high")),
            template_if.format(close, paused("$low")),
            close,
            template_vwap.format(close, paused("$vwap"), paused("$low"), paused("$high")),
        ]

        # Every price is divided by Ref(DayLast(close), 240).
        # NOTE(review): 240 is presumably the number of 1-min bars per trading
        # day, making the divisor the previous day's last close - confirm.
        price_now = "{0}/Ref(DayLast({1}), 240)"
        price_lag = "Ref({0}, 240)/Ref(DayLast({1}), 240)"

        fields = [price_now.format(expr, close) for expr in cleaned_prices]
        names = ["$open", "$high", "$low", "$close", "$vwap"]

        fields += [price_lag.format(expr, close) for expr in cleaned_prices]
        names += ["$open_1", "$high_1", "$low_1", "$close_1", "$vwap_1"]

        volume = template_volume.format(
            close,
            paused("$volume"),
            paused("$vwap"),
            paused("$low"),
            paused("$high"),
        )
        fields += ["{0}/Ref(DayLast(Mean({0}, 7200)), 240)".format(volume)]
        names += ["$volume"]
        fields += ["Ref({0}, 240)/Ref(DayLast(Mean({0}, 7200)), 240)".format(volume)]
        names += ["$volume_1"]

        fields += [paused("Date($close)")]
        names += ["date"]

        return fields, names
class HighFreqBacktestHandler(DataHandler):
    """Handler exposing the raw close and volume columns used for backtesting.

    Parameters
    ----------
    instruments : str
        instrument pool forwarded to ``DataHandler``.
    start_time, end_time :
        data range forwarded to ``DataHandler``.
    freq : str
        data frequency forwarded to ``DataHandler``.
    """

    def __init__(
        self,
        instruments="csi300",
        start_time=None,
        end_time=None,
        freq="1min",
    ):
        # This handler takes no processors, so there is nothing to transform
        # before building the loader config.
        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": self.get_feature_config(),
                "swap_level": False,
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            freq=freq,
            data_loader=data_loader,
        )

    def get_feature_config(self):
        """Build the expression strings and column names for the loader.

        Returns
        -------
        tuple of (list of str, list of str)
            ``(fields, names)`` with two columns: ``$close0`` (forward-filled
            close of non-paused rows) and ``$volume0`` (volume, replaced by 0
            when it is null or when vwap is outside
            ``[0.999 * low, 1.001 * high]``).
        """
        template_paused = "Select(Eq($paused, 0.0), {0})"
        template_fillnan = "FFillNan({0})"

        close = template_fillnan.format(template_paused.format("$close"))

        fields = [close]
        names = ["$close0"]

        # `Eq(x, np.nan)` can never be true because NaN is not equal to itself,
        # so the null check uses IsNull.
        fields += [
            "If(IsNull({1}), 0, If(Or(Gt({2}, Mul(1.001, {4})), Lt({2}, Mul(0.999, {3}))), 0, {1}))".format(
                close,
                template_paused.format("$volume"),
                template_paused.format("$vwap"),
                template_paused.format("$low"),
                template_paused.format("$high"),
            )
        ]
        names += ["$volume0"]

        return fields, names

View File

@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd
import importlib
from qlib.data.ops import ElemOperator, PairOperator
from qlib.config import C
from qlib.data.data import Cal
class DayFirst(ElemOperator):
    """Operator replacing every value by the ``first`` value of its day group."""

    def __init__(self, feature):
        super().__init__(feature, "day_first")

    def _load_internal(self, instrument, start_index, end_index, freq):
        # NOTE(review): element 0 of `get_calender_day` is presumably an array
        # mapping calendar positions to their day - confirm against `Cal`.
        day_lookup = Cal.get_calender_day(freq=freq)[0]
        values = self.feature.load(instrument, start_index, end_index, freq)
        day_keys = day_lookup[values.index]
        per_day = values.groupby(day_keys)
        return per_day.transform("first")
class DayLast(ElemOperator):
    """Operator replacing every value by the ``last`` value of its day group."""

    def __init__(self, feature):
        super().__init__(feature, "day_last")

    def _load_internal(self, instrument, start_index, end_index, freq):
        # NOTE(review): element 0 of `get_calender_day` is presumably an array
        # mapping calendar positions to their day - confirm against `Cal`.
        day_lookup = Cal.get_calender_day(freq=freq)[0]
        values = self.feature.load(instrument, start_index, end_index, freq)
        day_keys = day_lookup[values.index]
        per_day = values.groupby(day_keys)
        return per_day.transform("last")
class FFillNan(ElemOperator):
    """Operator that forward-fills missing values of the wrapped feature."""

    def __init__(self, feature):
        super(FFillNan, self).__init__(feature, "fill_nan")

    def _load_internal(self, instrument, start_index, end_index, freq):
        series = self.feature.load(instrument, start_index, end_index, freq)
        # `Series.ffill()` is the non-deprecated spelling of
        # `fillna(method="ffill")` and yields the same result.
        return series.ffill()
class Date(ElemOperator):
    """Operator returning, for each row of the feature, its calendar-day value."""

    def __init__(self, feature):
        super().__init__(feature, "date")

    def _load_internal(self, instrument, start_index, end_index, freq):
        # NOTE(review): element 0 of `get_calender_day` is presumably an array
        # mapping calendar positions to their day - confirm against `Cal`.
        day_lookup = Cal.get_calender_day(freq=freq)[0]
        values = self.feature.load(instrument, start_index, end_index, freq)
        positions = values.index
        return pd.Series(day_lookup[positions], index=positions)
class Select(PairOperator):
    """Operator keeping only the rows of ``feature`` selected by ``condition``."""

    def __init__(self, condition, feature):
        super().__init__(condition, feature, "select")

    def _load_internal(self, instrument, start_index, end_index, freq):
        # Left operand is the condition, right operand the data to filter.
        mask = self.feature_left.load(instrument, start_index, end_index, freq)
        candidates = self.feature_right.load(instrument, start_index, end_index, freq)
        selected = candidates.loc[mask]
        return selected
class IsNull(ElemOperator):
    """Operator returning a boolean series marking the missing values."""

    def __init__(self, feature):
        super().__init__(feature, "isnull")

    def _load_internal(self, instrument, start_index, end_index, freq):
        values = self.feature.load(instrument, start_index, end_index, freq)
        missing = values.isnull()
        return missing

View File

@@ -0,0 +1,70 @@
import numpy as np
import pandas as pd
from qlib.data.dataset.processor import Processor
from qlib.log import TimeInspector
from qlib.data.dataset.utils import fetch_df_by_index
class HighFreqNorm(Processor):
    """Robust normalisation and per-day flattening of the high-freq features.

    ``fit`` estimates, for the price columns and the volume columns
    separately, a median and a scaled median absolute deviation over the fit
    window. ``__call__`` applies that scaling, compresses values beyond
    +-3 into the range up to +-3.5, and reshapes the result to one row per
    (instrument, date).

    Parameters
    ----------
    fit_start_time, fit_end_time :
        bounds of the ``datetime`` slice used by ``fit``.
    """

    # Positional column groups; both `fit` and `__call__` must agree on them.
    _COLUMN_GROUPS = {
        "price": slice(0, 10),
        "volume": slice(10, 12),
    }

    def __init__(self, fit_start_time, fit_end_time):
        self.fit_start_time = fit_start_time
        self.fit_end_time = fit_end_time

    def fit(self, df_features):
        """Estimate the per-group statistics on the fit window.

        Parameters
        ----------
        df_features : pd.DataFrame
            feature frame indexed (at least) by a ``datetime`` level.
        """
        fetch_df = fetch_df_by_index(df_features, slice(self.fit_start_time, self.fit_end_time), level="datetime")
        df_values = fetch_df.values
        self.feature_med = {}
        self.feature_std = {}
        self.feature_vmax = {}
        self.feature_vmin = {}
        for name, name_val in self._COLUMN_GROUPS.items():
            # NOTE(review): the frame presumably still carries the `date`
            # column here, which can make `.values` object-typed; cast the
            # numeric slice explicitly (this also yields a private copy).
            part_values = df_values[:, name_val].astype(np.float64)
            if name == "volume":
                # Same transform as in `__call__`, so the statistics match.
                part_values = np.log1p(part_values)
            self.feature_med[name] = np.nanmedian(part_values)
            part_values = part_values - self.feature_med[name]
            # 1.4826 * median absolute deviation; the epsilon avoids a zero scale.
            self.feature_std[name] = np.nanmedian(np.absolute(part_values)) * 1.4826 + 1e-12
            part_values = part_values / self.feature_std[name]
            self.feature_vmax[name] = np.nanmax(part_values)
            self.feature_vmin[name] = np.nanmin(part_values)

    def __call__(self, df_features):
        """Normalise ``df_features`` and flatten it to one row per day.

        Parameters
        ----------
        df_features : pd.DataFrame
            frame with a ``date`` column and a ``datetime`` index level; it is
            modified in place (``date`` is moved into the index).

        Returns
        -------
        pd.DataFrame
            frame with ``12 * 240`` columns named ``FEATURE_<i>``, sorted by
            index.
        """
        df_features.set_index("date", append=True, drop=True, inplace=True)
        df_values = df_features.values
        for name, name_val in self._COLUMN_GROUPS.items():
            # `part_values` is a view: the in-place operations below write
            # straight into `df_values`.
            part_values = df_values[:, name_val]
            if name == "volume":
                part_values[:] = np.log1p(part_values)
            part_values -= self.feature_med[name]
            part_values /= self.feature_std[name]
            # All masks are taken before any value is overwritten.
            slice0 = part_values > 3.0
            slice1 = part_values > 3.5
            slice2 = part_values < -3.0
            slice3 = part_values < -3.5
            part_values[slice0] = 3.0 + (part_values[slice0] - 3.0) / (self.feature_vmax[name] - 3) * 0.5
            part_values[slice1] = 3.5
            part_values[slice2] = -3.0 - (part_values[slice2] + 3.0) / (self.feature_vmin[name] + 3) * 0.5
            part_values[slice3] = -3.5

        idx = df_features.index.droplevel("datetime").drop_duplicates()
        # Columns 0-4 and 10 form the first group, columns 5-9 and 11 the
        # second; each group is flattened to 6 * 240 values per row.
        # NOTE(review): assumes exactly 240 rows per (instrument, date) - confirm.
        feat = df_values[:, [0, 1, 2, 3, 4, 10]].reshape(-1, 6 * 240)
        feat_1 = df_values[:, [5, 6, 7, 8, 9, 11]].reshape(-1, 6 * 240)
        df_new_features = pd.DataFrame(
            data=np.concatenate((feat, feat_1), axis=1),
            index=idx,
            columns=["FEATURE_%d" % i for i in range(12 * 240)],
        ).sort_index()
        return df_new_features

View File

@@ -0,0 +1,137 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
from pathlib import Path
import qlib
import pickle
import numpy as np
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import init_instance_by_config
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.ops import Operators
from qlib.data.data import Cal
from highfreq_ops import DayFirst, DayLast, FFillNan, Date, Select, IsNull
def save_dataset(dataset, path: [Path, str]):
    """
    Persist ``dataset`` by delegating to its ``to_pickle`` method.

    Parameters
    ----------
    dataset :
        object exposing ``to_pickle(path=...)``
    path : [Path, str]
        destination handed to ``to_pickle``
    """
    dump = dataset.to_pickle
    dump(path=path)
def load_dataset(path: [Path, str], init_type=DataHandlerLP.IT_LS):
    """
    Load a pickled dataset from ``path`` and initialise it.

    Parameters
    ----------
    path : [Path, str]
        path to load
    init_type : str
        - if `init_type` == DataHandlerLP.IT_FIT_SEQ:
            the input of `DataHandlerLP.fit` will be the output of the previous processor
        - if `init_type` == DataHandlerLP.IT_FIT_IND:
            the input of `DataHandlerLP.fit` will be the original df
        - if `init_type` == DataHandlerLP.IT_LS:
            The state of the object has been load by pickle

    Returns
    -------
    the unpickled dataset, after ``dataset.init(init_type=init_type)``
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(path, "rb") as fd:
        dataset = pickle.load(fd)
    dataset.init(init_type=init_type)
    return dataset
if __name__ == "__main__":
    # use default data
    provider_uri = "/mnt/v-xiabi/data/qlib/high_freq"  # target_dir
    # The custom operators must be registered here because the handlers'
    # expression strings refer to them by name.
    qlib.init(
        provider_uri=provider_uri,
        custom_ops=[DayFirst, DayLast, FFillNan, Date, Select, IsNull],
        redis_port=233,
        region=REG_CN,
        auto_mount=False,
    )

    MARKET = "csi300"
    BENCHMARK = "SH000300"

    ###################################
    # train model
    ###################################
    # Config for the feature handler; the handler writes the fit window into
    # the (empty) kwargs of the HighFreqNorm processor.
    DATA_HANDLER_CONFIG0 = {
        "start_time": "2017-01-01 00:00:00",
        "end_time": "2020-11-30 15:00:00",
        "freq": "1min",
        "fit_start_time": "2017-01-01 00:00:00",
        "fit_end_time": "2020-08-31 15:00:00",
        "instruments": "all",
        "infer_processors": [{"class": "HighFreqNorm", "module_path": "highfreq_processor", "kwargs": {}}],
    }
    # Config for the backtest handler (no processors, no fit window).
    DATA_HANDLER_CONFIG1 = {
        "start_time": "2017-01-01 00:00:00",
        "end_time": "2020-11-30 15:00:00",
        "freq": "1min",
        "instruments": "all",
    }

    task = {
        "dataset": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqHandler",
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG0,
                },
                "segments": {
                    "train": ("2017-01-01 00:00:00", "2020-08-31 15:00:00"),
                    "test": (
                        "2020-09-01 00:00:00",
                        "2020-11-30 15:00:00",
                    ),
                },
            },
        },
        # You should record the data in specific sequence
        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
        "dataset_backtest": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqBacktestHandler",
                    # Same module as HighFreqHandler above.
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG1,
                },
                "segments": {
                    "train": ("2017-01-01 00:00:00", "2020-08-31 15:00:00"),
                    "test": (
                        "2020-09-01 00:00:00",
                        "2020-11-30 15:00:00",
                    ),
                },
            },
        },
    }

    Cal.get_calender_day(freq="1min")  # TO FIX: load the calendar day for cache

    dataset = init_instance_by_config(task["dataset"])
    dataset_backtest = init_instance_by_config(task["dataset_backtest"])