1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 11:00:57 +08:00

Fix processor bug and format

This commit is contained in:
Jactus
2020-11-11 14:24:04 +08:00
parent e2d89f44fb
commit 52c0c4b7a8
8 changed files with 114 additions and 101 deletions

View File

@@ -36,15 +36,14 @@ if __name__ == "__main__":
MARKET = "csi300"
BENCHMARK = "SH000300"
###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time":"2008-01-01",
"fit_end_time":"2014-12-31",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": MARKET,
}
@@ -69,37 +68,43 @@ if __name__ == "__main__":
"n_epochs": 2000,
"lr": 1e-1,
"early_stop": 200,
"batch_size":800,
"batch_size": 800,
"smooth_steps": 5,
"metric": "mse",
"loss": "mse",
"seed": 0,
"GPU": 0,
}
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
'handler': {
"handler": {
"class": "ALPHA360",
"module_path": "qlib.contrib.data.handler",
"kwargs": DATA_HANDLER_CONFIG
"kwargs": DATA_HANDLER_CONFIG,
},
'segments': {
'train': ("2008-01-01", "2014-12-31"),
'valid': ("2015-01-01", "2016-12-31",),
'test': ("2017-01-01", "2020-08-01",),
}
}
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": (
"2015-01-01",
"2016-12-31",
),
"test": (
"2017-01-01",
"2020-08-01",
),
},
},
}
# You should record the data in a specific sequence
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
# model = train_model(task)
model = init_instance_by_config(task['model'])
dataset = init_instance_by_config(task['dataset'])
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset)

View File

@@ -21,7 +21,6 @@ from qlib.utils import init_instance_by_config
if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
@@ -36,15 +35,14 @@ if __name__ == "__main__":
MARKET = "csi300"
BENCHMARK = "SH000300"
###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time":"2008-01-01",
"fit_end_time":"2014-12-31",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": MARKET,
}
@@ -62,43 +60,49 @@ if __name__ == "__main__":
"class": "XGBModel",
"module_path": "qlib.contrib.model.xgboost",
"kwargs": {
"objective": 'reg:linear',
"n_estimators":5000,
"objective": "reg:linear",
"n_estimators": 5000,
"colsample_bytree": 0.85,
"learning_rate": 0.0421,
"subsample": 0.8789,
"max_depth": 8,
"num_leaves": 210,
"num_threads": 20,
"missing":-1,
"min_child_weight":1,
"nthread":4,
"tree_method":'hist',
}
"missing": -1,
"min_child_weight": 1,
"nthread": 4,
"tree_method": "hist",
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
'handler': {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": DATA_HANDLER_CONFIG
"kwargs": DATA_HANDLER_CONFIG,
},
'segments': {
'train': ("2008-01-01", "2014-12-31"),
'valid': ("2015-01-01", "2016-12-31",),
'test': ("2017-01-01", "2020-08-01",),
}
}
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": (
"2015-01-01",
"2016-12-31",
),
"test": (
"2017-01-01",
"2020-08-01",
),
},
},
}
# You should record the data in a specific sequence
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
# model = train_model(task)
model = init_instance_by_config(task['model'])
dataset = init_instance_by_config(task['dataset'])
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset)
pred_score = model.predict(dataset)

View File

@@ -8,15 +8,9 @@ from ...data.dataset import processor as processor_module
from ...log import TimeInspector
import copy
class ALPHA360(DataHandlerLP):
def __init__(
self,
instruments="csi500",
start_time=None,
end_time=None,
fit_start_time=None,
fit_end_time=None
):
def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
@@ -28,22 +22,22 @@ class ALPHA360(DataHandlerLP):
}
learn_processors = [
{"class": "DropnaLabel", "kwargs": {'group': 'label'}},
{"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
{"class": "DropnaLabel", "kwargs": {"group": "label"}},
{"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
]
infer_processors = [
{"class": "ProcessInf", "kwargs": {}},
{"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
{"class": "Fillna", "kwargs": {}},
{"class": "ProcessInf", "kwargs": {}},
{"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
{"class": "Fillna", "kwargs": {}},
]
super().__init__(
instruments,
start_time,
end_time,
data_loader=data_loader,
learn_processors=learn_processors,
infer_processors=infer_processors
instruments,
start_time,
end_time,
data_loader=data_loader,
learn_processors=learn_processors,
infer_processors=infer_processors,
)
def get_label_config(self):
@@ -54,19 +48,19 @@ class ALPHA360(DataHandlerLP):
fields = []
names = []
for i in range(59,0,-1):
fields += ["Ref($close, %d)/$close"%(i)]
names += ["CLOSE%d"%(i)]
fields += ["Ref($open, %d)/$close"%(i)]
names += ["OPEN%d"%(i)]
fields += ["Ref($high, %d)/$close"%(i)]
names += ["HIGH%d"%(i)]
fields += ["Ref($low, %d)/$close"%(i)]
names += ["LOW%d"%(i)]
fields += ["Ref($vwap, %d)/$close"%(i)]
names += ["VWAP%d"%(i)]
fields += ["Ref($volume, %d)/$volume"%(i)]
names += ["VOLUME%d"%(i)]
for i in range(59, 0, -1):
fields += ["Ref($close, %d)/$close" % (i)]
names += ["CLOSE%d" % (i)]
fields += ["Ref($open, %d)/$close" % (i)]
names += ["OPEN%d" % (i)]
fields += ["Ref($high, %d)/$close" % (i)]
names += ["HIGH%d" % (i)]
fields += ["Ref($low, %d)/$close" % (i)]
names += ["LOW%d" % (i)]
fields += ["Ref($vwap, %d)/$close" % (i)]
names += ["VWAP%d" % (i)]
fields += ["Ref($volume, %d)/$volume" % (i)]
names += ["VOLUME%d" % (i)]
fields += ["$close/$close"]
fields += ["$open/$close"]

View File

@@ -22,6 +22,7 @@ from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class GRU(Model):
"""GRU Model
@@ -127,7 +128,9 @@ class GRU(Model):
raise NotImplementedError("loss {} is not supported!".format(loss))
self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
self.gru_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout)
self.gru_model = GRUModel(
d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.gru_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
@@ -262,7 +265,7 @@ class GRU(Model):
def get_loss(self, pred, target, loss_type):
if loss_type == "mse":
sqr_loss = (pred - target)**2
sqr_loss = (pred - target) ** 2
loss = sqr_loss.mean()
return loss
elif loss_type == "binary":
@@ -307,6 +310,7 @@ class GRU(Model):
self.gru_model.load_state_dict(torch.load(_model_path))
self._fitted = True
class AverageMeter(object):
"""Computes and stores the average and current value"""
@@ -327,7 +331,6 @@ class AverageMeter(object):
class GRUModel(nn.Module):
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
super().__init__()
@@ -344,8 +347,7 @@ class GRUModel(nn.Module):
def forward(self, x):
# x: [N, F*T]
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
x = x.permute(0, 2, 1) # [N, T, F]
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
x = x.permute(0, 2, 1) # [N, T, F]
out, _ = self.rnn(x)
return self.fc_out(out[:, -1, :]).squeeze()

View File

@@ -41,14 +41,14 @@ class XGBModel(Model):
y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
else:
raise ValueError("XGBoost doesn't support multi-label training")
dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
self.model = xgb.train(
self._params,
dtrain=dtrain,
num_boost_round=num_boost_round,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
evals=[(dtrain, "train"), (dvalid, "valid")],
early_stopping_rounds=early_stopping_rounds,
verbose_eval=verbose_eval,
evals_result=evals_result,

View File

@@ -16,7 +16,7 @@ from ...data import D
from ...config import C
from ...utils import parse_config, transform_end_date, init_instance_by_config
from ...utils.serial import Serializable
from .utils import get_level_index
from .utils import get_level_index, fetch_df_by_index
from pathlib import Path
from .loader import DataLoader
@@ -99,25 +99,6 @@ class DataHandler(Serializable):
self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
# TODO: cache
def _fetch_df_by_index(
self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]
) -> pd.DataFrame:
"""
fetch data from `data` with `selector` and `level`
Parameters
----------
selector : Union[pd.Timestamp, slice, str, list]
selector
level : Union[int, str]
the level to use the selector
"""
# Try to get the right index
idx_slc = (selector, slice(None, None))
if get_level_index(df, level) == 1:
idx_slc = idx_slc[1], idx_slc[0]
return df.loc(axis=0)[idx_slc]
CS_ALL = "__all"
def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame:
@@ -156,7 +137,7 @@ class DataHandler(Serializable):
-------
pd.DataFrame:
"""
df = self._fetch_df_by_index(self._data, selector, level)
df = fetch_df_by_index(self._data, selector, level)
df = self._fetch_df_by_col(df, col_set)
if squeeze:
# squeeze columns
@@ -414,7 +395,7 @@ class DataHandlerLP(DataHandler):
pd.DataFrame:
"""
df = self._get_df_by_key(data_key)
df = self._fetch_df_by_index(df, selector, level)
df = fetch_df_by_index(df, selector, level)
return self._fetch_df_by_col(df, col_set)
def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list:

View File

@@ -7,6 +7,7 @@ import pandas as pd
import copy
from ...log import TimeInspector
from .utils import fetch_df_by_index
from ...utils.serial import Serializable
from ...utils.paral import datetime_groupby_apply
@@ -106,6 +107,7 @@ class ProcessInf(Processor):
return replace_inf(df)
class Fillna(Processor):
"""Process infinity """
@@ -123,14 +125,15 @@ class Fillna(Processor):
return fill_na(df)
class MinMaxNorm(Processor):
def __init__(self, fit_start_time, fit_end_time, fields_group=None):
# FIXME: time is not used
self.fit_start_time = fit_start_time
self.fit_end_time = fit_end_time
self.fields_group = fields_group
def fit(self, df):
df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
cols = get_group_columns(df, self.fields_group)
self.min_val = np.nanmin(df[cols].values, axis=0)
self.max_val = np.nanmax(df[cols].values, axis=0)
@@ -152,15 +155,15 @@ class MinMaxNorm(Processor):
class ZscoreNorm(Processor):
def __init__(self, fit_start_time, fit_end_time, fields_group=None):
# FIXME: time is not used
self.fit_start_time = fit_start_time
self.fit_end_time = fit_end_time
self.fields_group = fields_group
def fit(self, df):
df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
cols = get_group_columns(df, self.fields_group)
self.mean_train = np.nanmean(df[cols].values, axis=0)
self.std_train = np.nanstd(df[cols].values, axis=0)
self.std_train = np.nanstd(_df[cols].values, axis=0)
self.ignore = self.std_train == 0
self.cols = cols

View File

@@ -29,3 +29,27 @@ def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int:
return level
else:
raise NotImplementedError(f"This type of input is not supported")
def fetch_df_by_index(
    df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]
) -> pd.DataFrame:
    """
    Select rows of `df` by applying `selector` to one level of its row index.

    Parameters
    ----------
    df : pd.DataFrame
        the frame to select rows from.
        NOTE(review): the key built below has exactly two slots, so this
        presumably expects a two-level row index -- confirm against callers.
    selector : Union[pd.Timestamp, slice, str, list]
        selector
    level : Union[int, str]
        the level to use the selector

    Returns
    -------
    pd.DataFrame
        Data of the given index.
    """
    take_all = slice(None, None)
    # Put `selector` in the slot of the level resolved by `get_level_index`;
    # the other slot keeps every entry.
    if get_level_index(df, level) == 1:
        row_key = (take_all, selector)
    else:
        row_key = (selector, take_all)
    return df.loc(axis=0)[row_key]