mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-03 11:00:57 +08:00
Fix processor bug and format
This commit is contained in:
@@ -36,15 +36,14 @@ if __name__ == "__main__":
|
||||
MARKET = "csi300"
|
||||
BENCHMARK = "SH000300"
|
||||
|
||||
|
||||
###################################
|
||||
# train model
|
||||
###################################
|
||||
DATA_HANDLER_CONFIG = {
|
||||
"start_time": "2008-01-01",
|
||||
"end_time": "2020-08-01",
|
||||
"fit_start_time":"2008-01-01",
|
||||
"fit_end_time":"2014-12-31",
|
||||
"fit_start_time": "2008-01-01",
|
||||
"fit_end_time": "2014-12-31",
|
||||
"instruments": MARKET,
|
||||
}
|
||||
|
||||
@@ -69,37 +68,43 @@ if __name__ == "__main__":
|
||||
"n_epochs": 2000,
|
||||
"lr": 1e-1,
|
||||
"early_stop": 200,
|
||||
"batch_size":800,
|
||||
"batch_size": 800,
|
||||
"smooth_steps": 5,
|
||||
"metric": "mse",
|
||||
"loss": "mse",
|
||||
"seed": 0,
|
||||
"GPU": 0,
|
||||
}
|
||||
},
|
||||
},
|
||||
"dataset": {
|
||||
"class": "DatasetH",
|
||||
"module_path": "qlib.data.dataset",
|
||||
"kwargs": {
|
||||
'handler': {
|
||||
"handler": {
|
||||
"class": "ALPHA360",
|
||||
"module_path": "qlib.contrib.data.handler",
|
||||
"kwargs": DATA_HANDLER_CONFIG
|
||||
"kwargs": DATA_HANDLER_CONFIG,
|
||||
},
|
||||
'segments': {
|
||||
'train': ("2008-01-01", "2014-12-31"),
|
||||
'valid': ("2015-01-01", "2016-12-31",),
|
||||
'test': ("2017-01-01", "2020-08-01",),
|
||||
}
|
||||
}
|
||||
"segments": {
|
||||
"train": ("2008-01-01", "2014-12-31"),
|
||||
"valid": (
|
||||
"2015-01-01",
|
||||
"2016-12-31",
|
||||
),
|
||||
"test": (
|
||||
"2017-01-01",
|
||||
"2020-08-01",
|
||||
),
|
||||
},
|
||||
},
|
||||
}
|
||||
# You should record the data in a specific sequence
|
||||
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
|
||||
}
|
||||
|
||||
# model = train_model(task)
|
||||
model = init_instance_by_config(task['model'])
|
||||
dataset = init_instance_by_config(task['dataset'])
|
||||
model = init_instance_by_config(task["model"])
|
||||
dataset = init_instance_by_config(task["dataset"])
|
||||
|
||||
model.fit(dataset)
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ from qlib.utils import init_instance_by_config
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
||||
# use default data
|
||||
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
|
||||
if not exists_qlib_data(provider_uri):
|
||||
@@ -36,15 +35,14 @@ if __name__ == "__main__":
|
||||
MARKET = "csi300"
|
||||
BENCHMARK = "SH000300"
|
||||
|
||||
|
||||
###################################
|
||||
# train model
|
||||
###################################
|
||||
DATA_HANDLER_CONFIG = {
|
||||
"start_time": "2008-01-01",
|
||||
"end_time": "2020-08-01",
|
||||
"fit_start_time":"2008-01-01",
|
||||
"fit_end_time":"2014-12-31",
|
||||
"fit_start_time": "2008-01-01",
|
||||
"fit_end_time": "2014-12-31",
|
||||
"instruments": MARKET,
|
||||
}
|
||||
|
||||
@@ -62,43 +60,49 @@ if __name__ == "__main__":
|
||||
"class": "XGBModel",
|
||||
"module_path": "qlib.contrib.model.xgboost",
|
||||
"kwargs": {
|
||||
"objective": 'reg:linear',
|
||||
"n_estimators":5000,
|
||||
"objective": "reg:linear",
|
||||
"n_estimators": 5000,
|
||||
"colsample_bytree": 0.85,
|
||||
"learning_rate": 0.0421,
|
||||
"subsample": 0.8789,
|
||||
"max_depth": 8,
|
||||
"num_leaves": 210,
|
||||
"num_threads": 20,
|
||||
"missing":-1,
|
||||
"min_child_weight":1,
|
||||
"nthread":4,
|
||||
"tree_method":'hist',
|
||||
}
|
||||
"missing": -1,
|
||||
"min_child_weight": 1,
|
||||
"nthread": 4,
|
||||
"tree_method": "hist",
|
||||
},
|
||||
},
|
||||
"dataset": {
|
||||
"class": "DatasetH",
|
||||
"module_path": "qlib.data.dataset",
|
||||
"kwargs": {
|
||||
'handler': {
|
||||
"handler": {
|
||||
"class": "Alpha158",
|
||||
"module_path": "qlib.contrib.data.handler",
|
||||
"kwargs": DATA_HANDLER_CONFIG
|
||||
"kwargs": DATA_HANDLER_CONFIG,
|
||||
},
|
||||
'segments': {
|
||||
'train': ("2008-01-01", "2014-12-31"),
|
||||
'valid': ("2015-01-01", "2016-12-31",),
|
||||
'test': ("2017-01-01", "2020-08-01",),
|
||||
}
|
||||
}
|
||||
"segments": {
|
||||
"train": ("2008-01-01", "2014-12-31"),
|
||||
"valid": (
|
||||
"2015-01-01",
|
||||
"2016-12-31",
|
||||
),
|
||||
"test": (
|
||||
"2017-01-01",
|
||||
"2020-08-01",
|
||||
),
|
||||
},
|
||||
},
|
||||
}
|
||||
# You should record the data in a specific sequence
|
||||
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
|
||||
}
|
||||
|
||||
# model = train_model(task)
|
||||
model = init_instance_by_config(task['model'])
|
||||
dataset = init_instance_by_config(task['dataset'])
|
||||
model = init_instance_by_config(task["model"])
|
||||
dataset = init_instance_by_config(task["dataset"])
|
||||
|
||||
model.fit(dataset)
|
||||
pred_score = model.predict(dataset)
|
||||
|
||||
@@ -8,15 +8,9 @@ from ...data.dataset import processor as processor_module
|
||||
from ...log import TimeInspector
|
||||
import copy
|
||||
|
||||
|
||||
class ALPHA360(DataHandlerLP):
|
||||
def __init__(
|
||||
self,
|
||||
instruments="csi500",
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
fit_start_time=None,
|
||||
fit_end_time=None
|
||||
):
|
||||
def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
|
||||
data_loader = {
|
||||
"class": "QlibDataLoader",
|
||||
"kwargs": {
|
||||
@@ -28,22 +22,22 @@ class ALPHA360(DataHandlerLP):
|
||||
}
|
||||
|
||||
learn_processors = [
|
||||
{"class": "DropnaLabel", "kwargs": {'group': 'label'}},
|
||||
{"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
|
||||
{"class": "DropnaLabel", "kwargs": {"group": "label"}},
|
||||
{"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
|
||||
]
|
||||
infer_processors = [
|
||||
{"class": "ProcessInf", "kwargs": {}},
|
||||
{"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
|
||||
{"class": "Fillna", "kwargs": {}},
|
||||
{"class": "ProcessInf", "kwargs": {}},
|
||||
{"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
|
||||
{"class": "Fillna", "kwargs": {}},
|
||||
]
|
||||
|
||||
super().__init__(
|
||||
instruments,
|
||||
start_time,
|
||||
end_time,
|
||||
data_loader=data_loader,
|
||||
learn_processors=learn_processors,
|
||||
infer_processors=infer_processors
|
||||
instruments,
|
||||
start_time,
|
||||
end_time,
|
||||
data_loader=data_loader,
|
||||
learn_processors=learn_processors,
|
||||
infer_processors=infer_processors,
|
||||
)
|
||||
|
||||
def get_label_config(self):
|
||||
@@ -54,19 +48,19 @@ class ALPHA360(DataHandlerLP):
|
||||
fields = []
|
||||
names = []
|
||||
|
||||
for i in range(59,0,-1):
|
||||
fields += ["Ref($close, %d)/$close"%(i)]
|
||||
names += ["CLOSE%d"%(i)]
|
||||
fields += ["Ref($open, %d)/$close"%(i)]
|
||||
names += ["OPEN%d"%(i)]
|
||||
fields += ["Ref($high, %d)/$close"%(i)]
|
||||
names += ["HIGH%d"%(i)]
|
||||
fields += ["Ref($low, %d)/$close"%(i)]
|
||||
names += ["LOW%d"%(i)]
|
||||
fields += ["Ref($vwap, %d)/$close"%(i)]
|
||||
names += ["VWAP%d"%(i)]
|
||||
fields += ["Ref($volume, %d)/$volume"%(i)]
|
||||
names += ["VOLUME%d"%(i)]
|
||||
for i in range(59, 0, -1):
|
||||
fields += ["Ref($close, %d)/$close" % (i)]
|
||||
names += ["CLOSE%d" % (i)]
|
||||
fields += ["Ref($open, %d)/$close" % (i)]
|
||||
names += ["OPEN%d" % (i)]
|
||||
fields += ["Ref($high, %d)/$close" % (i)]
|
||||
names += ["HIGH%d" % (i)]
|
||||
fields += ["Ref($low, %d)/$close" % (i)]
|
||||
names += ["LOW%d" % (i)]
|
||||
fields += ["Ref($vwap, %d)/$close" % (i)]
|
||||
names += ["VWAP%d" % (i)]
|
||||
fields += ["Ref($volume, %d)/$volume" % (i)]
|
||||
names += ["VOLUME%d" % (i)]
|
||||
|
||||
fields += ["$close/$close"]
|
||||
fields += ["$open/$close"]
|
||||
|
||||
@@ -22,6 +22,7 @@ from ...model.base import Model
|
||||
from ...data.dataset import DatasetH
|
||||
from ...data.dataset.handler import DataHandlerLP
|
||||
|
||||
|
||||
class GRU(Model):
|
||||
"""GRU Model
|
||||
|
||||
@@ -127,7 +128,9 @@ class GRU(Model):
|
||||
raise NotImplementedError("loss {} is not supported!".format(loss))
|
||||
self._scorer = mean_squared_error if loss == "mse" else roc_auc_score
|
||||
|
||||
self.gru_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout)
|
||||
self.gru_model = GRUModel(
|
||||
d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout
|
||||
)
|
||||
if optimizer.lower() == "adam":
|
||||
self.train_optimizer = optim.Adam(self.gru_model.parameters(), lr=self.lr)
|
||||
elif optimizer.lower() == "gd":
|
||||
@@ -262,7 +265,7 @@ class GRU(Model):
|
||||
|
||||
def get_loss(self, pred, target, loss_type):
|
||||
if loss_type == "mse":
|
||||
sqr_loss = (pred - target)**2
|
||||
sqr_loss = (pred - target) ** 2
|
||||
loss = sqr_loss.mean()
|
||||
return loss
|
||||
elif loss_type == "binary":
|
||||
@@ -307,6 +310,7 @@ class GRU(Model):
|
||||
self.gru_model.load_state_dict(torch.load(_model_path))
|
||||
self._fitted = True
|
||||
|
||||
|
||||
class AverageMeter(object):
|
||||
"""Computes and stores the average and current value"""
|
||||
|
||||
@@ -327,7 +331,6 @@ class AverageMeter(object):
|
||||
|
||||
|
||||
class GRUModel(nn.Module):
|
||||
|
||||
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
|
||||
super().__init__()
|
||||
|
||||
@@ -344,8 +347,7 @@ class GRUModel(nn.Module):
|
||||
|
||||
def forward(self, x):
|
||||
# x: [N, F*T]
|
||||
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
|
||||
x = x.permute(0, 2, 1) # [N, T, F]
|
||||
x = x.reshape(len(x), self.d_feat, -1) # [N, F, T]
|
||||
x = x.permute(0, 2, 1) # [N, T, F]
|
||||
out, _ = self.rnn(x)
|
||||
return self.fc_out(out[:, -1, :]).squeeze()
|
||||
|
||||
|
||||
@@ -41,14 +41,14 @@ class XGBModel(Model):
|
||||
y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values)
|
||||
else:
|
||||
raise ValueError("XGBoost doesn't support multi-label training")
|
||||
|
||||
|
||||
dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
|
||||
dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
|
||||
self.model = xgb.train(
|
||||
self._params,
|
||||
dtrain=dtrain,
|
||||
num_boost_round=num_boost_round,
|
||||
evals=[(dtrain, 'train'), (dvalid, 'valid')],
|
||||
evals=[(dtrain, "train"), (dvalid, "valid")],
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
verbose_eval=verbose_eval,
|
||||
evals_result=evals_result,
|
||||
|
||||
@@ -16,7 +16,7 @@ from ...data import D
|
||||
from ...config import C
|
||||
from ...utils import parse_config, transform_end_date, init_instance_by_config
|
||||
from ...utils.serial import Serializable
|
||||
from .utils import get_level_index
|
||||
from .utils import get_level_index, fetch_df_by_index
|
||||
from pathlib import Path
|
||||
from .loader import DataLoader
|
||||
|
||||
@@ -99,25 +99,6 @@ class DataHandler(Serializable):
|
||||
self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time)
|
||||
# TODO: cache
|
||||
|
||||
def _fetch_df_by_index(
|
||||
self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
fetch data from `data` with `selector` and `level`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
selector : Union[pd.Timestamp, slice, str, list]
|
||||
selector
|
||||
level : Union[int, str]
|
||||
the level to use the selector
|
||||
"""
|
||||
# Try to get the right index
|
||||
idx_slc = (selector, slice(None, None))
|
||||
if get_level_index(df, level) == 1:
|
||||
idx_slc = idx_slc[1], idx_slc[0]
|
||||
return df.loc(axis=0)[idx_slc]
|
||||
|
||||
CS_ALL = "__all"
|
||||
|
||||
def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame:
|
||||
@@ -156,7 +137,7 @@ class DataHandler(Serializable):
|
||||
-------
|
||||
pd.DataFrame:
|
||||
"""
|
||||
df = self._fetch_df_by_index(self._data, selector, level)
|
||||
df = fetch_df_by_index(self._data, selector, level)
|
||||
df = self._fetch_df_by_col(df, col_set)
|
||||
if squeeze:
|
||||
# squeeze columns
|
||||
@@ -414,7 +395,7 @@ class DataHandlerLP(DataHandler):
|
||||
pd.DataFrame:
|
||||
"""
|
||||
df = self._get_df_by_key(data_key)
|
||||
df = self._fetch_df_by_index(df, selector, level)
|
||||
df = fetch_df_by_index(df, selector, level)
|
||||
return self._fetch_df_by_col(df, col_set)
|
||||
|
||||
def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list:
|
||||
|
||||
@@ -7,6 +7,7 @@ import pandas as pd
|
||||
import copy
|
||||
|
||||
from ...log import TimeInspector
|
||||
from .utils import fetch_df_by_index
|
||||
from ...utils.serial import Serializable
|
||||
from ...utils.paral import datetime_groupby_apply
|
||||
|
||||
@@ -106,6 +107,7 @@ class ProcessInf(Processor):
|
||||
|
||||
return replace_inf(df)
|
||||
|
||||
|
||||
class Fillna(Processor):
|
||||
"""Process infinity """
|
||||
|
||||
@@ -123,14 +125,15 @@ class Fillna(Processor):
|
||||
|
||||
return fill_na(df)
|
||||
|
||||
|
||||
class MinMaxNorm(Processor):
|
||||
def __init__(self, fit_start_time, fit_end_time, fields_group=None):
|
||||
# FIXME: time is not used
|
||||
self.fit_start_time = fit_start_time
|
||||
self.fit_end_time = fit_end_time
|
||||
self.fields_group = fields_group
|
||||
|
||||
def fit(self, df):
|
||||
df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
|
||||
cols = get_group_columns(df, self.fields_group)
|
||||
self.min_val = np.nanmin(df[cols].values, axis=0)
|
||||
self.max_val = np.nanmax(df[cols].values, axis=0)
|
||||
@@ -152,15 +155,15 @@ class MinMaxNorm(Processor):
|
||||
|
||||
class ZscoreNorm(Processor):
|
||||
def __init__(self, fit_start_time, fit_end_time, fields_group=None):
|
||||
# FIXME: time is not used
|
||||
self.fit_start_time = fit_start_time
|
||||
self.fit_end_time = fit_end_time
|
||||
self.fields_group = fields_group
|
||||
|
||||
def fit(self, df):
    """
    Estimate the per-column mean and standard deviation used for z-score normalization.

    Only the rows whose `datetime` index level falls inside
    [`self.fit_start_time`, `self.fit_end_time`] contribute to the statistics.

    Parameters
    ----------
    df : pd.DataFrame
        data to fit on; it must be indexable by a `datetime` level.

    Notes
    -----
    Sets `self.mean_train`, `self.std_train`, `self.ignore` and `self.cols`.
    """
    # Restrict the data to the fitting window before computing any statistic.
    df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
    cols = get_group_columns(df, self.fields_group)
    # Both statistics must come from the same windowed frame; the previous code
    # read the std from `_df`, a name that is not defined in this scope.
    values = df[cols].values
    self.mean_train = np.nanmean(values, axis=0)
    self.std_train = np.nanstd(values, axis=0)
    # Flags zero-variance columns (presumably so the transform can skip them
    # instead of dividing by zero -- confirm against `__call__`).
    self.ignore = self.std_train == 0
    self.cols = cols
|
||||
|
||||
|
||||
@@ -29,3 +29,27 @@ def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int:
|
||||
return level
|
||||
else:
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
|
||||
|
||||
def fetch_df_by_index(
    df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]
) -> pd.DataFrame:
    """
    Select rows of `df` by applying `selector` to one level of its row index.

    Parameters
    ----------
    df : pd.DataFrame
        data to select from. The lookup key built below has two elements,
        so a two-level row index is assumed.
    selector : Union[pd.Timestamp, slice, str, list]
        the selection applied on `level`.
    level : Union[int, str]
        the index level (name or position) the selector refers to.

    Returns
    -------
    pd.DataFrame
        the rows of `df` matching `selector` on `level`; the other level is kept whole.
    """
    everything = slice(None, None)
    # Put the selector in the slot of the requested level and take the
    # other level in full.
    if get_level_index(df, level) == 1:
        key = (everything, selector)
    else:
        key = (selector, everything)
    return df.loc(axis=0)[key]
|
||||
|
||||
Reference in New Issue
Block a user