diff --git a/examples/workflow_by_code_gru.py b/examples/workflow_by_code_gru.py index 2bcbe2aa6..52d3c451a 100755 --- a/examples/workflow_by_code_gru.py +++ b/examples/workflow_by_code_gru.py @@ -36,15 +36,14 @@ if __name__ == "__main__": MARKET = "csi300" BENCHMARK = "SH000300" - ################################### # train model ################################### DATA_HANDLER_CONFIG = { "start_time": "2008-01-01", "end_time": "2020-08-01", - "fit_start_time":"2008-01-01", - "fit_end_time":"2014-12-31", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", "instruments": MARKET, } @@ -69,37 +68,43 @@ if __name__ == "__main__": "n_epochs": 2000, "lr": 1e-1, "early_stop": 200, - "batch_size":800, + "batch_size": 800, "smooth_steps": 5, "metric": "mse", "loss": "mse", "seed": 0, "GPU": 0, - } + }, }, "dataset": { "class": "DatasetH", "module_path": "qlib.data.dataset", "kwargs": { - 'handler': { + "handler": { "class": "ALPHA360", "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG + "kwargs": DATA_HANDLER_CONFIG, }, - 'segments': { - 'train': ("2008-01-01", "2014-12-31"), - 'valid': ("2015-01-01", "2016-12-31",), - 'test': ("2017-01-01", "2020-08-01",), - } - } + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ( + "2015-01-01", + "2016-12-31", + ), + "test": ( + "2017-01-01", + "2020-08-01", + ), + }, + }, } # You shoud record the data in specific sequence # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], } # model = train_model(task) - model = init_instance_by_config(task['model']) - dataset = init_instance_by_config(task['dataset']) + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) model.fit(dataset) diff --git a/examples/workflow_by_code_xgboost.py b/examples/workflow_by_code_xgboost.py index 0eb5f4e93..8883bacee 100755 --- a/examples/workflow_by_code_xgboost.py +++ b/examples/workflow_by_code_xgboost.py @@ -21,7 +21,6 @@ from qlib.utils import 
init_instance_by_config if __name__ == "__main__": - # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir if not exists_qlib_data(provider_uri): @@ -36,15 +35,14 @@ if __name__ == "__main__": MARKET = "csi300" BENCHMARK = "SH000300" - ################################### # train model ################################### DATA_HANDLER_CONFIG = { "start_time": "2008-01-01", "end_time": "2020-08-01", - "fit_start_time":"2008-01-01", - "fit_end_time":"2014-12-31", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", "instruments": MARKET, } @@ -62,43 +60,49 @@ if __name__ == "__main__": "class": "XGBModel", "module_path": "qlib.contrib.model.xgboost", "kwargs": { - "objective": 'reg:linear', - "n_estimators":5000, + "objective": "reg:linear", + "n_estimators": 5000, "colsample_bytree": 0.85, "learning_rate": 0.0421, "subsample": 0.8789, "max_depth": 8, "num_leaves": 210, "num_threads": 20, - "missing":-1, - "min_child_weight":1, - "nthread":4, - "tree_method":'hist', - } + "missing": -1, + "min_child_weight": 1, + "nthread": 4, + "tree_method": "hist", + }, }, "dataset": { "class": "DatasetH", "module_path": "qlib.data.dataset", "kwargs": { - 'handler': { + "handler": { "class": "Alpha158", "module_path": "qlib.contrib.data.handler", - "kwargs": DATA_HANDLER_CONFIG + "kwargs": DATA_HANDLER_CONFIG, }, - 'segments': { - 'train': ("2008-01-01", "2014-12-31"), - 'valid': ("2015-01-01", "2016-12-31",), - 'test': ("2017-01-01", "2020-08-01",), - } - } + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ( + "2015-01-01", + "2016-12-31", + ), + "test": ( + "2017-01-01", + "2020-08-01", + ), + }, + }, } # You shoud record the data in specific sequence # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], } # model = train_model(task) - model = init_instance_by_config(task['model']) - dataset = init_instance_by_config(task['dataset']) + model = init_instance_by_config(task["model"]) + dataset = 
init_instance_by_config(task["dataset"]) model.fit(dataset) pred_score = model.predict(dataset) diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index 61f8652be..e8545c367 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -8,15 +8,9 @@ from ...data.dataset import processor as processor_module from ...log import TimeInspector import copy + class ALPHA360(DataHandlerLP): - def __init__( - self, - instruments="csi500", - start_time=None, - end_time=None, - fit_start_time=None, - fit_end_time=None - ): + def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None): data_loader = { "class": "QlibDataLoader", "kwargs": { @@ -28,22 +22,22 @@ class ALPHA360(DataHandlerLP): } learn_processors = [ - {"class": "DropnaLabel", "kwargs": {'group': 'label'}}, - {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, + {"class": "DropnaLabel", "kwargs": {"group": "label"}}, + {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, ] infer_processors = [ - {"class": "ProcessInf", "kwargs": {}}, - {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}}, - {"class": "Fillna", "kwargs": {}}, + {"class": "ProcessInf", "kwargs": {}}, + {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}}, + {"class": "Fillna", "kwargs": {}}, ] super().__init__( - instruments, - start_time, - end_time, - data_loader=data_loader, - learn_processors=learn_processors, - infer_processors=infer_processors + instruments, + start_time, + end_time, + data_loader=data_loader, + learn_processors=learn_processors, + infer_processors=infer_processors, ) def get_label_config(self): @@ -54,19 +48,19 @@ class ALPHA360(DataHandlerLP): fields = [] names = [] - for i in range(59,0,-1): - fields += ["Ref($close, %d)/$close"%(i)] - names += ["CLOSE%d"%(i)] - fields += ["Ref($open, %d)/$close"%(i)] - names += 
["OPEN%d"%(i)] - fields += ["Ref($high, %d)/$close"%(i)] - names += ["HIGH%d"%(i)] - fields += ["Ref($low, %d)/$close"%(i)] - names += ["LOW%d"%(i)] - fields += ["Ref($vwap, %d)/$close"%(i)] - names += ["VWAP%d"%(i)] - fields += ["Ref($volume, %d)/$volume"%(i)] - names += ["VOLUME%d"%(i)] + for i in range(59, 0, -1): + fields += ["Ref($close, %d)/$close" % (i)] + names += ["CLOSE%d" % (i)] + fields += ["Ref($open, %d)/$close" % (i)] + names += ["OPEN%d" % (i)] + fields += ["Ref($high, %d)/$close" % (i)] + names += ["HIGH%d" % (i)] + fields += ["Ref($low, %d)/$close" % (i)] + names += ["LOW%d" % (i)] + fields += ["Ref($vwap, %d)/$close" % (i)] + names += ["VWAP%d" % (i)] + fields += ["Ref($volume, %d)/$volume" % (i)] + names += ["VOLUME%d" % (i)] fields += ["$close/$close"] fields += ["$open/$close"] diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 9e18b09c1..7b999d0a1 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -22,6 +22,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP + class GRU(Model): """GRU Model @@ -127,7 +128,9 @@ class GRU(Model): raise NotImplementedError("loss {} is not supported!".format(loss)) self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - self.gru_model = GRUModel(d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout) + self.gru_model = GRUModel( + d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout + ) if optimizer.lower() == "adam": self.train_optimizer = optim.Adam(self.gru_model.parameters(), lr=self.lr) elif optimizer.lower() == "gd": @@ -262,7 +265,7 @@ class GRU(Model): def get_loss(self, pred, target, loss_type): if loss_type == "mse": - sqr_loss = (pred - target)**2 + sqr_loss = (pred - target) ** 2 loss = sqr_loss.mean() return loss elif loss_type == "binary": @@ -307,6 
+310,7 @@ class GRU(Model): self.gru_model.load_state_dict(torch.load(_model_path)) self._fitted = True + class AverageMeter(object): """Computes and stores the average and current value""" @@ -327,7 +331,6 @@ class AverageMeter(object): class GRUModel(nn.Module): - def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): super().__init__() @@ -344,8 +347,7 @@ class GRUModel(nn.Module): def forward(self, x): # x: [N, F*T] - x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] - x = x.permute(0, 2, 1) # [N, T, F] + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] out, _ = self.rnn(x) return self.fc_out(out[:, -1, :]).squeeze() - diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index 95954198e..f1208eb93 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -41,14 +41,14 @@ class XGBModel(Model): y_train_1d, y_valid_1d = np.squeeze(y_train.values), np.squeeze(y_valid.values) else: raise ValueError("XGBoost doesn't support multi-label training") - + dtrain = xgb.DMatrix(x_train.values, label=y_train_1d) dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d) self.model = xgb.train( self._params, dtrain=dtrain, num_boost_round=num_boost_round, - evals=[(dtrain, 'train'), (dvalid, 'valid')], + evals=[(dtrain, "train"), (dvalid, "valid")], early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval, evals_result=evals_result, diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index 13d3465c7..f6c097d22 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -16,7 +16,7 @@ from ...data import D from ...config import C from ...utils import parse_config, transform_end_date, init_instance_by_config from ...utils.serial import Serializable -from .utils import get_level_index +from .utils import get_level_index, fetch_df_by_index from pathlib import Path from .loader import DataLoader @@ -99,25 +99,6 @@ class 
DataHandler(Serializable): self._data = self.data_loader.load(self.instruments, self.start_time, self.end_time) # TODO: cache - def _fetch_df_by_index( - self, df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int] - ) -> pd.DataFrame: - """ - fetch data from `data` with `selector` and `level` - - Parameters - ---------- - selector : Union[pd.Timestamp, slice, str, list] - selector - level : Union[int, str] - the level to use the selector - """ - # Try to get the right index - idx_slc = (selector, slice(None, None)) - if get_level_index(df, level) == 1: - idx_slc = idx_slc[1], idx_slc[0] - return df.loc(axis=0)[idx_slc] - CS_ALL = "__all" def _fetch_df_by_col(self, df: pd.DataFrame, col_set: str) -> pd.DataFrame: @@ -156,7 +137,7 @@ class DataHandler(Serializable): ------- pd.DataFrame: """ - df = self._fetch_df_by_index(self._data, selector, level) + df = fetch_df_by_index(self._data, selector, level) df = self._fetch_df_by_col(df, col_set) if squeeze: # squeeze columns @@ -414,7 +395,7 @@ class DataHandlerLP(DataHandler): pd.DataFrame: """ df = self._get_df_by_key(data_key) - df = self._fetch_df_by_index(df, selector, level) + df = fetch_df_by_index(df, selector, level) return self._fetch_df_by_col(df, col_set) def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list: diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index 1f6754312..a9e404b7a 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -7,6 +7,7 @@ import pandas as pd import copy from ...log import TimeInspector +from .utils import fetch_df_by_index from ...utils.serial import Serializable from ...utils.paral import datetime_groupby_apply @@ -106,6 +107,7 @@ class ProcessInf(Processor): return replace_inf(df) + class Fillna(Processor): """Process infinity """ @@ -123,14 +125,15 @@ class Fillna(Processor): return fill_na(df) + class MinMaxNorm(Processor): def __init__(self, fit_start_time, 
fit_end_time, fields_group=None): - # FIXME: time is not used self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time self.fields_group = fields_group def fit(self, df): + df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime") cols = get_group_columns(df, self.fields_group) self.min_val = np.nanmin(df[cols].values, axis=0) self.max_val = np.nanmax(df[cols].values, axis=0) @@ -152,15 +155,15 @@ class MinMaxNorm(Processor): class ZscoreNorm(Processor): def __init__(self, fit_start_time, fit_end_time, fields_group=None): - # FIXME: time is not used self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time self.fields_group = fields_group def fit(self, df): + df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime") cols = get_group_columns(df, self.fields_group) self.mean_train = np.nanmean(df[cols].values, axis=0) - self.std_train = np.nanstd(df[cols].values, axis=0) + self.std_train = np.nanstd(df[cols].values, axis=0) self.ignore = self.std_train == 0 self.cols = cols diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index af0900867..6eb00ffee 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -29,3 +29,27 @@ def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int: return level else: raise NotImplementedError(f"This type of input is not supported") + + +def fetch_df_by_index( + df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int] +) -> pd.DataFrame: + """ + fetch data from `data` with `selector` and `level` + + Parameters + ---------- + selector : Union[pd.Timestamp, slice, str, list] + selector + level : Union[int, str] + the level to use the selector + + Returns + ------- + Data of the given index. 
+ """ + # Try to get the right index + idx_slc = (selector, slice(None, None)) + if get_level_index(df, level) == 1: + idx_slc = idx_slc[1], idx_slc[0] + return df.loc(axis=0)[idx_slc]