diff --git a/examples/benchmarks/ALSTM/README.md b/examples/benchmarks/ALSTM/README.md new file mode 100644 index 000000000..cd9dd3493 --- /dev/null +++ b/examples/benchmarks/ALSTM/README.md @@ -0,0 +1,10 @@ +# ALSTM + +- ALSTM contains a temporal attentive aggregation layer based on normal LSTM. + +- The code used in Qlib is a pyTorch implementation of Code: https://github.com/fulifeng/Adv-ALSTM + +- Paper: A dual-stage attention-based recurrent neural network for time series prediction. + + https://www.ijcai.org/Proceedings/2017/0366.pdf + diff --git a/examples/benchmarks/TFT/README.md b/examples/benchmarks/TFT/README.md index a64ca0129..5a6a9f153 100644 --- a/examples/benchmarks/TFT/README.md +++ b/examples/benchmarks/TFT/README.md @@ -5,8 +5,10 @@ **GitHub**: https://github.com/google-research/google-research/tree/master/tft ## Run the Workflow -Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. Please be **aware** that this script can only support Python 3.5 - 3.8. +Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. ### Notes -1. The model must run in GPU, or an error will be raised. -2. New datasets should be registered in ``data_formatters``, for detail please visit the source. +1. Please be **aware** that this script can only support `Python 3.5 - 3.8`. +2. If the CUDA version on your machine is not 10.0, please remember to run the following commands `conda install anaconda cudatoolkit=10.0` and `conda install cudnn` on your machine. +3. The model must run in GPU, or an error will be raised. +4. New datasets should be registered in ``data_formatters``, for detail please visit the source. diff --git a/examples/run_all_model.py b/examples/run_all_model.py index b448a1857..2f6c4299e 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -10,6 +10,7 @@ import shutil import tempfile import statistics from pathlib import Path +from operator import xor from subprocess import Popen, PIPE from threading import Thread from pprint import pprint @@ -174,11 +175,22 @@ def cal_mean_std(results) -> dict: # function to get all the folders benchmark folder -def get_all_folders() -> dict: +def get_all_folders(models, exclude) -> dict: folders = dict() + if isinstance(models, str): + model_list = models.split(",") + models = [m.lower().strip("[ ]") for m in model_list] + elif isinstance(models, list): + models = [m.lower() for m in models] + elif models is None: + models = [f.name.lower() for f in os.scandir("benchmarks")] + else: + raise ValueError("Input models type is not supported. Please provide str or list without space.") for f in os.scandir("benchmarks"): - path = Path("benchmarks") / f.name - folders[f.name] = str(path.resolve()) + add = xor(bool(f.name.lower() in models), bool(exclude)) + if add: + path = Path("benchmarks") / f.name + folders[f.name] = str(path.resolve()) return folders @@ -225,13 +237,44 @@ def gen_and_save_md_table(metrics): # function to run the all the models -def run(times=1): +def run(times=1, models=None, exclude=False): """ Please be aware that this function can only work under Linux. MacOS and Windows will be supported in the future. Any PR to enhance this method is highly welcomed. + + Parameters: + ----------- + times : int + determines how many times the model should be running. + models : str or list + determines the specific model or list of models to run or exclude. + exclude : boolean + determines whether the model being used is excluded or included. + + Usage: + ------- + Here are some use cases of the function in the bash: + + .. code-block:: bash + + # Case 1 - run all models multiple times + python run_all_model.py 3 + + # Case 2 - run specific models multiple times + python run_all_model.py 3 dnn + + # Case 3 - run other models except those are given as arguments for multiple times + python run_all_model.py 3 [dnn,tft,lstm] True + + # Case 4 - run specific models for one time + python run_all_model.py --models=[dnn,lightgbm] + + # Case 5 - run other models except those are given as aruments for one time + python run_all_model.py --models=[dnn,tft,sfm] --exclude=True + """ # get all folders - folders = get_all_folders() + folders = get_all_folders(models, exclude) # set up compatible = True if sys.version_info < (3, 3): diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py index 3bb4edf08..b5bad31ec 100644 --- a/examples/workflow_by_code_gats.py +++ b/examples/workflow_by_code_gats.py @@ -7,19 +7,16 @@ from pathlib import Path import qlib import pandas as pd from qlib.config import REG_CN -from qlib.contrib.model.pytorch_gats import GAT -from qlib.contrib.data.handler import ALPHA360_Denoise + from qlib.contrib.strategy.strategy import TopkDropoutStrategy from qlib.contrib.evaluate import ( backtest as normal_backtest, risk_analysis, ) from qlib.utils import exists_qlib_data - -# from qlib.model.learner import train_model from qlib.utils import init_instance_by_config -import pickle + if __name__ == "__main__": diff --git a/examples/workflow_by_code_sfm.py b/examples/workflow_by_code_sfm.py index ccc2d412c..e9a72883a 100644 --- a/examples/workflow_by_code_sfm.py +++ b/examples/workflow_by_code_sfm.py @@ -71,21 +71,22 @@ if __name__ == "__main__": "module_path": "qlib.contrib.model.pytorch_sfm", "kwargs": { "d_feat": 6, - "hidden_size": 32, - "output_dim": 16, + "hidden_size": 64, + "output_dim": 32, "freq_dim": 25, "dropout_W": 0.5, "dropout_U": 0.5, - "n_epochs": 200, + "n_epochs": 15, "lr": 1e-3, - "batch_size": 200, + "metric": "", + "batch_size": 1600, "early_stop": 20, "eval_steps": 5, "loss": "mse", "lr_decay": 0.96, "lr_decay_steps": 100, "optimizer": "adam", - "GPU": 1, + "GPU": 3, "seed": 710, }, }, diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index 8cce92907..07ef2267a 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -10,6 +10,28 @@ from inspect import getfullargspec import copy +def check_transform_proc(proc_l, fit_start_time, fit_end_time): + new_l = [] + for p in proc_l: + if not isinstance(p, Processor): + klass, pkwargs = get_cls_kwargs(p, processor_module) + args = getfullargspec(klass).args + if "fit_start_time" in args and "fit_end_time" in args: + assert ( + fit_start_time is not None and fit_end_time is not None + ), "Make sure `fit_start_time` and `fit_end_time` are not None." + pkwargs.update( + { + "fit_start_time": fit_start_time, + "fit_end_time": fit_end_time, + } + ) + new_l.append({"class": klass.__name__, "kwargs": pkwargs}) + else: + new_l.append(p) + return new_l + + class ALPHA360_Denoise(DataHandlerLP): def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None): data_loader = { @@ -83,8 +105,31 @@ class ALPHA360_Denoise(DataHandlerLP): return fields, names +_DEFAULT_LEARN_PROCESSORS = [ + {"class": "DropnaLabel"}, + {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, +] +_DEFAULT_INFER_PROCESSORS = [ + {"class": "ProcessInf", "kwargs": {}}, + {"class": "ZScoreNorm", "kwargs": {}}, + {"class": "Fillna", "kwargs": {}}, +] + + class ALPHA360(DataHandlerLP): - def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None): + def __init__( + self, + instruments="csi500", + start_time=None, + end_time=None, + infer_processors=_DEFAULT_INFER_PROCESSORS, + learn_processors=_DEFAULT_LEARN_PROCESSORS, + fit_start_time=None, + fit_end_time=None, + ): + infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) + learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) + data_loader = { "class": "QlibDataLoader", "kwargs": { @@ -95,16 +140,6 @@ class ALPHA360(DataHandlerLP): }, } - learn_processors = [ - {"class": "DropnaLabel", "kwargs": {"fields_group": "label"}}, - {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, - ] - infer_processors = [ - {"class": "ProcessInf", "kwargs": {}}, - {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}}, - {"class": "Fillna", "kwargs": {}}, - ] - super().__init__( instruments, start_time, @@ -168,33 +203,12 @@ class Alpha158(DataHandlerLP): start_time=None, end_time=None, infer_processors=[], - learn_processors=["DropnaLabel", {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}], + learn_processors=_DEFAULT_LEARN_PROCESSORS, fit_start_time=None, fit_end_time=None, ): - def check_transform_proc(proc_l): - new_l = [] - for p in proc_l: - if not isinstance(p, Processor): - klass, pkwargs = get_cls_kwargs(p, processor_module) - args = getfullargspec(klass).args - if "fit_start_time" in args and "fit_end_time" in args: - assert ( - fit_start_time is not None and fit_end_time is not None - ), "Make sure `fit_start_time` and `fit_end_time` are not None." - pkwargs.update( - { - "fit_start_time": fit_start_time, - "fit_end_time": fit_end_time, - } - ) - new_l.append({"class": klass.__name__, "kwargs": pkwargs}) - else: - new_l.append(p) - return new_l - - infer_processors = check_transform_proc(infer_processors) - learn_processors = check_transform_proc(learn_processors) + infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) + learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) data_loader = { "class": "QlibDataLoader", diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index bba006c35..eb97fc75b 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -34,14 +34,14 @@ class CatBoostModel(Model): def fit( self, dataset: DatasetH, - num_boost_round=1000, - early_stopping_rounds=50, - verbose_eval=20, - evals_result=dict(), + num_boost_round = 1000, + early_stopping_rounds = 50, + verbose_eval = 20, + evals_result = dict(), **kwargs ): df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ["train", "valid"], col_set = ["feature", "label"], data_key = DataHandlerLP.DK_L ) x_train, y_train = df_train["feature"], df_train["label"] x_valid, y_valid = df_valid["feature"], df_valid["label"] @@ -52,8 +52,8 @@ class CatBoostModel(Model): else: raise ValueError("CatBoost doesn't support multi-label training") - train_pool = Pool(data=x_train, label=y_train_1d) - valid_pool = Pool(data=x_valid, label=y_valid_1d) + train_pool = Pool(data = x_train, label = y_train_1d) + valid_pool = Pool(data = x_valid, label = y_valid_1d) # Initialize the catboost model self._params["iterations"] = num_boost_round @@ -63,7 +63,7 @@ class CatBoostModel(Model): self.model = CatBoost(self._params, **kwargs) # train the model - self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs) + self.model.fit(train_pool, eval_set = valid_pool, use_best_model = True, **kwargs) evals_result = self.model.get_evals_result() evals_result["train"] = list(evals_result["learn"].values())[0] @@ -72,8 +72,8 @@ class CatBoostModel(Model): def predict(self, dataset): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") - return pd.Series(self.model.predict(x_test.values), index=x_test.index) + x_test = dataset.prepare("test", col_set = "feature") + return pd.Series(self.model.predict(x_test.values), index = x_test.index) if __name__ == "__main__": diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 07af4eda4..7cdfb571a 100755 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -28,14 +28,12 @@ class GAT(Model): Parameters ---------- - input_dim : int - input dimension - output_dim : int - output dimension - layers : tuple - layer sizes lr : float learning rate + d_feat : int + input dimensions for each time step + metric : str + the evaluate metric used in early stop optimizer : str optimizer name GPU : str @@ -119,11 +117,7 @@ class GAT(Model): seed, ) ) - - if loss not in {"mse", "binary"}: - raise NotImplementedError("loss {} is not supported!".format(loss)) - self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - + self.GAT_model = GATModel( d_feat=self.d_feat, hidden_size=self.hidden_size, @@ -213,7 +207,6 @@ class GAT(Model): losses = [] indices = np.arange(len(x_values)) - np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: @@ -377,7 +370,6 @@ class GATModel(nn.Module): self.fc_out = nn.Linear(hidden_size, 1) self.leaky_relu = nn.LeakyReLU() self.softmax = nn.Softmax(dim=1) - self.d_feat = d_feat def cal_convariance(self, x, y): # the 2nd dimension of x and y are the same @@ -396,12 +388,7 @@ class GATModel(nn.Module): out, _ = self.rnn(x) hidden = out[:, -1, :] hidden = self.bn1(hidden) - gamma = self.cal_convariance(hidden, hidden) - # gamma = hidden.mm(torch.t(hidden)) - # gamma = self.leaky_relu(gamma) - # gamma = self.softmax(gamma) - # gamma = gamma * (torch.ones(x.shape[0], x.shape[0]).to(device) - torch.diag(torch.ones(x.shape[0])).to(device)) output = gamma.mm(hidden) output = self.fc(output) output = self.bn2(output) diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 4cc7f9852..2dd8464e2 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -28,14 +28,10 @@ class GRU(Model): Parameters ---------- - input_dim : int - input dimension - output_dim : int - output dimension - layers : tuple - layer sizes - lr : float - learning rate + d_feat : int + input dimension for each time step + metric: str + the evaluate metric used in early stop optimizer : str optimizer name GPU : str @@ -112,10 +108,6 @@ class GRU(Model): ) ) - if loss not in {"mse", "binary"}: - raise NotImplementedError("loss {} is not supported!".format(loss)) - self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - self.gru_model = GRUModel( d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout ) @@ -251,7 +243,6 @@ class GRU(Model): # train self.logger.info("training...") self._fitted = True - # return for step in range(self.n_epochs): self.logger.info("Epoch%d:", step) diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 8b8454380..be43d3698 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -28,14 +28,10 @@ class LSTM(Model): Parameters ---------- - input_dim : int - input dimension - output_dim : int - output dimension - layers : tuple - layer sizes - lr : float - learning rate + d_feat : int + input dimension for each time step + metric: str + the evaluate metric used in early stop optimizer : str optimizer name GPU : str @@ -112,10 +108,6 @@ class LSTM(Model): ) ) - if loss not in {"mse", "binary"}: - raise NotImplementedError("loss {} is not supported!".format(loss)) - self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - self.lstm_model = LSTMModel( d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout ) @@ -251,7 +243,6 @@ class LSTM(Model): # train self.logger.info("training...") self._fitted = True - # return for step in range(self.n_epochs): self.logger.info("Epoch%d:", step) diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index d8baa9cb2..4ec61430e 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -31,7 +31,6 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP - class SFM_Model(nn.Module): def __init__(self, d_feat=6, output_dim=1, freq_dim=10, hidden_size=64, dropout_W=0.0, dropout_U=0.0, device="cpu"): super().__init__() @@ -76,13 +75,13 @@ class SFM_Model(nn.Module): self.states = [] def forward(self, input): - input = input.reshape(len(input), self.input_dim, -1) # [N, F, T] - input = input.permute(0, 2, 1) # [N, T, F] + input = input.reshape(len(input), self.input_dim, -1) # [N, F, T] + input = input.permute(0, 2, 1) # [N, T, F] time_step = input.shape[1] - + for ts in range(time_step): - x = input[:, ts, :] - if len(self.states) == 0: # hasn't initialized yet + x = input[:, ts,:] + if len(self.states)==0: #hasn't initialized yet self.init_states(x) self.get_constants(x) p_tm1 = self.states[0] @@ -99,65 +98,64 @@ class SFM_Model(nn.Module): x_fre = torch.matmul(x * B_W[0], self.W_fre) + self.b_fre x_c = torch.matmul(x * B_W[0], self.W_c) + self.b_c x_o = torch.matmul(x * B_W[0], self.W_o) + self.b_o - - i = self.inner_activation( - x_i + torch.matmul(h_tm1 * B_U[0], self.U_i) - ) # not sure whether I am doing in the right unsquuze + + i = self.inner_activation(x_i + torch.matmul(h_tm1 * B_U[0], self.U_i)) # not sure whether I am doing in the right unsquuze + ste = self.inner_activation(x_ste + torch.matmul(h_tm1 * B_U[0], self.U_ste)) fre = self.inner_activation(x_fre + torch.matmul(h_tm1 * B_U[0], self.U_fre)) ste = torch.reshape(ste, (-1, self.hidden_dim, 1)) fre = torch.reshape(fre, (-1, 1, self.freq_dim)) - + f = ste * fre - + c = i * self.activation(x_c + torch.matmul(h_tm1 * B_U[0], self.U_c)) time = time_tm1 + 1 omega = torch.tensor(2 * np.pi) * time * frequency - re = torch.cos(omega) + re = torch.cos(omega) im = torch.sin(omega) - + c = torch.reshape(c, (-1, self.hidden_dim, 1)) S_re = f * S_re_tm1 + c * re S_im = f * S_im_tm1 + c * im - + A = torch.square(S_re) + torch.square(S_im) A = torch.reshape(A, (-1, self.freq_dim)).float() A_a = torch.matmul(A * B_U[0], self.U_a) A_a = torch.reshape(A_a, (-1, self.hidden_dim)) a = self.activation(A_a + self.b_a) - + o = self.inner_activation(x_o + torch.matmul(h_tm1 * B_U[0], self.U_o)) h = o * a p = torch.matmul(h, self.W_p) + self.b_p self.states = [p, h, S_re, S_im, time, None, None, None] - self.states = [] + self.states = [] return self.fc_out(p).squeeze() def init_states(self, x): reducer_f = torch.zeros((self.hidden_dim, self.freq_dim)).to(self.device) reducer_p = torch.zeros((self.hidden_dim, self.output_dim)).to(self.device) - + init_state_h = torch.zeros(self.hidden_dim).to(self.device) init_state_p = torch.matmul(init_state_h, reducer_p) - + init_state = torch.zeros_like(init_state_h).to(self.device) init_freq = torch.matmul(init_state_h, reducer_f) init_state = torch.reshape(init_state, (-1, self.hidden_dim, 1)) init_freq = torch.reshape(init_freq, (-1, 1, self.freq_dim)) - + init_state_S_re = init_state * init_freq init_state_S_im = init_state * init_freq - + init_state_time = torch.tensor(0).to(self.device) self.states = [init_state_p, init_state_h, init_state_S_re, init_state_S_im, init_state_time, None, None, None] @@ -203,6 +201,7 @@ class SFM(Model): dropout_U=0.0, n_epochs=200, lr=0.001, + metric = "", batch_size=2000, early_stop=20, eval_steps=5, @@ -227,14 +226,15 @@ class SFM(Model): self.dropout_U = dropout_U self.n_epochs = n_epochs self.lr = lr + self.metric = metric self.batch_size = batch_size self.early_stop = early_stop self.eval_steps = eval_steps self.lr_decay = lr_decay self.lr_decay_steps = lr_decay_steps self.optimizer = optimizer.lower() - self.loss_type = loss - self.device = "cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu" + self.loss = loss + self.device = "cuda:%d"%(GPU) if torch.cuda.is_available() else "cpu" self.use_gpu = torch.cuda.is_available() self.seed = seed @@ -243,11 +243,12 @@ class SFM(Model): "\nd_feat : {}" "\nhidden_size : {}" "\noutput_size : {}" - "\nfrequency_dimension : {}" + "\nfrequency_dimension : {}" "\ndropout_W: {}" "\ndropout_U: {}" "\nn_epochs : {}" "\nlr : {}" + "\nmetric : {}" "\nbatch_size : {}" "\nearly_stop : {}" "\neval_steps : {}" @@ -266,6 +267,7 @@ class SFM(Model): dropout_U, n_epochs, lr, + metric, batch_size, early_stop, eval_steps, @@ -284,14 +286,14 @@ class SFM(Model): self._scorer = mean_squared_error if loss == "mse" else roc_auc_score self.sfm_model = SFM_Model( - d_feat=self.d_feat, + d_feat=self.d_feat, output_dim=self.output_dim, - hidden_size=self.hidden_size, - freq_dim=self.freq_dim, - dropout_W=self.dropout_W, - dropout_U=self.dropout_U, - device=self.device, - ) + hidden_size=self.hidden_size, + freq_dim=self.freq_dim, + dropout_W=self.dropout_W, + dropout_U=self.dropout_U, + device=self.device + ) if optimizer.lower() == "adam": self.train_optimizer = optim.Adam(self.sfm_model.parameters(), lr=self.lr) elif optimizer.lower() == "gd": @@ -299,24 +301,73 @@ class SFM(Model): else: raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - # Reduce learning rate when loss has stopped decrease - self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( - self.train_optimizer, - mode="min", - factor=0.5, - patience=10, - verbose=True, - threshold=0.0001, - threshold_mode="rel", - cooldown=0, - min_lr=0.00001, - eps=1e-08, - ) - self._fitted = False self.sfm_model.to(self.device) - def fit(self, dataset: DatasetH, evals_result=dict(), verbose=True, save_path=None, **kwargs): + def test_epoch(self, data_x, data_y): + + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.sfm_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.sfm_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def train_epoch(self, x_train, y_train): + + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) * 100 + + self.sfm_model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.sfm_model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.sfm_model.parameters(), 3.0) + self.train_optimizer.step() + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + verbose=True, + save_path=None, + ): df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L @@ -324,10 +375,10 @@ class SFM(Model): x_train, y_train = df_train["feature"], df_train["label"] x_valid, y_valid = df_valid["feature"], df_valid["label"] - save_path = create_save_path(save_path) stop_steps = 0 train_loss = 0 - best_loss = np.inf + best_score = -np.inf + best_epoch = 0 evals_result["train"] = [] evals_result["valid"] = [] @@ -335,90 +386,56 @@ class SFM(Model): self.logger.info("training...") self._fitted = True - # prepare training data - x_train_values = torch.from_numpy(x_train.values).float() - y_train_values = torch.from_numpy(np.squeeze(y_train.values)).float() - train_num = y_train_values.shape[0] - - # prepare validation data - x_val_auto = torch.from_numpy(x_valid.values).float() - y_val_auto = torch.from_numpy(np.squeeze(y_valid.values)).float() - - x_val_auto = x_val_auto.to(self.device) - y_val_auto = y_val_auto.to(self.device) - for step in range(self.n_epochs): - if stop_steps >= self.early_stop: - if verbose: - self.logger.info("\tearly stop") - break - loss = AverageMeter() - self.sfm_model.train() - self.train_optimizer.zero_grad() + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) - choice = np.random.choice(train_num, self.batch_size) - x_batch_auto = x_train_values[choice] - y_batch_auto = y_train_values[choice] - - x_batch_auto = x_batch_auto.to(self.device) - y_batch_auto = y_batch_auto.to(self.device) - - # forward - preds = self.sfm_model(x_batch_auto) - cur_loss = self.get_loss(preds, y_batch_auto, self.loss_type) - cur_loss.backward() - self.train_optimizer.step() - loss.update(cur_loss.item()) - - # validation - train_loss += loss.val - if step and step % self.eval_steps == 0: + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.sfm_model.state_dict()) + else: stop_steps += 1 - train_loss /= self.eval_steps - - with torch.no_grad(): - self.sfm_model.eval() - loss_val = AverageMeter() - - # forward - preds = self.sfm_model(x_val_auto) - cur_loss_val = self.get_loss(preds, y_val_auto, self.loss_type) - loss_val.update(cur_loss_val.item()) - - if verbose: - self.logger.info( - "[Epoch {}]: train_loss {:.6f}, valid_loss {:.6f}".format(step, train_loss, loss_val.val) - ) - evals_result["train"].append(train_loss) - evals_result["valid"].append(loss_val.val) - if loss_val.val < best_loss: - if verbose: - self.logger.info( - "\tvalid loss update from {:.6f} to {:.6f}, save checkpoint.".format( - best_loss, loss_val.val - ) - ) - best_loss = loss_val.val - stop_steps = 0 - torch.save(self.sfm_model.state_dict(), save_path) - train_loss = 0 - # update learning rate - self.scheduler.step(cur_loss_val) - + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) if self.device != "cpu": torch.cuda.empty_cache() - def get_loss(self, pred, target, loss_type): - if loss_type == "mse": - sqr_loss = (pred - target) ** 2 - loss = sqr_loss.mean() - return loss - elif loss_type == "binary": - loss = nn.BCELoss() - return loss(pred, target) - else: - raise NotImplementedError("loss {} is not supported!".format(loss_type)) + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + + mask = torch.isfinite(label) + if self.metric == "IC": + return self.cal_ic(pred[mask], label[mask]) + + if self.metric == "" or self.metric == "loss": # use loss + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def cal_ic(self, pred, label): + return torch.mean(pred * label) def predict(self, dataset): if not self._fitted: raise ValueError("model is not fitted yet!") @@ -430,7 +447,7 @@ class SFM(Model): sample_num = x_values.shape[0] preds = [] - for begin in range(sample_num)[:: self.batch_size]: + for begin in range(sample_num)[::self.batch_size]: if sample_num - begin < self.batch_size: end = sample_num else: @@ -440,37 +457,16 @@ class SFM(Model): if self.device != "cpu": x_batch = x_batch.to(self.device) - + with torch.no_grad(): - if self.device != "cpu": - pred = self.sfm_model(x_batch).detach().cpu().numpy() - else: - pred = self.sfm_model(x_batch).detach().cpu().numpy() + pred = self.sfm_model(x_batch).detach().cpu().numpy() + preds.append(pred) - + return pd.Series(np.concatenate(preds), index=index) - def save(self, filename, **kwargs): - with save_multiple_parts_file(filename) as model_dir: - model_path = os.path.join(model_dir, os.path.split(model_dir)[-1]) - # Save model - torch.save(self.sfm_model.state_dict(), model_path) - - def load(self, buffer, **kwargs): - with unpack_archive_with_buffer(buffer) as model_dir: - # Get model name - _model_name = os.path.splitext(list(filter(lambda x: x.startswith("model.bin"), os.listdir(model_dir)))[0])[ - 0 - ] - _model_path = os.path.join(model_dir, _model_name) - # Load model - self.sfm_model.load_state_dict(torch.load(_model_path)) - self._fitted = True - - class AverageMeter(object): """Computes and stores the average and current value""" - def __init__(self): self.reset() diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index 039fd2c80..203e71b9a 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -30,15 +30,15 @@ class XGBModel(Model): def fit( self, dataset: DatasetH, - num_boost_round=1000, - early_stopping_rounds=50, - verbose_eval=20, - evals_result=dict(), + num_boost_round = 1000, + early_stopping_rounds = 50, + verbose_eval = 20, + evals_result = dict(), **kwargs ): df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ["train", "valid"], col_set = ["feature", "label"], data_key = DataHandlerLP.DK_L ) x_train, y_train = df_train["feature"], df_train["label"] x_valid, y_valid = df_valid["feature"], df_valid["label"] @@ -49,16 +49,16 @@ class XGBModel(Model): else: raise ValueError("XGBoost doesn't support multi-label training") - dtrain = xgb.DMatrix(x_train.values, label=y_train_1d) - dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d) + dtrain = xgb.DMatrix(x_train.values, label = y_train_1d) + dvalid = xgb.DMatrix(x_valid.values, label = y_valid_1d) self.model = xgb.train( self._params, - dtrain=dtrain, - num_boost_round=num_boost_round, - evals=[(dtrain, "train"), (dvalid, "valid")], - early_stopping_rounds=early_stopping_rounds, - verbose_eval=verbose_eval, - evals_result=evals_result, + dtrain = dtrain, + num_boost_round = num_boost_round, + evals = [(dtrain, "train"), (dvalid, "valid")], + early_stopping_rounds = early_stopping_rounds, + verbose_eval = verbose_eval, + evals_result = evals_result, **kwargs ) evals_result["train"] = list(evals_result["train"].values())[0] @@ -67,5 +67,5 @@ class XGBModel(Model): def predict(self, dataset): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") - return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index) + x_test = dataset.prepare("test", col_set = "feature") + return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index = x_test.index) diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index e4003a1f5..e2d251aa7 100755 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -166,7 +166,9 @@ class MinMaxNorm(Processor): return df -class ZscoreNorm(Processor): +class ZScoreNorm(Processor): + """ZScore Normalization""" + def __init__(self, fit_start_time, fit_end_time, fields_group=None): self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time @@ -193,6 +195,42 @@ class ZscoreNorm(Processor): return df +class RobustZScoreNorm(Processor): + """Robust ZScore Normalization + + Use robust statistics for Z-Score normalization: + mean(x) = median(x) + std(x) = MAD(x) * 1.4826 + + Reference: + https://en.wikipedia.org/wiki/Median_absolute_deviation. + """ + + def __init__(self, fit_start_time, fit_end_time, fields_group=None, clip_outlier=True): + self.fit_start_time = fit_start_time + self.fit_end_time = fit_end_time + self.fields_group = fields_group + self.clip_outlier = clip_outlier + + def fit(self, df): + df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime") + self.cols = get_group_columns(df, self.fields_group) + X = df[self.cols].values + self.mean_train = np.nanmedian(X, axis=0) + self.std_train = np.nanmedian(np.abs(X - self.mean_train), axis=0) + self.std_train += EPS + self.std_train *= 1.4826 + + def __call__(self, df): + X = df[self.cols] + X -= self.mean_train + X /= self.std_train + df[self.cols] = X + if self.clip_outlier: + df.clip(-3, 3, inplace=True) + return df + + class CSZScoreNorm(Processor): """Cross Sectional ZScore Normalization""" diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index 2e087877b..ecec8d3d7 100644 --- a/qlib/workflow/cli.py +++ b/qlib/workflow/cli.py @@ -27,9 +27,9 @@ def sys_config(config, config_path): Parameters ---------- config : dict - configuration of the workflow + configuration of the workflow. config_path : str - configuration of the path + configuration of the path. """ sys_config = config.get("sys", {})