diff --git a/qlib/contrib/model/__init__.py b/qlib/contrib/model/__init__.py index e69de29bb..09b0c929b 100644 --- a/qlib/contrib/model/__init__.py +++ b/qlib/contrib/model/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +try: + from .catboost_model import CatBoostModel +except ModuleNotFoundError: + CatBoostModel = None + print("Please install necessary libs for CatBoostModel.") +try: + from .double_ensemble import DEnsembleModel + from .gbdt import LGBModel +except ModuleNotFoundError: + DEnsembleModel, LGBModel = None, None + print("Please install necessary libs for DEnsembleModel and LGBModel, such as lightgbm.") +try: + from .xgboost import XGBModel +except ModuleNotFoundError: + XGBModel = None + print("Please install necessary libs for XGBModel, such as xgboost.") +try: + from .linear import LinearModel +except ModuleNotFoundError: + LinearModel = None + print("Please install necessary libs for LinearModel, such as scipy and sklearn.") +# import pytorch models +try: + from .pytorch_alstm import ALSTM + from .pytorch_gats import GATs + from .pytorch_gru import GRU + from .pytorch_lstm import LSTM + from .pytorch_nn import DNNModelPytorch + from .pytorch_tabnet import TabnetModel + from .pytorch_sfm import SFM_Model + + pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model) +except ModuleNotFoundError: + pytorch_classes = () + print("Please install necessary libs for PyTorch models.") + +all_model_classes = (CatBoostModel, DEnsembleModel, LGBModel, XGBModel, LinearModel) + pytorch_classes diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index d57c32b70..98b9b9c2d 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from typing import Text, Union from catboost import Pool, CatBoost from catboost.utils import get_gpu_device_count @@ -62,10 
+63,10 @@ class CatBoostModel(Model): evals_result["train"] = list(evals_result["learn"].values())[0] evals_result["valid"] = list(evals_result["validation"].values())[0] - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 541f74e99..4b267a2b0 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -4,7 +4,7 @@ import lightgbm as lgb import numpy as np import pandas as pd - +from typing import Text, Union from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -40,6 +40,10 @@ class DEnsembleModel(Model): self.bins_sr = bins_sr self.bins_fs = bins_fs self.decay = decay + if sample_ratios is None: # the default values for sample_ratios + sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4] + if sub_weights is None: # the default values for sub_weights + sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2] if not len(sample_ratios) == bins_fs: raise ValueError("The length of sample_ratios should be equal to bins_fs.") self.sample_ratios = sample_ratios @@ -228,10 +232,10 @@ class DEnsembleModel(Model): raise ValueError("not implemented yet") return loss_curve - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.ensemble is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) pred = pd.Series(np.zeros(x_test.shape[0]), 
index=x_test.index) for i_sub, submodel in enumerate(self.ensemble): feat_sub = self.sub_features[i_sub] diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py index 058d9a0e3..463cf8f4f 100644 --- a/qlib/contrib/model/gbdt.py +++ b/qlib/contrib/model/gbdt.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import lightgbm as lgb - +from typing import Text, Union from ...model.base import ModelFT from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -61,10 +61,10 @@ class LGBModel(ModelFT): evals_result["train"] = list(evals_result["train"].values())[0] evals_result["valid"] = list(evals_result["valid"].values())[0] - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20): diff --git a/qlib/contrib/model/linear.py b/qlib/contrib/model/linear.py index 0f9223737..f16acc1ec 100644 --- a/qlib/contrib/model/linear.py +++ b/qlib/contrib/model/linear.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd - +from typing import Text, Union from scipy.optimize import nnls from sklearn.linear_model import LinearRegression, Ridge, Lasso @@ -84,8 +84,8 @@ class LinearModel(Model): self.coef_ = coef self.intercept_ = 0.0 - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.coef_ is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(x_test.values @ 
self.coef_ + self.intercept_, index=x_test.index) diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index a149272da..4fe2b2714 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -8,13 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -273,11 +269,11 @@ class ALSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.ALSTM_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index c38727b9e..f1aa8227c 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -8,13 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -264,11 +260,11 @@ class ALSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - dl_test = dataset.prepare("test", col_set=["feature", "label"], 
data_key=DataHandlerLP.DK_I) + dl_test = dataset.prepare(segment, col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) dl_test.config(fillna_type="ffill+bfill") test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) self.ALSTM_model.eval() diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 53afb5404..493bf120f 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -8,13 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch import torch.nn as nn @@ -83,7 +79,6 @@ class GATs(Model): self.with_pretrain = with_pretrain self.model_path = model_path self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") - self.use_gpu = torch.cuda.is_available() self.seed = seed self.logger.info( @@ -310,11 +305,11 @@ class GATs(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature") index = x_test.index self.GAT_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index f02bf1e47..5f9961b0b 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -9,12 +9,7 @@ import os import numpy as np import pandas as pd import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, 
-) +from ...utils import get_or_create_path from ...log import get_module_logger import torch import torch.nn as nn diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 5eba33595..552815d39 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -8,13 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -273,11 +269,11 @@ class GRU(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.gru_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index de5e280d0..c094a3e3c 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -9,12 +9,7 @@ import os import numpy as np import pandas as pd import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 636ef6e3a..0ecfc2083 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -8,13 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy 
-from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -268,11 +264,11 @@ class LSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.lstm_model.eval() x_values = x_test.values @@ -280,17 +276,13 @@ class LSTM(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: end = begin + self.batch_size - x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) - with torch.no_grad(): pred = self.lstm_model(x_batch).detach().cpu().numpy() - preds.append(pred) return pd.Series(np.concatenate(preds), index=index) diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index c978e84c7..1f97bd5b1 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -9,12 +9,7 @@ import os import numpy as np import pandas as pd import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py index caf34b330..15ee7ef71 100644 --- a/qlib/contrib/model/pytorch_nn.py +++ b/qlib/contrib/model/pytorch_nn.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union from sklearn.metrics import roc_auc_score, 
mean_squared_error import torch @@ -18,7 +19,7 @@ from .pytorch_utils import count_parameters from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP -from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path, drop_nan_by_y_index +from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path from ...log import get_module_logger from ...workflow import R @@ -48,8 +49,8 @@ class DNNModelPytorch(Model): def __init__( self, - input_dim, - output_dim, + input_dim=360, + output_dim=1, layers=(256,), lr=0.001, max_steps=300, @@ -271,13 +272,12 @@ class DNNModelPytorch(Model): else: raise NotImplementedError("loss {} is not supported!".format(loss_type)) - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test_pd = dataset.prepare("test", col_set="feature") + x_test_pd = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) x_test = torch.from_numpy(x_test_pd.values).float().to(self.device) self.dnn_model.eval() - with torch.no_grad(): preds = self.dnn_model(x_test).detach().cpu().numpy() return pd.Series(np.squeeze(preds), index=x_test_pd.index) diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index db3e8bb12..cf65c2662 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -7,13 +7,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -442,11 +438,11 @@ class SFM(Model): raise ValueError("unknown metric `%s`" % self.metric) - def 
predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.sfm_model.eval() x_values = x_test.values @@ -459,10 +455,7 @@ class SFM(Model): else: end = begin + self.batch_size - x_batch = torch.from_numpy(x_values[begin:end]).float() - - if self.device != "cpu": - x_batch = x_batch.to(self.device) + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) with torch.no_grad(): pred = self.sfm_model(x_batch).detach().cpu().numpy() diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index 450e6f5d1..b05d9a026 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -6,13 +6,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy -from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, - get_or_create_path, - drop_nan_by_y_index, -) +from ...utils import get_or_create_path from ...log import get_module_logger import torch @@ -217,11 +213,11 @@ class TabnetModel(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.tabnet_model.eval() x_values = torch.from_numpy(x_test.values) diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index ba2e5789b..cbba14678 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py 
@@ -4,7 +4,7 @@ import numpy as np import pandas as pd import xgboost as xgb - +from typing import Text, Union from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -57,8 +57,8 @@ class XGBModel(Model): evals_result["train"] = list(evals_result["train"].values())[0] evals_result["valid"] = list(evals_result["valid"].values())[0] - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index) diff --git a/qlib/contrib/workflow/__init__.py b/qlib/contrib/workflow/__init__.py index e69de29bb..9945e179c 100644 --- a/qlib/contrib/workflow/__init__.py +++ b/qlib/contrib/workflow/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from .record_temp import MultiSegRecord +from .record_temp import SignalMseRecord diff --git a/qlib/contrib/workflow/record_temp.py b/qlib/contrib/workflow/record_temp.py index 3fdf0c281..863daee85 100644 --- a/qlib/contrib/workflow/record_temp.py +++ b/qlib/contrib/workflow/record_temp.py @@ -1,18 +1,59 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import re
 import pandas as pd
 from sklearn.metrics import mean_squared_error
-from pprint import pprint
+from typing import Dict, Text, Any
 import numpy as np
 
+from ...contrib.eva.alpha import calc_ic
+from ...workflow.record_temp import RecordTemp
 from ...workflow.record_temp import SignalRecord
+from ...data import dataset as qlib_dataset
 from ...log import get_module_logger
 
 logger = get_module_logger("workflow", "INFO")
 
 
+class MultiSegRecord(RecordTemp):
+    """
+    This is the multiple segments signal record class that generates the signal prediction.
+    This class inherits the ``RecordTemp`` class.
+    """
+
+    def __init__(self, model, dataset, recorder=None):
+        super().__init__(recorder=recorder)
+        if not isinstance(dataset, qlib_dataset.DatasetH):
+            raise ValueError("The type of dataset should be DatasetH instead of {:}".format(type(dataset)))
+        self.model = model
+        self.dataset = dataset
+
+    def generate(self, segments: Dict[Text, Any], save: bool = False):
+        for key, segment in segments.items():
+            predics = self.model.predict(self.dataset, segment)
+            if isinstance(predics, pd.Series):
+                predics = predics.to_frame("score")
+            labels = self.dataset.prepare(
+                segments=segment, col_set="label", data_key=qlib_dataset.handler.DataHandlerLP.DK_R
+            )
+            # Compute the IC and Rank IC
+            ic, ric = calc_ic(predics.iloc[:, 0], labels.iloc[:, 0])
+            results = {"all-IC": ic, "mean-IC": ic.mean(), "all-Rank-IC": ric, "mean-Rank-IC": ric.mean()}
+            logger.info("--- Results for {:} ({:}) ---".format(key, segment))
+            ic_x100, ric_x100 = ic * 100, ric * 100
+            logger.info("IC: {:.4f}%".format(ic_x100.mean()))
+            logger.info("ICIR: {:.4f}%".format(ic_x100.mean() / ic_x100.std()))
+            logger.info("Rank IC: {:.4f}%".format(ric_x100.mean()))
+            logger.info("Rank ICIR: {:.4f}%".format(ric_x100.mean() / ric_x100.std()))
+
+            if save:
+                save_name = "results-{:}.pkl".format(key)
+                self.recorder.save_objects(**{save_name: results})
+                logger.info(
+                    f"The record '{save_name}' has been saved as the artifact of the Experiment {self.recorder.experiment_id}"
+                )
+
+
 class SignalMseRecord(SignalRecord):
     """
     This is the Signal MSE Record class that computes the mean squared error (MSE).
@@ -38,7 +79,7 @@ class SignalMseRecord(SignalRecord):
         objects = {"mse.pkl": mse, "rmse.pkl": np.sqrt(mse)}
         self.recorder.log_metrics(**metrics)
         self.recorder.save_objects(**objects, artifact_path=self.get_path())
-        pprint(metrics)
+        logger.info("The evaluation results in SignalMseRecord is {:}".format(metrics))
 
     def list(self):
         paths = [self.get_path("mse.pkl"), self.get_path("rmse.pkl")]
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
old mode 100755
new mode 100644
diff --git a/qlib/model/base.py b/qlib/model/base.py
index 3708298d5..1ac8f2fc9 100644
--- a/qlib/model/base.py
+++ b/qlib/model/base.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import abc
+from typing import Text, Union
 from ..utils.serial import Serializable
 from ..data.dataset import Dataset
 
@@ -59,7 +60,7 @@ class Model(BaseModel):
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def predict(self, dataset: Dataset) -> object:
+    def predict(self, dataset: Dataset, segment: Union[Text, slice] = "test") -> object:
         """give prediction given Dataset
 
         Parameters
@@ -67,6 +68,9 @@ class Model(BaseModel):
         dataset : Dataset
             dataset will generate the processed dataset from model training.
 
+        segment : Text or slice
+            dataset will use this segment to prepare data. (default=test)
+
         Returns
         -------
         Prediction results with certain type such as `pandas.Series`.
diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py index 5ed4362de..0f420cec4 100644 --- a/qlib/workflow/exp.py +++ b/qlib/workflow/exp.py @@ -159,7 +159,10 @@ class Experiment: if create: recorder, is_new = self._get_or_create_rec(recorder_id=recorder_id, recorder_name=recorder_name) else: - recorder, is_new = self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False + recorder, is_new = ( + self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), + False, + ) if is_new: self.active_recorder = recorder # start the recorder @@ -174,7 +177,10 @@ class Experiment: try: if recorder_id is None and recorder_name is None: recorder_name = self._default_rec_name - return self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), False + return ( + self._get_recorder(recorder_id=recorder_id, recorder_name=recorder_name), + False, + ) except ValueError: if recorder_name is None: recorder_name = self._default_rec_name diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index 95cad4c6e..28d6d92c7 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -159,7 +159,10 @@ class ExpManager: if create: exp, is_new = self._get_or_create_exp(experiment_id=experiment_id, experiment_name=experiment_name) else: - exp, is_new = self._get_exp(experiment_id=experiment_id, experiment_name=experiment_name), False + exp, is_new = ( + self._get_exp(experiment_id=experiment_id, experiment_name=experiment_name), + False, + ) if is_new: self.active_experiment = exp # start the recorder @@ -172,7 +175,10 @@ class ExpManager: automatically create a new experiment based on the given id and name. 
""" try: - return self._get_exp(experiment_id=experiment_id, experiment_name=experiment_name), False + return ( + self._get_exp(experiment_id=experiment_id, experiment_name=experiment_name), + False, + ) except ValueError: if experiment_name is None: experiment_name = self._default_exp_name diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 2c1b6fecc..ed8039ac8 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -39,7 +39,13 @@ class RecordTemp: return "/".join(names) def __init__(self, recorder): - self.recorder = recorder + self._recorder = recorder + + @property + def recorder(self): + if self._recorder is None: + raise ValueError("This RecordTemp did not set recorder yet.") + return self._recorder def generate(self, **kwargs): """ @@ -248,11 +254,20 @@ class PortAnaRecord(SignalRecord): report_dict = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config) report_normal = report_dict.get("report_df") positions_normal = report_dict.get("positions") - self.recorder.save_objects(**{"report_normal.pkl": report_normal}, artifact_path=PortAnaRecord.get_path()) - self.recorder.save_objects(**{"positions_normal.pkl": positions_normal}, artifact_path=PortAnaRecord.get_path()) + self.recorder.save_objects( + **{"report_normal.pkl": report_normal}, + artifact_path=PortAnaRecord.get_path(), + ) + self.recorder.save_objects( + **{"positions_normal.pkl": positions_normal}, + artifact_path=PortAnaRecord.get_path(), + ) order_normal = report_dict.get("order_list") if order_normal: - self.recorder.save_objects(**{"order_normal.pkl": order_normal}, artifact_path=PortAnaRecord.get_path()) + self.recorder.save_objects( + **{"order_normal.pkl": order_normal}, + artifact_path=PortAnaRecord.get_path(), + ) # analysis analysis = dict() diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index 29d39179d..d34c1773a 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -6,24 
+6,11 @@ import shutil import unittest from pathlib import Path -import numpy as np -import pandas as pd - import qlib -from qlib.config import REG_CN, C -from qlib.utils import drop_nan_by_y_index -from qlib.contrib.model.gbdt import LGBModel -from qlib.contrib.data.handler import Alpha158 -from qlib.contrib.strategy.strategy import TopkDropoutStrategy -from qlib.contrib.evaluate import ( - backtest as normal_backtest, - risk_analysis, -) -from qlib.contrib.workflow.record_temp import SignalMseRecord -from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict +from qlib.config import C +from qlib.utils import init_instance_by_config, flatten_dict from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord, SigAnaRecord, PortAnaRecord -from qlib.tests.data import GetData from qlib.tests import TestAutoData @@ -166,8 +153,6 @@ def train_with_sigana(): ric = sar.load(sar.get_path("ric.pkl")) pred_score = sar.load("pred.pkl") - smr = SignalMseRecord(recorder) - smr.generate() uri_path = R.get_uri() return pred_score, {"ic": ic, "ric": ric}, uri_path @@ -256,8 +241,10 @@ class TestAllFlow(TestAutoData): def suite(): _suite = unittest.TestSuite() - _suite.addTest(TestAllFlow("test_0_train")) - _suite.addTest(TestAllFlow("test_1_backtest")) + _suite.addTest(TestAllFlow("test_0_train_with_sigana")) + _suite.addTest(TestAllFlow("test_1_train")) + _suite.addTest(TestAllFlow("test_2_backtest")) + _suite.addTest(TestAllFlow("test_3_expmanager")) return _suite diff --git a/tests/test_contrib_model.py b/tests/test_contrib_model.py new file mode 100644 index 000000000..a82a3042e --- /dev/null +++ b/tests/test_contrib_model.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+import unittest
+
+from qlib.contrib.model import all_model_classes
+
+
+class TestAllFlow(unittest.TestCase):
+    def test_0_initialize(self):
+        num = 0
+        for model_class in all_model_classes:
+            if model_class is not None:
+                model = model_class()
+                num += 1
+        print("There are {:}/{:} valid models in total.".format(num, len(all_model_classes)))
+
+
+def suite():
+    _suite = unittest.TestSuite()
+    _suite.addTest(TestAllFlow("test_0_initialize"))
+    return _suite
+
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner()
+    runner.run(suite())
diff --git a/tests/test_contrib_workflow.py b/tests/test_contrib_workflow.py
new file mode 100644
index 000000000..ccd3c6a90
--- /dev/null
+++ b/tests/test_contrib_workflow.py
@@ -0,0 +1,121 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import sys
+import shutil
+import unittest
+from pathlib import Path
+
+import qlib
+from qlib.config import C
+from qlib.contrib.workflow import MultiSegRecord, SignalMseRecord
+from qlib.utils import init_instance_by_config, flatten_dict
+from qlib.workflow import R
+from qlib.tests import TestAutoData
+
+
+market = "csi300"
+benchmark = "SH000300"
+
+###################################
+# train model
+###################################
+data_handler_config = {
+    "start_time": "2008-01-01",
+    "end_time": "2020-08-01",
+    "fit_start_time": "2008-01-01",
+    "fit_end_time": "2014-12-31",
+    "instruments": market,
+}
+
+task = {
+    "model": {
+        "class": "LGBModel",
+        "module_path": "qlib.contrib.model.gbdt",
+        "kwargs": {
+            "loss": "mse",
+            "colsample_bytree": 0.8879,
+            "learning_rate": 0.0421,
+            "subsample": 0.8789,
+            "lambda_l1": 205.6999,
+            "lambda_l2": 580.9768,
+            "max_depth": 8,
+            "num_leaves": 210,
+            "num_threads": 20,
+        },
+    },
+    "dataset": {
+        "class": "DatasetH",
+        "module_path": "qlib.data.dataset",
+        "kwargs": {
+            "handler": {
+                "class": "Alpha158",
+                "module_path": "qlib.contrib.data.handler",
+                "kwargs": data_handler_config,
+            },
+            "segments": {
+                "train": ("2008-01-01", "2014-12-31"),
+                "valid": ("2015-01-01", "2016-12-31"),
+                "test": ("2017-01-01", "2020-08-01"),
+            },
+        },
+    },
+}
+
+
+def train_multiseg():
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+    with R.start(experiment_name="workflow"):
+        R.log_params(**flatten_dict(task))
+        model.fit(dataset)
+        recorder = R.get_recorder()
+        sr = MultiSegRecord(model, dataset, recorder)
+        sr.generate(dict(valid="valid", test="test"), True)
+        uri = R.get_uri()
+    return uri
+
+
+def train_mse():
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+    with R.start(experiment_name="workflow"):
+        R.log_params(**flatten_dict(task))
+        model.fit(dataset)
+        recorder = R.get_recorder()
+        sr = SignalMseRecord(recorder, model=model, dataset=dataset)
+        sr.generate()
+        uri = R.get_uri()
+    return uri
+
+
+def _uri_to_path(uri):
+    """Return the resolved local path behind a ``file:`` URI (or a plain path)."""
+    # NOTE: ``uri.strip("file:")`` would strip the character set {f, i, l, e, :} from
+    # both ends of the string, which can also eat real path characters.
+    prefix = "file:"
+    if uri.startswith(prefix):
+        uri = uri[len(prefix) :]
+    return Path(uri).resolve()
+
+
+class TestAllFlow(TestAutoData):
+    def test_0_multiseg(self):
+        uri_path = train_multiseg()
+        shutil.rmtree(str(_uri_to_path(uri_path)))
+
+    def test_1_mse(self):
+        uri_path = train_mse()
+        shutil.rmtree(str(_uri_to_path(uri_path)))
+
+
+def suite():
+    _suite = unittest.TestSuite()
+    _suite.addTest(TestAllFlow("test_0_multiseg"))
+    _suite.addTest(TestAllFlow("test_1_mse"))
+    return _suite
+
+
+if __name__ == "__main__":
+    runner = unittest.TextTestRunner()
+    runner.run(suite())