From 0386df7b16ce4480687a49af07a3a2fac3a0caad Mon Sep 17 00:00:00 2001 From: D-X-Y <280835372@qq.com> Date: Sun, 28 Mar 2021 10:39:28 +0000 Subject: [PATCH] Collect all contrib models in __init__ and add unit tests for init --- qlib/contrib/model/__init__.py | 39 ++++++++++++++++++++++++++ qlib/contrib/model/catboost_model.py | 5 ++-- qlib/contrib/model/double_ensemble.py | 10 +++++-- qlib/contrib/model/gbdt.py | 4 +-- qlib/contrib/model/linear.py | 4 +-- qlib/contrib/model/pytorch_alstm.py | 6 ++-- qlib/contrib/model/pytorch_alstm_ts.py | 5 ++-- qlib/contrib/model/pytorch_gats.py | 6 ++-- qlib/contrib/model/pytorch_gru.py | 5 ++-- qlib/contrib/model/pytorch_lstm.py | 9 ++---- qlib/contrib/model/pytorch_nn.py | 10 +++---- qlib/contrib/model/pytorch_sfm.py | 12 +++----- qlib/contrib/model/pytorch_tabnet.py | 7 ++--- qlib/contrib/model/xgboost.py | 6 ++-- qlib/data/dataset/processor.py | 0 qlib/model/base.py | 6 +++- tests/test_contrib_model.py | 27 ++++++++++++++++++ 17 files changed, 115 insertions(+), 46 deletions(-) mode change 100755 => 100644 qlib/data/dataset/processor.py create mode 100644 tests/test_contrib_model.py diff --git a/qlib/contrib/model/__init__.py b/qlib/contrib/model/__init__.py index e69de29bb..09b0c929b 100644 --- a/qlib/contrib/model/__init__.py +++ b/qlib/contrib/model/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +try: + from .catboost_model import CatBoostModel +except ModuleNotFoundError: + CatBoostModel = None + print("Please install necessary libs for CatBoostModel.") +try: + from .double_ensemble import DEnsembleModel + from .gbdt import LGBModel +except ModuleNotFoundError: + DEnsembleModel, LGBModel = None, None + print("Please install necessary libs for DEnsembleModel and LGBModel, such as lightgbm.") +try: + from .xgboost import XGBModel +except ModuleNotFoundError: + XGBModel = None + print("Please install necessary libs for XGBModel, such as xgboost.") +try: + from .linear import LinearModel +except ModuleNotFoundError: + LinearModel = None + print("Please install necessary libs for LinearModel, such as scipy and sklearn.") +# import pytorch models +try: + from .pytorch_alstm import ALSTM + from .pytorch_gats import GATs + from .pytorch_gru import GRU + from .pytorch_lstm import LSTM + from .pytorch_nn import DNNModelPytorch + from .pytorch_tabnet import TabnetModel + from .pytorch_sfm import SFM_Model + + pytorch_classes = (ALSTM, GATs, GRU, LSTM, DNNModelPytorch, TabnetModel, SFM_Model) +except ModuleNotFoundError: + pytorch_classes = () + print("Please install necessary libs for PyTorch models.") + +all_model_classes = (CatBoostModel, DEnsembleModel, LGBModel, XGBModel, LinearModel) + pytorch_classes diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index d57c32b70..98b9b9c2d 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from typing import Text, Union from catboost import Pool, CatBoost from catboost.utils import get_gpu_device_count @@ -62,10 +63,10 @@ class CatBoostModel(Model): evals_result["train"] = list(evals_result["learn"].values())[0] evals_result["valid"] = list(evals_result["validation"].values())[0] - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 541f74e99..4b267a2b0 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -4,7 +4,7 @@ import lightgbm as lgb import numpy as np import pandas as pd - +from typing import Text, Union from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -40,6 +40,10 @@ class DEnsembleModel(Model): self.bins_sr = bins_sr self.bins_fs = bins_fs self.decay = decay + if sample_ratios is None: # the default values for sample_ratios + sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4] + if sub_weights is None: # the default values for sub_weights + sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2] if not len(sample_ratios) == bins_fs: raise ValueError("The length of sample_ratios should be equal to bins_fs.") self.sample_ratios = sample_ratios @@ -228,10 +232,10 @@ class DEnsembleModel(Model): raise ValueError("not implemented yet") return loss_curve - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.ensemble is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) for i_sub, submodel in enumerate(self.ensemble): feat_sub = self.sub_features[i_sub] diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py index e4ac48ed6..463cf8f4f 100644 --- a/qlib/contrib/model/gbdt.py +++ b/qlib/contrib/model/gbdt.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import lightgbm as lgb - +from typing import Text, Union from ...model.base import ModelFT from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -61,7 +61,7 @@ class LGBModel(ModelFT): evals_result["train"] = list(evals_result["train"].values())[0] evals_result["valid"] = list(evals_result["valid"].values())[0] - def predict(self, dataset, segment="test"): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) diff --git a/qlib/contrib/model/linear.py b/qlib/contrib/model/linear.py index 269e788c5..f16acc1ec 100644 --- a/qlib/contrib/model/linear.py +++ b/qlib/contrib/model/linear.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd - +from typing import Text, Union from scipy.optimize import nnls from sklearn.linear_model import LinearRegression, Ridge, Lasso @@ -84,7 +84,7 @@ class LinearModel(Model): self.coef_ = coef self.intercept_ = 0.0 - def predict(self, dataset, segment="test"): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.coef_ is None: raise ValueError("model is not fitted yet!") x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index a149272da..ed706be86 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -8,9 +8,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( - unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path, drop_nan_by_y_index, @@ -273,11 +273,11 @@ class ALSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.ALSTM_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index c38727b9e..3cd7ec280 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( unpack_archive_with_buffer, @@ -264,11 +265,11 @@ class ALSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) + dl_test = dataset.prepare(segment, col_set=["feature", "label"], data_key=DataHandlerLP.DK_I) dl_test.config(fillna_type="ffill+bfill") test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs) self.ALSTM_model.eval() diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 53afb5404..71edda76e 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( unpack_archive_with_buffer, @@ -83,7 +84,6 @@ class GATs(Model): self.with_pretrain = with_pretrain self.model_path = model_path self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") - self.use_gpu = torch.cuda.is_available() self.seed = seed self.logger.info( @@ -310,11 +310,11 @@ class GATs(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature") index = x_test.index self.GAT_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 5eba33595..da2161653 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( unpack_archive_with_buffer, @@ -273,11 +274,11 @@ class GRU(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.gru_model.eval() x_values = x_test.values diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 636ef6e3a..bafd83ea6 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( unpack_archive_with_buffer, @@ -268,11 +269,11 @@ class LSTM(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.lstm_model.eval() x_values = x_test.values @@ -280,17 +281,13 @@ class LSTM(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: end = begin + self.batch_size - x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) - with torch.no_grad(): pred = self.lstm_model(x_batch).detach().cpu().numpy() - preds.append(pred) return pd.Series(np.concatenate(preds), index=index) diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py index caf34b330..4dc02cc0a 100644 --- a/qlib/contrib/model/pytorch_nn.py +++ b/qlib/contrib/model/pytorch_nn.py @@ -8,6 +8,7 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union from sklearn.metrics import roc_auc_score, mean_squared_error import torch @@ -48,8 +49,8 @@ class DNNModelPytorch(Model): def __init__( self, - input_dim, - output_dim, + input_dim=360, + output_dim=1, layers=(256,), lr=0.001, max_steps=300, @@ -271,13 +272,12 @@ class DNNModelPytorch(Model): else: raise NotImplementedError("loss {} is not supported!".format(loss_type)) - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test_pd = dataset.prepare("test", col_set="feature") + x_test_pd = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) x_test = torch.from_numpy(x_test_pd.values).float().to(self.device) self.dnn_model.eval() - with torch.no_grad(): preds = self.dnn_model(x_test).detach().cpu().numpy() return pd.Series(np.squeeze(preds), index=x_test_pd.index) diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index db3e8bb12..4eb89bdda 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -7,10 +7,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, get_or_create_path, drop_nan_by_y_index, ) @@ -442,11 +441,11 @@ class SFM(Model): raise ValueError("unknown metric `%s`" % self.metric) - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.sfm_model.eval() x_values = x_test.values @@ -459,10 +458,7 @@ class SFM(Model): else: end = begin + self.batch_size - x_batch = torch.from_numpy(x_values[begin:end]).float() - - if self.device != "cpu": - x_batch = x_batch.to(self.device) + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) with torch.no_grad(): pred = self.sfm_model(x_batch).detach().cpu().numpy() diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index 450e6f5d1..b772b60d9 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -6,10 +6,9 @@ from __future__ import print_function import os import numpy as np import pandas as pd +from typing import Text, Union import copy from ...utils import ( - unpack_archive_with_buffer, - save_multiple_parts_file, get_or_create_path, drop_nan_by_y_index, ) @@ -217,11 +216,11 @@ class TabnetModel(Model): if self.use_gpu: torch.cuda.empty_cache() - def predict(self, dataset): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if not self.fitted: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I) + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) index = x_test.index self.tabnet_model.eval() x_values = torch.from_numpy(x_test.values) diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index 6bfd2c799..cbba14678 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import xgboost as xgb - +from typing import Text, Union from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -57,8 +57,8 @@ class XGBModel(Model): evals_result["train"] = list(evals_result["train"].values())[0] evals_result["valid"] = list(evals_result["valid"].values())[0] - def predict(self, dataset, segment="test"): + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: raise ValueError("model is not fitted yet!") - x_test = dataset.prepare(segment, col_set="feature") + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index) diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py old mode 100755 new mode 100644 diff --git a/qlib/model/base.py b/qlib/model/base.py index 3708298d5..1ac8f2fc9 100644 --- a/qlib/model/base.py +++ b/qlib/model/base.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. import abc +from typing import Text, Union from ..utils.serial import Serializable from ..data.dataset import Dataset @@ -59,7 +60,7 @@ class Model(BaseModel): raise NotImplementedError() @abc.abstractmethod - def predict(self, dataset: Dataset) -> object: + def predict(self, dataset: Dataset, segment: Union[Text, slice] = "test") -> object: """give prediction given Dataset Parameters @@ -67,6 +68,9 @@ class Model(BaseModel): dataset : Dataset dataset will generate the processed dataset from model training. + segment : Text or slice + dataset will use this segment to prepare data. (default=test) + Returns ------- Prediction results with certain type such as `pandas.Series`. diff --git a/tests/test_contrib_model.py b/tests/test_contrib_model.py new file mode 100644 index 000000000..a82a3042e --- /dev/null +++ b/tests/test_contrib_model.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import unittest + +from qlib.contrib.model import all_model_classes + + +class TestAllFlow(unittest.TestCase): + def test_0_initialize(self): + num = 0 + for model_class in all_model_classes: + if model_class is not None: + model = model_class() + num += 1 + print("There are {:}/{:} valid models in total.".format(num, len(all_model_classes))) + + +def suite(): + _suite = unittest.TestSuite() + _suite.addTest(TestAllFlow("test_0_initialize")) + return _suite + + +if __name__ == "__main__": + runner = unittest.TextTestRunner() + runner.run(suite())