# qlib/contrib/model/catboost_model.py
import numpy as np
import pandas as pd
from catboost import Pool, CatBoost
from catboost.utils import get_gpu_device_count

from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP


class CatBoostModel(Model):
    """CatBoost Model"""

    def __init__(self, loss="RMSE", **kwargs):
        """Store the CatBoost training parameters.

        Parameters
        ----------
        loss : str
            CatBoost loss function; only "RMSE" and "Logloss" are accepted.
        **kwargs
            Extra entries merged into the parameter dict handed to CatBoost.

        Raises
        ------
        NotImplementedError
            If ``loss`` is not one of the accepted values.
        """
        # There are more options
        if loss not in {"RMSE", "Logloss"}:
            raise NotImplementedError("unsupported loss: {!r}".format(loss))
        self._params = {"loss_function": loss}
        self._params.update(kwargs)
        self.model = None

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train a CatBoost model on the "train" segment, validating on "valid".

        Parameters
        ----------
        dataset : DatasetH
            Dataset providing the "train" and "valid" segments with
            "feature" and "label" column sets.
        num_boost_round : int
            Stored as the ``iterations`` training parameter.
        early_stopping_rounds : int
            Stored as the ``early_stopping_rounds`` training parameter.
        verbose_eval : int
            Stored as the ``verbose_eval`` training parameter.
        evals_result : dict, optional
            Filled in place with the "train" and "valid" evaluation results.
            A fresh dict is created when omitted.
        **kwargs
            Forwarded to the ``CatBoost`` constructor.

        Raises
        ------
        ValueError
            If the label frame does not have exactly one column.
        """
        # A `dict()` default would be created once and shared by every call.
        if evals_result is None:
            evals_result = {}

        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        train_labels, valid_labels = y_train.values, y_valid.values
        if not all(arr.ndim == 2 and arr.shape[1] == 1 for arr in (train_labels, valid_labels)):
            raise ValueError("CatBoost doesn't support multi-label training")
        # Squeeze only the column axis so a single-row segment stays 1D
        # instead of collapsing to a 0-d array.
        y_train_1d = np.squeeze(train_labels, axis=1)
        y_valid_1d = np.squeeze(valid_labels, axis=1)

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        # Train on GPU whenever CatBoost reports at least one device.
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        self.model = CatBoost(self._params, **kwargs)

        # Train the model
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        # First entry of the evals-result mapping
        # (presumably the training-set metrics -- TODO confirm).
        evals_result["train"] = list(self.model.get_evals_result().values())[0]
        evals_result["valid"] = self.model.get_test_eval()

    def predict(self, dataset):
        """Predict on the "test" segment of ``dataset``.

        Returns
        -------
        pd.Series
            Predictions indexed like the test feature frame.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare("test", col_set="feature")
        # Pass the 2D feature matrix unchanged: squeezing it would turn a
        # one-row or one-column frame into a 1D array, which CatBoost reads
        # as a single object.
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)


if __name__ == "__main__":
    cat = CatBoostModel()