1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-04 11:30:57 +08:00

Adding Catboost as a model

This commit is contained in:
Haoyu Wang
2020-11-06 16:18:56 +08:00
parent 65a009acb2
commit 1556be6798

View File

@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from catboost import Pool, CatBoost
from catboost.utils import get_gpu_device_count
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class CatBoostModel(Model):
    """CatBoost model wrapper.

    Parameters
    ----------
    loss : str
        CatBoost loss function. Only ``"RMSE"`` and ``"Logloss"`` are accepted.
    **kwargs
        Additional CatBoost parameters; they are merged into the parameter
        dict that is handed to ``CatBoost`` when :meth:`fit` is called.

    Raises
    ------
    NotImplementedError
        If ``loss`` is not one of the accepted loss functions.
    """

    def __init__(self, loss="RMSE", **kwargs):
        # CatBoost offers more loss functions; only these two are enabled here.
        if loss not in {"RMSE", "Logloss"}:
            raise NotImplementedError("loss {!r} is not supported".format(loss))
        self._params = {"loss_function": loss}
        self._params.update(kwargs)
        self.model = None

    @staticmethod
    def _label_to_1d(label):
        """Return the single-column ``label`` frame as a 1D array.

        Raises ``ValueError`` unless ``label.values`` is 2D with exactly one
        column.
        """
        values = label.values
        if values.ndim != 2 or values.shape[1] != 1:
            raise ValueError("CatBoost doesn't support multi-label training")
        # Column indexing keeps the result 1D even when there is a single row,
        # whereas ``np.squeeze`` would collapse a (1, 1) array to a 0-d scalar.
        return values[:, 0]

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train a CatBoost model on the "train" segment of ``dataset``.

        The "valid" segment is used as the evaluation set and the best
        iteration on it is kept (``use_best_model=True``).

        Parameters
        ----------
        dataset : DatasetH
            Dataset providing "train" and "valid" segments with "feature" and
            "label" column sets (prepared with ``DataHandlerLP.DK_L``).
        num_boost_round : int
            Stored as the CatBoost ``iterations`` parameter.
        early_stopping_rounds : int
            Stored as the CatBoost ``early_stopping_rounds`` parameter.
        verbose_eval : int
            Stored as the CatBoost ``verbose_eval`` parameter.
        evals_result : dict, optional
            If given, it is filled in place: ``"train"`` receives the first
            entry of ``get_evals_result()`` and ``"valid"`` receives
            ``get_test_eval()``. A fresh dict is used when omitted.
        **kwargs
            Forwarded to the ``CatBoost`` constructor.

        Raises
        ------
        ValueError
            If the train or valid label is not a single column.
        """
        # A ``dict()`` default would be one object shared by every call.
        if evals_result is None:
            evals_result = {}

        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs a 1D array as its label.
        y_train_1d = self._label_to_1d(y_train)
        y_valid_1d = self._label_to_1d(y_valid)

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the CatBoost model; train on GPU whenever one is detected.
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        # NOTE(review): extra keyword arguments are forwarded to the constructor
        # unchanged from the original code - confirm CatBoost accepts them there.
        self.model = CatBoost(self._params, **kwargs)

        # Train the model.
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        evals_result["train"] = list(self.model.get_evals_result().values())[0]
        evals_result["valid"] = self.model.get_test_eval()

    def predict(self, dataset):
        """Predict on the "test" segment of ``dataset``.

        Returns
        -------
        pd.Series
            Model output indexed like the prepared test features.

        Raises
        ------
        ValueError
            If :meth:`fit` has not produced a model yet.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare("test", col_set="feature")
        # Keep the feature matrix 2D: squeezing a single-row or single-column
        # frame would make it 1D, which CatBoost reads as one object.
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)
if __name__ == "__main__":
    # Manual smoke check: build the model with its default settings.
    cat = CatBoostModel()