mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-03 11:00:57 +08:00
81 lines
3.0 KiB
Python
81 lines
3.0 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from catboost import Pool, CatBoost
|
|
from catboost.utils import get_gpu_device_count
|
|
|
|
from ...model.base import Model
|
|
from ...data.dataset import DatasetH
|
|
from ...data.dataset.handler import DataHandlerLP
|
|
|
|
|
|
class CatBoostModel(Model):
    """CatBoost Model

    Thin wrapper that trains a ``catboost.CatBoost`` booster on the "train"
    and "valid" segments of a dataset and predicts on its "test" segment.
    """

    def __init__(self, loss="RMSE", **kwargs):
        """Store the CatBoost parameters; no booster is built until ``fit``.

        Parameters
        ----------
        loss : str
            Value used for CatBoost's ``loss_function`` parameter. Only
            ``"RMSE"`` and ``"Logloss"`` are accepted here.
        **kwargs
            Extra entries merged into the parameter dict handed to
            ``CatBoost``. They may override ``loss_function``.

        Raises
        ------
        NotImplementedError
            If ``loss`` is not one of the accepted values.
        """
        # CatBoost itself offers more loss functions; only these two are wired up.
        if loss not in {"RMSE", "Logloss"}:
            raise NotImplementedError("unsupported loss for CatBoostModel: {!r}".format(loss))
        self._params = {"loss_function": loss}
        self._params.update(kwargs)
        self.model = None

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """Train the booster on the "train" segment, validating on "valid".

        Parameters
        ----------
        dataset : DatasetH
            Source of the "train" and "valid" segments; each must expose a
            "feature" and a single-column "label" column set.
        num_boost_round : int
            Stored as the CatBoost ``iterations`` parameter.
        early_stopping_rounds : int
            Stored as the CatBoost ``early_stopping_rounds`` parameter.
        verbose_eval : int
            Stored as the CatBoost ``verbose_eval`` parameter.
        evals_result : dict, optional
            If given, it is filled in place with the keys ``"train"`` and
            ``"valid"`` holding the first recorded metric history of the
            learn and validation sets respectively.
        **kwargs
            Forwarded to ``CatBoost.fit`` only.

        Raises
        ------
        ValueError
            If the training label is not a 2-D array with exactly one column.
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        x_train, y_train = df_train["feature"], df_train["label"]
        x_valid, y_valid = df_valid["feature"], df_valid["label"]

        # CatBoost needs 1D array as its label
        if y_train.values.ndim != 2 or y_train.values.shape[1] != 1:
            raise ValueError("CatBoost doesn't support multi-label training")
        # Squeeze only the column axis: a bare squeeze would turn a single-row
        # (1, 1) label into a 0-d scalar rather than a length-1 vector.
        y_train_1d = np.squeeze(y_train.values, axis=1)
        y_valid_1d = np.squeeze(y_valid.values, axis=1)

        train_pool = Pool(data=x_train, label=y_train_1d)
        valid_pool = Pool(data=x_valid, label=y_valid_1d)

        # Initialize the catboost model
        self._params["iterations"] = num_boost_round
        self._params["early_stopping_rounds"] = early_stopping_rounds
        self._params["verbose_eval"] = verbose_eval
        # Train on GPU whenever CatBoost reports at least one device.
        self._params["task_type"] = "GPU" if get_gpu_device_count() > 0 else "CPU"
        # The constructor takes the parameter dict only; **kwargs are fit-time
        # options and would raise TypeError if passed here as well.
        self.model = CatBoost(self._params)

        # train the model
        self.model.fit(train_pool, eval_set=valid_pool, use_best_model=True, **kwargs)

        # Write into the caller's dict instead of rebinding the local name,
        # otherwise the recorded metrics never reach the caller.
        if evals_result is not None:
            history = self.model.get_evals_result()
            evals_result["train"] = list(history["learn"].values())[0]
            evals_result["valid"] = list(history["validation"].values())[0]

    def predict(self, dataset):
        """Predict on the "test" segment of ``dataset``.

        Returns
        -------
        pd.Series
            Predictions indexed like the prepared "test" feature frame.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare("test", col_set="feature")
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry: instantiate the model with its default loss.
    cat = CatBoostModel(loss="RMSE")
|