mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-04 03:21:00 +08:00
99 lines
3.4 KiB
Python
99 lines
3.4 KiB
Python
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT License.
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import lightgbm as lgb
|
|
from typing import Text, Union
|
|
from ...model.base import ModelFT
|
|
from ...data.dataset import DatasetH
|
|
from ...data.dataset.handler import DataHandlerLP
|
|
from ...model.interpret.base import LightGBMFInt
|
|
|
|
|
|
class LGBModel(ModelFT, LightGBMFInt):
    """LightGBM Model"""

    def __init__(self, loss="mse", **kwargs):
        """
        Store the LightGBM parameters; no training happens here.

        Parameters
        ----------
        loss : str
            training objective, either "mse" or "binary"
        **kwargs
            extra LightGBM parameters; they are merged into ``self.params`` and
            override the defaults set here when a key collides

        Raises
        ------
        NotImplementedError
            if ``loss`` is not one of the supported objectives
        """
        if loss not in {"mse", "binary"}:
            raise NotImplementedError("loss `{}` is not supported, use 'mse' or 'binary'".format(loss))
        self.params = {"objective": loss, "verbosity": -1}
        self.params.update(kwargs)
        # Stays None until `fit` (or `finetune`) assigns a trained booster.
        self.model = None

    def _prepare_data(self, dataset: DatasetH):
        """
        Build the LightGBM train/valid datasets from the "train" and "valid" segments.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing the "feature" and "label" column sets

        Returns
        -------
        tuple
            ``(dtrain, dvalid)`` as ``lgb.Dataset`` objects

        Raises
        ------
        ValueError
            if either segment is empty, or if the label of either segment is
            not a 2D array with exactly one column
        """
        df_train, df_valid = dataset.prepare(
            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
        )
        if df_train.empty or df_valid.empty:
            raise ValueError("Empty data from dataset, please check your dataset config.")
        x_train, x_valid = df_train["feature"], df_valid["feature"]

        # LightGBM needs a 1D array as its label.
        labels = []
        for df in (df_train, df_valid):
            y = df["label"].values
            if y.ndim != 2 or y.shape[1] != 1:
                raise ValueError("LightGBM doesn't support multi-label training")
            # Select the single column explicitly: np.squeeze would collapse a
            # one-row label into a 0-d array instead of a length-1 vector.
            labels.append(y[:, 0])
        y_train, y_valid = labels

        dtrain = lgb.Dataset(x_train, label=y_train)
        dvalid = lgb.Dataset(x_valid, label=y_valid)
        return dtrain, dvalid

    def fit(
        self,
        dataset: DatasetH,
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=20,
        evals_result=None,
        **kwargs
    ):
        """
        Train the model on the "train" segment and evaluate on "train" and "valid".

        Parameters
        ----------
        dataset : DatasetH
            dataset for training
        num_boost_round : int
            number of boosting rounds passed to ``lgb.train``
        early_stopping_rounds : int
            early stopping setting passed to ``lgb.train``
        verbose_eval : int
            verbose level passed to ``lgb.train``
        evals_result : dict, optional
            dict that receives the evaluation history; it is filled in place.
            After training, its "train" and "valid" entries are each replaced
            by the values recorded for the first metric of that set. A new
            dict is created for every call when omitted.
        **kwargs
            forwarded to ``lgb.train``
        """
        # A shared `dict()` default would leak results between calls.
        if evals_result is None:
            evals_result = {}
        dtrain, dvalid = self._prepare_data(dataset)
        # NOTE(review): `early_stopping_rounds`, `verbose_eval` and `evals_result`
        # are `lgb.train` keywords in LightGBM < 4.0; newer releases expect
        # callbacks instead - confirm against the pinned LightGBM version.
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "valid"],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
            evals_result=evals_result,
            **kwargs
        )
        # Flatten {set_name: {metric: history}} to {set_name: history} using the first metric.
        evals_result["train"] = list(evals_result["train"].values())[0]
        evals_result["valid"] = list(evals_result["valid"].values())[0]

    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
        """
        Predict on one segment of the dataset.

        Parameters
        ----------
        dataset : DatasetH
            dataset providing the "feature" column set
        segment : Text or slice
            segment to predict on, "test" by default

        Returns
        -------
        pd.Series
            predictions indexed like the prepared feature frame

        Raises
        ------
        ValueError
            if the model has not been fitted yet
        """
        if self.model is None:
            raise ValueError("model is not fitted yet!")
        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
        return pd.Series(self.model.predict(x_test.values), index=x_test.index)

    def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20):
        """
        finetune model

        Parameters
        ----------
        dataset : DatasetH
            dataset for finetuning
        num_boost_round : int
            number of round to finetune model
        verbose_eval : int
            verbose level

        Raises
        ------
        ValueError
            if the "train" or "valid" segment of the dataset is empty
        """
        # Based on existing model and finetune by train more rounds.
        # `_prepare_data` already rejects empty segments; the returned object
        # is an `lgb.Dataset`, which has no DataFrame-style `.empty` attribute,
        # so no emptiness check is repeated here.
        dtrain, _ = self._prepare_data(dataset)
        self.model = lgb.train(
            self.params,
            dtrain,
            num_boost_round=num_boost_round,
            init_model=self.model,
            valid_sets=[dtrain],
            valid_names=["train"],
            verbose_eval=verbose_eval,
        )
|