diff --git a/examples/model_interpreter.py b/examples/model_interpreter.py new file mode 100644 index 000000000..1d9230b8c --- /dev/null +++ b/examples/model_interpreter.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +import qlib +from qlib.config import REG_CN + +from qlib.utils import exists_qlib_data, init_instance_by_config +from qlib.tests.data import GetData + +market = "csi300" +benchmark = "SH000300" + +################################### +# config +################################### +data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "fit_start_time": "2008-01-01", + "fit_end_time": "2014-12-31", + "instruments": market, +} + +task = { + "model": { + "class": "LGBModel", + "module_path": "qlib.contrib.model.gbdt", + "kwargs": { + "loss": "mse", + "colsample_bytree": 0.8879, + "learning_rate": 0.0421, + "subsample": 0.8789, + "lambda_l1": 205.6999, + "lambda_l2": 580.9768, + "max_depth": 8, + "num_leaves": 210, + "num_threads": 20, + }, + }, + "dataset": { + "class": "DatasetH", + "module_path": "qlib.data.dataset", + "kwargs": { + "handler": { + "class": "Alpha158", + "module_path": "qlib.contrib.data.handler", + "kwargs": data_handler_config, + }, + "segments": { + "train": ("2008-01-01", "2014-12-31"), + "valid": ("2015-01-01", "2016-12-31"), + "test": ("2017-01-01", "2020-08-01"), + }, + }, + }, +} + + +if __name__ == "__main__": + + # use default data + provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir + if not exists_qlib_data(provider_uri): + print(f"Qlib data is not found in {provider_uri}") + GetData().qlib_data(target_dir=provider_uri, region=REG_CN) + + qlib.init(provider_uri=provider_uri, region=REG_CN) + + ################################### + # train model + ################################### + # model initialization + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + # get 
model feature importance + feature_importance = model.get_feature_importance() + print("feature importance:") + print(feature_importance) diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index 98b9b9c2d..5138e0e6f 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -10,9 +10,10 @@ from catboost.utils import get_gpu_device_count from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import FeatureInt -class CatBoostModel(Model): +class CatBoostModel(Model, FeatureInt): """CatBoost Model""" def __init__(self, loss="RMSE", **kwargs): @@ -69,6 +70,18 @@ class CatBoostModel(Model): x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) + def get_feature_importance(self, *args, **kwargs) -> pd.Series: + """get feature importance + + Notes + ----- + parameters references: + https://catboost.ai/docs/concepts/python-reference_catboost_get_feature_importance.html#python-reference_catboost_get_feature_importance + """ + return pd.Series( + data=self.model.get_feature_importance(*args, **kwargs), index=self.model.feature_names_ + ).sort_values(ascending=False) + if __name__ == "__main__": cat = CatBoostModel() diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 4b267a2b0..d3ca898f8 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -1,251 +1,265 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -import lightgbm as lgb -import numpy as np -import pandas as pd -from typing import Text, Union -from ...model.base import Model -from ...data.dataset import DatasetH -from ...data.dataset.handler import DataHandlerLP -from ...log import get_module_logger - - -class DEnsembleModel(Model): - """Double Ensemble Model""" - - def __init__( - self, - base_model="gbm", - loss="mse", - num_models=6, - enable_sr=True, - enable_fs=True, - alpha1=1.0, - alpha2=1.0, - bins_sr=10, - bins_fs=5, - decay=None, - sample_ratios=None, - sub_weights=None, - epochs=100, - **kwargs - ): - self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" - self.num_models = num_models # the number of sub-models - self.enable_sr = enable_sr - self.enable_fs = enable_fs - self.alpha1 = alpha1 - self.alpha2 = alpha2 - self.bins_sr = bins_sr - self.bins_fs = bins_fs - self.decay = decay - if sample_ratios is None: # the default values for sample_ratios - sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4] - if sub_weights is None: # the default values for sub_weights - sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2] - if not len(sample_ratios) == bins_fs: - raise ValueError("The length of sample_ratios should be equal to bins_fs.") - self.sample_ratios = sample_ratios - if not len(sub_weights) == num_models: - raise ValueError("The length of sub_weights should be equal to num_models.") - self.sub_weights = sub_weights - self.epochs = epochs - self.logger = get_module_logger("DEnsembleModel") - self.logger.info("Double Ensemble Model...") - self.ensemble = [] # the current ensemble model, a list contains all the sub-models - self.sub_features = [] # the features for each sub model in the form of pandas.Index - self.params = {"objective": loss} - self.params.update(kwargs) - self.loss = loss - - def fit(self, dataset: DatasetH): - df_train, df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L - ) - x_train, y_train = 
df_train["feature"], df_train["label"] - # initialize the sample weights - N, F = x_train.shape - weights = pd.Series(np.ones(N, dtype=float)) - # initialize the features - features = x_train.columns - pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index) - # train sub-models - for k in range(self.num_models): - self.sub_features.append(features) - self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models)) - model_k = self.train_submodel(df_train, df_valid, weights, features) - self.ensemble.append(model_k) - # no further sample re-weight and feature selection needed for the last sub-model - if k + 1 == self.num_models: - break - - self.logger.info("Retrieving loss curve and loss values...") - loss_curve = self.retrieve_loss_curve(model_k, df_train, features) - pred_k = self.predict_sub(model_k, df_train, features) - pred_sub.iloc[:, k] = pred_k - pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1) - loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values)) - - if self.enable_sr: - self.logger.info("Sample re-weighting...") - weights = self.sample_reweight(loss_curve, loss_values, k + 1) - - if self.enable_fs: - self.logger.info("Feature selection...") - features = self.feature_selection(df_train, loss_values) - - def train_submodel(self, df_train, df_valid, weights, features): - dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) - evals_result = dict() - model = lgb.train( - self.params, - dtrain, - num_boost_round=self.epochs, - valid_sets=[dtrain, dvalid], - valid_names=["train", "valid"], - verbose_eval=20, - evals_result=evals_result, - ) - evals_result["train"] = list(evals_result["train"].values())[0] - evals_result["valid"] = list(evals_result["valid"].values())[0] - return model - - def _prepare_data_gbm(self, df_train, df_valid, weights, features): - x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] - x_valid, 
y_valid = df_valid["feature"].loc[:, features], df_valid["label"] - - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") - - dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) - return dtrain, dvalid - - def sample_reweight(self, loss_curve, loss_values, k_th): - """ - the SR module of Double Ensemble - :param loss_curve: the shape is NxT - the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample - after the t-th iteration in the training of the previous sub-model. - :param loss_values: the shape is N - the loss of the current ensemble on the i-th sample. - :param k_th: the index of the current sub-model, starting from 1 - :return: weights - the weights for all the samples. - """ - # normalize loss_curve and loss_values with ranking - loss_curve_norm = loss_curve.rank(axis=0, pct=True) - loss_values_norm = (-loss_values).rank(pct=True) - - # calculate l_start and l_end from loss_curve - N, T = loss_curve.shape - part = np.maximum(int(T * 0.1), 1) - l_start = loss_curve_norm.iloc[:, :part].mean(axis=1) - l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1) - - # calculate h-value for each sample - h1 = loss_values_norm - h2 = (l_end / l_start).rank(pct=True) - h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2}) - - # calculate weights - h["bins"] = pd.cut(h["h_value"], self.bins_sr) - h_avg = h.groupby("bins")["h_value"].mean() - weights = pd.Series(np.zeros(N, dtype=float)) - for i_b, b in enumerate(h_avg.index): - weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1) - return weights - - def feature_selection(self, df_train, loss_values): - """ - the FS module of Double Ensemble - :param df_train: the shape is NxF - 
:param loss_values: the shape is N - the loss of the current ensemble on the i-th sample. - :return: res_feat: in the form of pandas.Index - - """ - x_train, y_train = df_train["feature"], df_train["label"] - features = x_train.columns - N, F = x_train.shape - g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)}) - M = len(self.ensemble) - - # shuffle specific columns and calculate g-value for each feature - x_train_tmp = x_train.copy() - for i_f, feat in enumerate(features): - x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values) - pred = pd.Series(np.zeros(N), index=x_train_tmp.index) - for i_s, submodel in enumerate(self.ensemble): - pred += ( - pd.Series( - submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index - ) - / M - ) - loss_feat = self.get_loss(y_train.values.squeeze(), pred.values) - g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7) - x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy() - - # one column in train features is all-nan # if g['g_value'].isna().any() - g["g_value"].replace(np.nan, 0, inplace=True) - - # divide features into bins_fs bins - g["bins"] = pd.cut(g["g_value"], self.bins_fs) - - # randomly sample features from bins to construct the new features - res_feat = [] - sorted_bins = sorted(g["bins"].unique(), reverse=True) - for i_b, b in enumerate(sorted_bins): - b_feat = features[g["bins"] == b] - num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat))) - res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist() - return pd.Index(res_feat) - - def get_loss(self, label, pred): - if self.loss == "mse": - return (label - pred) ** 2 - else: - raise ValueError("not implemented yet") - - def retrieve_loss_curve(self, model, df_train, features): - if self.base_model == "gbm": - num_trees = model.num_trees() - x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] - # Lightgbm need 1D 
array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train = np.squeeze(y_train.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") - - N = x_train.shape[0] - loss_curve = pd.DataFrame(np.zeros((N, num_trees))) - pred_tree = np.zeros(N, dtype=float) - for i_tree in range(num_trees): - pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1) - loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree) - else: - raise ValueError("not implemented yet") - return loss_curve - - def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): - if self.ensemble is None: - raise ValueError("model is not fitted yet!") - x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) - pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) - for i_sub, submodel in enumerate(self.ensemble): - feat_sub = self.sub_features[i_sub] - pred += ( - pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) - * self.sub_weights[i_sub] - ) - return pred - - def predict_sub(self, submodel, df_data, features): - x_data, y_data = df_data["feature"].loc[:, features], df_data["label"] - pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index) - return pred_sub +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+from typing import Text, Union
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+from ...model.interpret.base import FeatureInt
+from ...log import get_module_logger
+
+
+class DEnsembleModel(Model, FeatureInt):
+    """Double Ensemble Model"""
+
+    def __init__(
+        self,
+        base_model="gbm",
+        loss="mse",
+        num_models=6,
+        enable_sr=True,
+        enable_fs=True,
+        alpha1=1.0,
+        alpha2=1.0,
+        bins_sr=10,
+        bins_fs=5,
+        decay=None,
+        sample_ratios=None,
+        sub_weights=None,
+        epochs=100,
+        **kwargs
+    ):
+        self.base_model = base_model  # "gbm" or "mlp", specifically, we use lgbm for "gbm"
+        self.num_models = num_models  # the number of sub-models
+        self.enable_sr = enable_sr
+        self.enable_fs = enable_fs
+        self.alpha1 = alpha1
+        self.alpha2 = alpha2
+        self.bins_sr = bins_sr
+        self.bins_fs = bins_fs
+        self.decay = decay  # NOTE(review): sample_reweight computes `self.decay ** k_th`; None fails when enable_sr is on
+        if sample_ratios is None:  # the default values for sample_ratios
+            sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4]
+        if sub_weights is None:  # the default values for sub_weights
+            sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2]
+        if not len(sample_ratios) == bins_fs:
+            raise ValueError("The length of sample_ratios should be equal to bins_fs.")
+        self.sample_ratios = sample_ratios
+        if not len(sub_weights) == num_models:
+            raise ValueError("The length of sub_weights should be equal to num_models.")
+        self.sub_weights = sub_weights
+        self.epochs = epochs
+        self.logger = get_module_logger("DEnsembleModel")
+        self.logger.info("Double Ensemble Model...")
+        self.ensemble = []  # the current ensemble model, a list contains all the sub-models
+        self.sub_features = []  # the features for each sub model in the form of pandas.Index
+        self.params = {"objective": loss}
+        self.params.update(kwargs)
+        self.loss = loss
+
+    def fit(self, dataset: DatasetH):
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+        x_train, y_train = df_train["feature"], df_train["label"]
+        # initialize the sample weights
+        N, F = x_train.shape
+        weights = pd.Series(np.ones(N, dtype=float))
+        # initialize the features
+        features = x_train.columns
+        pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
+        # train sub-models
+        for k in range(self.num_models):
+            self.sub_features.append(features)
+            self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
+            model_k = self.train_submodel(df_train, df_valid, weights, features)
+            self.ensemble.append(model_k)
+            # no further sample re-weight and feature selection needed for the last sub-model
+            if k + 1 == self.num_models:
+                break
+
+            self.logger.info("Retrieving loss curve and loss values...")
+            loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
+            pred_k = self.predict_sub(model_k, df_train, features)
+            pred_sub.iloc[:, k] = pred_k
+            pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
+            loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
+
+            if self.enable_sr:
+                self.logger.info("Sample re-weighting...")
+                weights = self.sample_reweight(loss_curve, loss_values, k + 1)
+
+            if self.enable_fs:
+                self.logger.info("Feature selection...")
+                features = self.feature_selection(df_train, loss_values)
+
+    def train_submodel(self, df_train, df_valid, weights, features):
+        dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features)
+        evals_result = dict()
+        model = lgb.train(
+            self.params,
+            dtrain,
+            num_boost_round=self.epochs,
+            valid_sets=[dtrain, dvalid],
+            valid_names=["train", "valid"],
+            verbose_eval=20,
+            evals_result=evals_result,
+        )
+        evals_result["train"] = list(evals_result["train"].values())[0]
+        evals_result["valid"] = list(evals_result["valid"].values())[0]
+        return model
+
+    def _prepare_data_gbm(self, df_train, df_valid, weights, features):
+        x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
+        x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"]
+
+        # LightGBM needs a 1D array as its label
+        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+            y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+        else:
+            raise ValueError("LightGBM doesn't support multi-label training")
+
+        dtrain = lgb.Dataset(x_train, label=y_train, weight=weights)
+        dvalid = lgb.Dataset(x_valid, label=y_valid)
+        return dtrain, dvalid
+
+    def sample_reweight(self, loss_curve, loss_values, k_th):
+        """
+        the SR module of Double Ensemble
+        :param loss_curve: the shape is NxT
+        the loss curve for the previous sub-model, where the element (i, t) is the error on the i-th sample
+        after the t-th iteration in the training of the previous sub-model.
+        :param loss_values: the shape is N
+        the loss of the current ensemble on the i-th sample.
+        :param k_th: the index of the current sub-model, starting from 1
+        :return: weights
+        the weights for all the samples.
+        """
+        # normalize loss_curve and loss_values with ranking
+        loss_curve_norm = loss_curve.rank(axis=0, pct=True)
+        loss_values_norm = (-loss_values).rank(pct=True)
+
+        # calculate l_start and l_end from loss_curve
+        N, T = loss_curve.shape
+        part = np.maximum(int(T * 0.1), 1)
+        l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
+        l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
+
+        # calculate h-value for each sample
+        h1 = loss_values_norm
+        h2 = (l_end / l_start).rank(pct=True)
+        h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
+
+        # calculate weights; h_avg is read by position because `[]` with an int is ambiguous on an interval-bin index
+        h["bins"] = pd.cut(h["h_value"], self.bins_sr)
+        h_avg = h.groupby("bins")["h_value"].mean()
+        weights = pd.Series(np.zeros(N, dtype=float))
+        for i_b, b in enumerate(h_avg.index):
+            weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg.iloc[i_b] + 0.1)
+        return weights
+
+    def feature_selection(self, df_train, loss_values):
+        """
+        the FS module of Double Ensemble
+        :param df_train: the shape is NxF
+        :param loss_values: the shape is N
+        the loss of the current ensemble on the i-th sample.
+        :return: res_feat: in the form of pandas.Index
+
+        """
+        x_train, y_train = df_train["feature"], df_train["label"]
+        features = x_train.columns
+        N, F = x_train.shape
+        g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
+        M = len(self.ensemble)
+
+        # shuffle specific columns and calculate g-value for each feature
+        x_train_tmp = x_train.copy()
+        for i_f, feat in enumerate(features):
+            x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
+            pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
+            for i_s, submodel in enumerate(self.ensemble):
+                pred += (
+                    pd.Series(
+                        submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
+                    )
+                    / M
+                )
+            loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
+            g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7)
+            x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
+
+        # an all-NaN feature column yields a NaN g-value; assign back (inplace on a column selection may not stick)
+        g["g_value"] = g["g_value"].fillna(0)
+
+        # divide features into bins_fs bins
+        g["bins"] = pd.cut(g["g_value"], self.bins_fs)
+
+        # randomly sample features from bins to construct the new features
+        res_feat = []
+        sorted_bins = sorted(g["bins"].unique(), reverse=True)
+        for i_b, b in enumerate(sorted_bins):
+            b_feat = features[g["bins"] == b]
+            num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
+            res_feat = res_feat + np.random.choice(b_feat, size=num_feat, replace=False).tolist()
+        return pd.Index(res_feat).unique()  # de-duplicate in bin order; a set's order depends on string hashing
+
+    def get_loss(self, label, pred):
+        if self.loss == "mse":
+            return (label - pred) ** 2
+        else:
+            raise ValueError("not implemented yet")
+
+    def retrieve_loss_curve(self, model, df_train, features):
+        if self.base_model == "gbm":
+            num_trees = model.num_trees()
+            x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
+            # LightGBM needs a 1D array as its label
+            if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+                y_train = np.squeeze(y_train.values)
+            else:
+                raise ValueError("LightGBM doesn't support multi-label training")
+
+            N = x_train.shape[0]
+            loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
+            pred_tree = np.zeros(N, dtype=float)
+            for i_tree in range(num_trees):
+                pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
+                loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
+        else:
+            raise ValueError("not implemented yet")
+        return loss_curve
+
+    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
+        if not self.ensemble:  # self.ensemble starts as [] and is never None
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
+        pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
+        for i_sub, submodel in enumerate(self.ensemble):
+            feat_sub = self.sub_features[i_sub]
+            pred += (
+                pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
+                * self.sub_weights[i_sub]
+            )
+        return pred
+
+    def predict_sub(self, submodel, df_data, features):
+        x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
+        pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
+        return pred_sub
+
+    def get_feature_importance(self, *args, **kwargs) -> pd.Series:
+        """get feature importance
+
+        Notes
+        -----
+            parameters reference:
+            https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance
+        """
+        res = []
+        for _model, _weight in zip(self.ensemble, self.sub_weights):
+            res.append(pd.Series(_model.feature_importance(*args, **kwargs), index=_model.feature_name()) * _weight)
+        return pd.concat(res, axis=1, sort=False).sum(axis=1).sort_values(ascending=False)
diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py
index 463cf8f4f..1a7cf7fba 100644
--- a/qlib/contrib/model/gbdt.py
+++ b/qlib/contrib/model/gbdt.py
@@
-8,9 +8,10 @@ from typing import Text, Union from ...model.base import ModelFT from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.interpret.base import LightGBMFInt -class LGBModel(ModelFT): +class LGBModel(ModelFT, LightGBMFInt): """LightGBM Model""" def __init__(self, loss="mse", **kwargs): @@ -33,8 +34,8 @@ class LGBModel(ModelFT): else: raise ValueError("LightGBM doesn't support multi-label training") - dtrain = lgb.Dataset(x_train.values, label=y_train) - dvalid = lgb.Dataset(x_valid.values, label=y_valid) + dtrain = lgb.Dataset(x_train, label=y_train) + dvalid = lgb.Dataset(x_valid, label=y_valid) return dtrain, dvalid def fit( diff --git a/qlib/contrib/model/highfreq_gdbt_model.py b/qlib/contrib/model/highfreq_gdbt_model.py index 5a2eeb50a..04d6ab9d5 100644 --- a/qlib/contrib/model/highfreq_gdbt_model.py +++ b/qlib/contrib/model/highfreq_gdbt_model.py @@ -1,17 +1,18 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
+import warnings
 import numpy as np
 import pandas as pd
 import lightgbm as lgb
 
-from qlib.model.base import ModelFT
-from qlib.data.dataset import DatasetH
-from qlib.data.dataset.handler import DataHandlerLP
-import warnings
+from ...model.base import ModelFT
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+from ...model.interpret.base import LightGBMFInt
 
 
-class HFLGBModel(ModelFT):
+class HFLGBModel(ModelFT, LightGBMFInt):
     """LightGBM Model for high frequency prediction"""
 
     def __init__(self, loss="mse", **kwargs):
@@ -97,8 +98,8 @@ class HFLGBModel(ModelFT):
         else:
             raise ValueError("LightGBM doesn't support multi-label training")
 
-        dtrain = lgb.Dataset(x_train.values, label=y_train)
-        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
+        dtrain = lgb.Dataset(x_train, label=y_train)
+        dvalid = lgb.Dataset(x_valid, label=y_valid)
         return dtrain, dvalid
 
     def fit(
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index cbba14678..2a38f4fe1 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -8,9 +8,10 @@ from typing import Text, Union
 from ...model.base import Model
 from ...data.dataset import DatasetH
 from ...data.dataset.handler import DataHandlerLP
+from ...model.interpret.base import FeatureInt
 
 
-class XGBModel(Model):
+class XGBModel(Model, FeatureInt):
     """XGBModel Model"""
 
     def __init__(self, **kwargs):
@@ -42,8 +43,8 @@ class XGBModel(Model):
         else:
             raise ValueError("XGBoost doesn't support multi-label training")
 
-        dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
-        dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
+        dtrain = xgb.DMatrix(x_train, label=y_train_1d)
+        dvalid = xgb.DMatrix(x_valid, label=y_valid_1d)
         self.model = xgb.train(
             self._params,
             dtrain=dtrain,
@@ -62,3 +63,15 @@ class XGBModel(Model):
             raise ValueError("model is not fitted yet!")
         x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
-        return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)
+        # Build the DMatrix from the DataFrame, not `.values`: training now uses a DataFrame-backed DMatrix,
+        # so the booster carries feature names that the prediction DMatrix has to match.
+        return pd.Series(self.model.predict(xgb.DMatrix(x_test)), index=x_test.index)
+
+    def get_feature_importance(self, *args, **kwargs) -> pd.Series:
+        """get feature importance
+
+        Notes
+        -----
+            parameters reference:
+            https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.get_score
+        """
+        return pd.Series(self.model.get_score(*args, **kwargs)).sort_values(ascending=False)
diff --git a/qlib/model/interpret/__init__.py b/qlib/model/interpret/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/qlib/model/interpret/base.py b/qlib/model/interpret/base.py
new file mode 100644
index 000000000..70d79faca
--- /dev/null
+++ b/qlib/model/interpret/base.py
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Interfaces to interpret models
+"""
+
+import pandas as pd
+from abc import abstractmethod
+
+
+class FeatureInt:
+    """Feature (Int)erpreter"""
+
+    @abstractmethod
+    def get_feature_importance(self, *args, **kwargs) -> pd.Series:
+        """get feature importance
+
+        The implementations in this change forward ``*args`` / ``**kwargs`` to the underlying library call.
+
+        Returns
+        -------
+        pd.Series
+            importance scores, sorted in descending order
+        """
+        # FeatureInt does not use ABCMeta, so @abstractmethod alone does not stop instantiation;
+        # raise explicitly rather than silently returning None.
+        raise NotImplementedError("Please implement `get_feature_importance`")
+
+
+class LightGBMFInt(FeatureInt):
+    """LightGBM (F)eature (Int)erpreter"""
+
+    def get_feature_importance(self, *args, **kwargs) -> pd.Series:
+        """get feature importance
+
+        Notes
+        -----
+            parameters reference:
+            https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance
+        """
+        return pd.Series(self.model.feature_importance(*args, **kwargs), index=self.model.feature_name()).sort_values(
+            ascending=False
+        )