1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 18:40:58 +08:00

Merge pull request #286 from meng-ustc/main

Add a new method to benchmarks: DoubleEnsemble
This commit is contained in:
you-n-g
2021-03-02 18:34:17 +08:00
committed by GitHub
7 changed files with 445 additions and 1 deletions

View File

@@ -237,6 +237,7 @@ Here is a list of models built on `Qlib`.
- [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
- [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
- [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
- [DoubleEnsemble based on LightGBM (Chuheng Zhang, et al. 2020)](qlib/contrib/model/double_ensemble.py)
Your PR of new Quant models is highly welcomed.

View File

@@ -0,0 +1,4 @@
# DoubleEnsemble
* DoubleEnsemble is an ensemble framework leveraging learning trajectory based sample reweighting and shuffling based feature selection, to solve both the low signal-to-noise ratio and increasing number of features problems. They identify the key samples based on the training dynamics on each sample and elicit key features based on the ablation impact of each feature via shuffling. The model is applicable to a wide range of base models, capable of extracting complex patterns, while mitigating the overfitting and instability issues for financial market prediction.
* This code used in Qlib is implemented by ourselves.
* Paper: DoubleEnsemble: A New Ensemble Method Based on Sample Reweighting and Feature Selection for Financial Data Analysis [https://arxiv.org/pdf/2010.01265.pdf](https://arxiv.org/pdf/2010.01265.pdf).

View File

@@ -0,0 +1,3 @@
pandas==1.1.2
numpy==1.17.4
lightgbm==3.1.0

View File

@@ -0,0 +1,90 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: DEnsembleModel
module_path: qlib.contrib.model.double_ensemble
kwargs:
base_model: "gbm"
loss: mse
num_models: 6
enable_sr: True
enable_fs: True
alpha1: 1
alpha2: 1
bins_sr: 10
bins_fs: 5
decay: 0.5
sample_ratios:
- 0.8
- 0.7
- 0.6
- 0.5
- 0.4
sub_weights:
- 1
- 0.2
- 0.2
- 0.2
- 0.2
- 0.2
epochs: 28
colsample_bytree: 0.8879
learning_rate: 0.2
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768
max_depth: 8
num_leaves: 210
num_threads: 20
verbosity: -1
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,97 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors: []
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: DEnsembleModel
module_path: qlib.contrib.model.double_ensemble
kwargs:
base_model: "gbm"
loss: mse
num_models: 6
enable_sr: True
enable_fs: True
alpha1: 1
alpha2: 1
bins_sr: 10
bins_fs: 5
decay: 0.5
sample_ratios:
- 0.8
- 0.7
- 0.6
- 0.5
- 0.4
sub_weights:
- 1
- 0.2
- 0.2
- 0.2
- 0.2
- 0.2
epochs: 136
colsample_bytree: 0.8879
learning_rate: 0.0421
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768
max_depth: 8
num_leaves: 210
num_threads: 20
verbosity: -1
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -16,7 +16,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha360 | 0.0407±0.00| 0.3053±0.00 | 0.0490±0.00 | 0.3840±0.00 | 0.0380±0.02 | 0.5000±0.21 | -0.0984±0.02 |
## Alpha158 dataset
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|---|---|---|---|---|---|---|---|---|
@@ -31,5 +31,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 |
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 |
| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4338±0.01 | 0.0523±0.00 | 0.4257±0.01 | 0.1253±0.01 | 1.4105±0.14 | -0.0902±0.01 |
- The selected 20 features are based on the feature importance of a lightgbm-based model.
- The base model of DoubleEnsemble is LGBM.

View File

@@ -0,0 +1,247 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import lightgbm as lgb
import numpy as np
import pandas as pd
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...log import get_module_logger
class DEnsembleModel(Model):
"""Double Ensemble Model"""
def __init__(
self,
base_model="gbm",
loss="mse",
num_models=6,
enable_sr=True,
enable_fs=True,
alpha1=1.0,
alpha2=1.0,
bins_sr=10,
bins_fs=5,
decay=None,
sample_ratios=None,
sub_weights=None,
epochs=100,
**kwargs
):
self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm"
self.num_models = num_models # the number of sub-models
self.enable_sr = enable_sr
self.enable_fs = enable_fs
self.alpha1 = alpha1
self.alpha2 = alpha2
self.bins_sr = bins_sr
self.bins_fs = bins_fs
self.decay = decay
if not len(sample_ratios) == bins_fs:
raise ValueError("The length of sample_ratios should be equal to bins_fs.")
self.sample_ratios = sample_ratios
if not len(sub_weights) == num_models:
raise ValueError("The length of sub_weights should be equal to num_models.")
self.sub_weights = sub_weights
self.epochs = epochs
self.logger = get_module_logger("DEnsembleModel")
self.logger.info("Double Ensemble Model...")
self.ensemble = [] # the current ensemble model, a list contains all the sub-models
self.sub_features = [] # the features for each sub model in the form of pandas.Index
self.params = {"objective": loss}
self.params.update(kwargs)
self.loss = loss
def fit(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
x_train, y_train = df_train["feature"], df_train["label"]
# initialize the sample weights
N, F = x_train.shape
weights = pd.Series(np.ones(N, dtype=float))
# initialize the features
features = x_train.columns
pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
# train sub-models
for k in range(self.num_models):
self.sub_features.append(features)
self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
model_k = self.train_submodel(df_train, df_valid, weights, features)
self.ensemble.append(model_k)
# no further sample re-weight and feature selection needed for the last sub-model
if k + 1 == self.num_models:
break
self.logger.info("Retrieving loss curve and loss values...")
loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
pred_k = self.predict_sub(model_k, df_train, features)
pred_sub.iloc[:, k] = pred_k
pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
if self.enable_sr:
self.logger.info("Sample re-weighting...")
weights = self.sample_reweight(loss_curve, loss_values, k + 1)
if self.enable_fs:
self.logger.info("Feature selection...")
features = self.feature_selection(df_train, loss_values)
def train_submodel(self, df_train, df_valid, weights, features):
dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features)
evals_result = dict()
model = lgb.train(
self.params,
dtrain,
num_boost_round=self.epochs,
valid_sets=[dtrain, dvalid],
valid_names=["train", "valid"],
verbose_eval=20,
evals_result=evals_result,
)
evals_result["train"] = list(evals_result["train"].values())[0]
evals_result["valid"] = list(evals_result["valid"].values())[0]
return model
def _prepare_data_gbm(self, df_train, df_valid, weights, features):
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights)
dvalid = lgb.Dataset(x_valid.values, label=y_valid)
return dtrain, dvalid
def sample_reweight(self, loss_curve, loss_values, k_th):
"""
the SR module of Double Ensemble
:param loss_curve: the shape is NxT
the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample
after the t-th iteration in the training of the previous sub-model.
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:param k_th: the index of the current sub-model, starting from 1
:return: weights
the weights for all the samples.
"""
# normalize loss_curve and loss_values with ranking
loss_curve_norm = loss_curve.rank(axis=0, pct=True)
loss_values_norm = (-loss_values).rank(pct=True)
# calculate l_start and l_end from loss_curve
N, T = loss_curve.shape
part = np.maximum(int(T * 0.1), 1)
l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
# calculate h-value for each sample
h1 = loss_values_norm
h2 = (l_end / l_start).rank(pct=True)
h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
# calculate weights
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
h_avg = h.groupby("bins")["h_value"].mean()
weights = pd.Series(np.zeros(N, dtype=float))
for i_b, b in enumerate(h_avg.index):
weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1)
return weights
def feature_selection(self, df_train, loss_values):
"""
the FS module of Double Ensemble
:param df_train: the shape is NxF
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:return: res_feat: in the form of pandas.Index
"""
x_train, y_train = df_train["feature"], df_train["label"]
features = x_train.columns
N, F = x_train.shape
g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
M = len(self.ensemble)
# shuffle specific columns and calculate g-value for each feature
x_train_tmp = x_train.copy()
for i_f, feat in enumerate(features):
x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
for i_s, submodel in enumerate(self.ensemble):
pred += (
pd.Series(
submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
)
/ M
)
loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
# one column in train features is all-nan # if g['g_value'].isna().any()
g["g_value"].replace(np.nan, 0, inplace=True)
# divide features into bins_fs bins
g["bins"] = pd.cut(g["g_value"], self.bins_fs)
# randomly sample features from bins to construct the new features
res_feat = []
sorted_bins = sorted(g["bins"].unique(), reverse=True)
for i_b, b in enumerate(sorted_bins):
b_feat = features[g["bins"] == b]
num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
return pd.Index(res_feat)
def get_loss(self, label, pred):
if self.loss == "mse":
return (label - pred) ** 2
else:
raise ValueError("not implemented yet")
def retrieve_loss_curve(self, model, df_train, features):
if self.base_model == "gbm":
num_trees = model.num_trees()
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train = np.squeeze(y_train.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
N = x_train.shape[0]
loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
pred_tree = np.zeros(N, dtype=float)
for i_tree in range(num_trees):
pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
else:
raise ValueError("not implemented yet")
return loss_curve
def predict(self, dataset):
if self.ensemble is None:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
for i_sub, submodel in enumerate(self.ensemble):
feat_sub = self.sub_features[i_sub]
pred += (
pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
* self.sub_weights[i_sub]
)
return pred
def predict_sub(self, submodel, df_data, features):
x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
return pred_sub