1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Add README and Formatted

This commit is contained in:
meng-ustc
2021-02-24 16:59:31 +09:00
parent 1a990fdd25
commit ce60097722
3 changed files with 45 additions and 37 deletions

View File

@@ -232,6 +232,7 @@ Here is a list of models built on `Qlib`.
- [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
- [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
- [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
- [DoubleEnsemble based on LightGBM (Chuheng Zhang, et al. 2020)](qlib/contrib/model/double_ensemble.py)
Your PRs for new Quant models are highly welcome.

View File

@@ -0,0 +1,4 @@
# DoubleEnsemble
* DoubleEnsemble is an ensemble framework that combines learning-trajectory-based sample reweighting with shuffling-based feature selection to address two problems in financial data: a low signal-to-noise ratio and a growing number of features. It identifies the key samples from the training dynamics of each sample and selects the key features from the ablation impact of each feature, measured by shuffling that feature. The framework is applicable to a wide range of base models and is capable of extracting complex patterns while mitigating the overfitting and instability issues of financial market prediction.
* The code used in Qlib is our own implementation.
* Paper: DoubleEnsemble: A New Ensemble Method Based on Sample Reweighting and Feature Selection for Financial Data Analysis [https://arxiv.org/pdf/2010.01265.pdf](https://arxiv.org/pdf/2010.01265.pdf).

View File

@@ -15,21 +15,22 @@ class DEnsembleModel(Model):
"""Double Ensemble Model"""
def __init__(
self,
base="gbm",
loss="mse",
k=6,
enable_sr=True,
enable_fs=True,
alpha1=1.,
alpha2=1.,
bins_sr=10,
bins_fs=5,
decay=None,
sample_ratios=None,
sub_weights=None,
epochs=100,
**kwargs):
self,
base="gbm",
loss="mse",
k=6,
enable_sr=True,
enable_fs=True,
alpha1=1.0,
alpha2=1.0,
bins_sr=10,
bins_fs=5,
decay=None,
sample_ratios=None,
sub_weights=None,
epochs=100,
**kwargs
):
self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm"
self.k = k
self.enable_sr = enable_sr
@@ -54,10 +55,7 @@ class DEnsembleModel(Model):
self.params.update(kwargs)
self.loss = loss
def fit(
self,
dataset: DatasetH
):
def fit(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
@@ -71,7 +69,7 @@ class DEnsembleModel(Model):
# train k sub-models
for i_k in range(self.k):
self.sub_features.append(features)
self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k))
self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
model_k = self.train_submodel(df_train, df_valid, weights, features)
self.ensemble.append(model_k)
# no further sample re-weight and feature selection needed for the last sub-model
@@ -82,12 +80,12 @@ class DEnsembleModel(Model):
loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
pred_k = self.predict_sub(model_k, df_train, features)
pred_sub.iloc[:, i_k] = pred_k
pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1)
pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
if self.enable_sr:
self.logger.info("Sample re-weighting...")
weights = self.sample_reweight(loss_curve, loss_values, i_k+1)
weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)
if self.enable_fs:
self.logger.info("Feature selection...")
@@ -148,14 +146,14 @@ class DEnsembleModel(Model):
# calculate h-value for each sample
h1 = loss_values_norm
h2 = (l_end / l_start).rank(pct=True)
h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * h2})
h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
# calculate weights
h['bins'] = pd.cut(h['h_value'], self.bins_sr)
h_avg = h.groupby('bins')['h_value'].mean()
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
h_avg = h.groupby("bins")["h_value"].mean()
weights = pd.Series(np.zeros(N, dtype=float))
for i_b, b in enumerate(h_avg.index):
weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1)
weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1)
return weights
def feature_selection(self, df_train, loss_values):
@@ -170,7 +168,7 @@ class DEnsembleModel(Model):
x_train, y_train = df_train["feature"], df_train["label"]
features = x_train.columns
N, F = x_train.shape
g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)})
g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
M = len(self.ensemble)
# shuffle specific columns and calculate g-value for each feature
@@ -179,23 +177,27 @@ class DEnsembleModel(Model):
x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
for i_s, submodel in enumerate(self.ensemble):
pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values),
index=x_train_tmp.index) / M
pred += (
pd.Series(
submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
)
/ M
)
loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
# one column in train features is all-nan # if g['g_value'].isna().any()
g['g_value'].replace(np.nan, 0, inplace=True)
g["g_value"].replace(np.nan, 0, inplace=True)
# divide features into bins_fs bins
g['bins'] = pd.cut(g['g_value'], self.bins_fs)
g["bins"] = pd.cut(g["g_value"], self.bins_fs)
# randomly sample features from bins to construct the new features
res_feat = []
sorted_bins = sorted(g['bins'].unique(), reverse=True)
sorted_bins = sorted(g["bins"].unique(), reverse=True)
for i_b, b in enumerate(sorted_bins):
b_feat = features[g['bins'] == b]
b_feat = features[g["bins"] == b]
num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
return pd.Index(res_feat)
@@ -233,12 +235,13 @@ class DEnsembleModel(Model):
pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
for i_sub, submodel in enumerate(self.ensemble):
feat_sub = self.sub_features[i_sub]
pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub]
pred += (
pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
* self.sub_weights[i_sub]
)
return pred
def predict_sub(self, submodel, df_data, features):
    """Predict with a single sub-model on a chosen subset of feature columns.

    Parameters
    ----------
    submodel :
        Object exposing ``predict(array)``; it is called with the raw values
        of the selected feature columns.
    df_data : pd.DataFrame
        Data whose ``"feature"`` entry holds the feature columns. A
        ``"label"`` entry may be present but is not required, because the
        label is never used for prediction.
    features :
        Column labels used to select from ``df_data["feature"]``.

    Returns
    -------
    pd.Series
        The sub-model's predictions, indexed like ``df_data["feature"]``.
    """
    # Only the feature columns are read; the label lookup was dropped so that
    # unlabeled data can be scored without raising KeyError.
    x_data = df_data["feature"].loc[:, features]
    return pd.Series(submodel.predict(x_data.values), index=x_data.index)