From ce60097722d46bb431224d1bf9e67e59b2e03224 Mon Sep 17 00:00:00 2001
From: meng-ustc <dongmeng@mail.ustc.edu.cn>
Date: Wed, 24 Feb 2021 16:59:31 +0900
Subject: [PATCH] Add README and Formatted

---
 README.md                                    |  1 +
 examples/benchmarks/DoubleEnsemble/README.md |  4 +
 qlib/contrib/model/double_ensemble.py        | 77 ++++++++++----------
 3 files changed, 45 insertions(+), 37 deletions(-)
 create mode 100644 examples/benchmarks/DoubleEnsemble/README.md

diff --git a/README.md b/README.md
index 787075d6a..e1608c37d 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ Here is a list of models built on `Qlib`.
 - [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
 - [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
 - [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
+- [DoubleEnsemble based on LightGBM (Chuheng Zhang, et al. 2020)](qlib/contrib/model/double_ensemble.py)
 
 Your PR of new Quant models is highly welcomed.
 
diff --git a/examples/benchmarks/DoubleEnsemble/README.md b/examples/benchmarks/DoubleEnsemble/README.md
new file mode 100644
index 000000000..67e741050
--- /dev/null
+++ b/examples/benchmarks/DoubleEnsemble/README.md
@@ -0,0 +1,4 @@
+# DoubleEnsemble
+* DoubleEnsemble is an ensemble framework that leverages learning-trajectory-based sample reweighting and shuffling-based feature selection to address both the low signal-to-noise ratio and the increasing number of features in financial data. It identifies the key samples based on the training dynamics of each sample and elicits the key features based on the ablation impact of each feature, measured via shuffling. The framework is applicable to a wide range of base models and is capable of extracting complex patterns while mitigating the overfitting and instability issues of financial market prediction.
+* The code used in Qlib is our own implementation.
+* Paper: DoubleEnsemble: A New Ensemble Method Based on Sample Reweighting and Feature Selection for Financial Data Analysis [https://arxiv.org/pdf/2010.01265.pdf](https://arxiv.org/pdf/2010.01265.pdf).
\ No newline at end of file
diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py
index 786b3987c..c96b69e8b 100644
--- a/qlib/contrib/model/double_ensemble.py
+++ b/qlib/contrib/model/double_ensemble.py
@@ -15,21 +15,22 @@ class DEnsembleModel(Model):
     """Double Ensemble Model"""
 
     def __init__(
-            self,
-            base="gbm",
-            loss="mse",
-            k=6,
-            enable_sr=True,
-            enable_fs=True,
-            alpha1=1.,
-            alpha2=1.,
-            bins_sr=10,
-            bins_fs=5,
-            decay=None,
-            sample_ratios=None,
-            sub_weights=None,
-            epochs=100,
-            **kwargs):
+        self,
+        base="gbm",
+        loss="mse",
+        k=6,
+        enable_sr=True,
+        enable_fs=True,
+        alpha1=1.0,
+        alpha2=1.0,
+        bins_sr=10,
+        bins_fs=5,
+        decay=None,
+        sample_ratios=None,
+        sub_weights=None,
+        epochs=100,
+        **kwargs
+    ):
         self.base = base  # "gbm" or "mlp", specifically, we use lgbm for "gbm"
         self.k = k
         self.enable_sr = enable_sr
@@ -54,10 +55,7 @@ class DEnsembleModel(Model):
         self.params.update(kwargs)
         self.loss = loss
 
-    def fit(
-        self,
-        dataset: DatasetH
-    ):
+    def fit(self, dataset: DatasetH):
         df_train, df_valid = dataset.prepare(
             ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
         )
@@ -71,7 +69,7 @@ class DEnsembleModel(Model):
         # train k sub-models
         for i_k in range(self.k):
             self.sub_features.append(features)
-            self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k))
+            self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
             model_k = self.train_submodel(df_train, df_valid, weights, features)
             self.ensemble.append(model_k)
             # no further sample re-weight and feature selection needed for the last sub-model
@@ -82,12 +80,12 @@ class DEnsembleModel(Model):
             loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
             pred_k = self.predict_sub(model_k, df_train, features)
             pred_sub.iloc[:, i_k] = pred_k
-            pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1)
+            pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
             loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
 
             if self.enable_sr:
                 self.logger.info("Sample re-weighting...")
-                weights = self.sample_reweight(loss_curve, loss_values, i_k+1)
+                weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)
 
             if self.enable_fs:
                 self.logger.info("Feature selection...")
@@ -148,14 +146,14 @@ class DEnsembleModel(Model):
         # calculate h-value for each sample
         h1 = loss_values_norm
         h2 = (l_end / l_start).rank(pct=True)
-        h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * h2})
+        h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
 
         # calculate weights
-        h['bins'] = pd.cut(h['h_value'], self.bins_sr)
-        h_avg = h.groupby('bins')['h_value'].mean()
+        h["bins"] = pd.cut(h["h_value"], self.bins_sr)
+        h_avg = h.groupby("bins")["h_value"].mean()
         weights = pd.Series(np.zeros(N, dtype=float))
         for i_b, b in enumerate(h_avg.index):
-            weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1)
+            weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1)
         return weights
 
     def feature_selection(self, df_train, loss_values):
@@ -170,7 +168,7 @@ class DEnsembleModel(Model):
         x_train, y_train = df_train["feature"], df_train["label"]
         features = x_train.columns
         N, F = x_train.shape
-        g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)})
+        g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
         M = len(self.ensemble)
 
         # shuffle specific columns and calculate g-value for each feature
@@ -179,23 +177,27 @@ class DEnsembleModel(Model):
             x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
             pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
             for i_s, submodel in enumerate(self.ensemble):
-                pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values),
-                                  index=x_train_tmp.index) / M
+                pred += (
+                    pd.Series(
+                        submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
+                    )
+                    / M
+                )
             loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
-            g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
+            g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
             x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
 
         # one column in train features is all-nan # if g['g_value'].isna().any()
-        g['g_value'].replace(np.nan, 0, inplace=True)
+        g["g_value"].replace(np.nan, 0, inplace=True)
 
         # divide features into bins_fs bins
-        g['bins'] = pd.cut(g['g_value'], self.bins_fs)
+        g["bins"] = pd.cut(g["g_value"], self.bins_fs)
 
         # randomly sample features from bins to construct the new features
         res_feat = []
-        sorted_bins = sorted(g['bins'].unique(), reverse=True)
+        sorted_bins = sorted(g["bins"].unique(), reverse=True)
         for i_b, b in enumerate(sorted_bins):
-            b_feat = features[g['bins'] == b]
+            b_feat = features[g["bins"] == b]
             num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
             res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
         return pd.Index(res_feat)
@@ -233,12 +235,13 @@ class DEnsembleModel(Model):
         pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
         for i_sub, submodel in enumerate(self.ensemble):
             feat_sub = self.sub_features[i_sub]
-            pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub]
+            pred += (
+                pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
+                * self.sub_weights[i_sub]
+            )
         return pred
 
     def predict_sub(self, submodel, df_data, features):
         x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
         pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
         return pred_sub
-
-