From 45c6dfc5daed1c8e2678fa9ca68c35a514748d3f Mon Sep 17 00:00:00 2001
From: blin <binlins>
Date: Wed, 28 Apr 2021 07:25:19 +0000
Subject: [PATCH 1/5] filter

---
 qlib/data/dataset/__init__.py | 44 ++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 8 deletions(-)
diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index cd15a98c9..5485796ef 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -114,6 +114,7 @@ class DatasetH(Dataset):
         """
         self.handler: DataHandler = init_instance_by_config(handler, accept_types=DataHandler)
         self.segments = segments.copy()
+        self.fetch_kwargs = {}
         super().__init__(**kwargs)
 
     def config(self, handler_kwargs: dict = None, **kwargs):
@@ -171,7 +172,7 @@ class DatasetH(Dataset):
         ----------
         slc : slice
         """
-        return self.handler.fetch(slc, **kwargs)
+        return self.handler.fetch(slc, **kwargs, **self.fetch_kwargs)
 
     def prepare(
         self,
@@ -288,13 +289,29 @@ class TSDataSampler:
 
         # the data type will be changed
         # The index of usable data is between start_idx and end_idx
-        self.start_idx, self.end_idx = self.data.index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end))
         self.idx_df, self.idx_map = self.build_index(self.data)
-        self.idx_arr = np.array(self.idx_df.values, dtype=np.float64)  # for better performance
-        self.data_idx = deepcopy(self.data.index)
+        self.data_index = deepcopy(self.data.index)
 
+        if flt_data is not None:
+            self.flt_data = np.array(flt_data).reshape(-1)
+            self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
+            self.data_index = self.data_index[np.where(self.flt_data == True)[0]]
+
+        self.start_idx, self.end_idx = self.data_index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end))
+        self.idx_arr = np.array(self.idx_df.values, dtype=np.float64)  # for better performance
+        
         del self.data  # save memory
 
+    @staticmethod
+    def flt_idx_map(flt_data, idx_map):
+        idx = 0
+        new_idx_map = {}
+        for i, exist in enumerate(flt_data):
+            if exist:
+                new_idx_map[idx] = idx_map[i]
+                idx += 1
+        return new_idx_map
+
     def get_index(self):
         """
         Get the pandas index of the data, it will be useful in following scenarios
@@ -488,8 +505,19 @@ class TSDatasetH(DatasetH):
         """
         split the _prepare_raw_seg is to leave a hook for data preprocessing before creating processing data
         """
-        dtype = kwargs.pop("dtype", None)
+        dtype = kwargs.pop("dtype")
         start, end = slc.start, slc.stop
-        data = self._prepare_raw_seg(slc=slc, **kwargs)
-        tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype)
-        return tsds
+        flt_col = kwargs.pop('flt_col', None)
+        # TSDatasetH will retrieve more data for complete
+        data = self._prepare_raw_seg(slc, **kwargs)
+
+        flt_kwargs = deepcopy(kwargs)
+        if flt_col is not None:
+            flt_kwargs['col_set'] = flt_col
+            flt_data = self._prepare_raw_seg(slc, **flt_kwargs)
+            assert len(flt_data.columns) == 1
+        else:
+            flt_data = None
+
+        tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype, flt_data=flt_data)
+        return tsds
\ No newline at end of file

From fa4511cb0a82f994b094df25f226e18bb8deb543 Mon Sep 17 00:00:00 2001
From: blin <binlins>
Date: Wed, 28 Apr 2021 07:30:22 +0000
Subject: [PATCH 2/5] filter

---
 qlib/data/dataset/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index 5485796ef..0bdb5018b 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -505,7 +505,7 @@ class TSDatasetH(DatasetH):
         """
         split the _prepare_raw_seg is to leave a hook for data preprocessing before creating processing data
         """
-        dtype = kwargs.pop("dtype")
+        dtype = kwargs.pop("dtype", None)
         start, end = slc.start, slc.stop
         flt_col = kwargs.pop('flt_col', None)
         # TSDatasetH will retrieve more data for complete

From 846c64f6c6e695a7fd13e805af57c68a5de887c9 Mon Sep 17 00:00:00 2001
From: blin <981921742@qq.com>
Date: Thu, 6 May 2021 12:00:41 +0000
Subject: [PATCH 3/5] fix param

---
 qlib/data/dataset/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index 0bdb5018b..8bcd6419a 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -244,7 +244,7 @@ class TSDataSampler:
 
     """
 
-    def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None):
+    def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None, flt_data=None):
         """
         Build a dataset which looks like torch.data.utils.Dataset.
 
@@ -317,7 +317,7 @@ class TSDataSampler:
         Get the pandas index of the data, it will be useful in following scenarios
         - Special sampler will be used (e.g. user want to sample day by day)
         """
-        return self.data_idx[self.start_idx : self.end_idx]
+        return self.data_index[self.start_idx : self.end_idx]
 
     def config(self, **kwargs):
         # Config the attributes

From bec65ddf94a014349a16931833ac85c7f78ebc5d Mon Sep 17 00:00:00 2001
From: binlins <981921742@qq.com>
Date: Fri, 7 May 2021 11:47:47 +0000
Subject: [PATCH 4/5] add document and reindex

---
 qlib/data/dataset/__init__.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index 8bcd6419a..a8b10a258 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -200,6 +200,12 @@ class DatasetH(Dataset):
             The data to fetch:  DK_*
             Default is DK_I, which indicate fetching data for **inference**.
 
+        kwargs :
+            The parameters that kwargs may contain:
+                flt_col : str
+                    It only exists in TSDatasetH; it can be used to add a column of data (True or False) that filters the data.
+                    This parameter is only supported when the dataset is an instance of TSDatasetH.
+
         Returns
         -------
         Union[List[pd.DataFrame], pd.DataFrame]:
@@ -293,7 +299,7 @@ class TSDataSampler:
         self.data_index = deepcopy(self.data.index)
 
         if flt_data is not None:
-            self.flt_data = np.array(flt_data).reshape(-1)
+            self.flt_data = np.array(flt_data.reindex(self.data_index)).reshape(-1)
             self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
             self.data_index = self.data_index[np.where(self.flt_data == True)[0]]
 

From 08edb92461779e7b2ba666b1bd2c14e27969f848 Mon Sep 17 00:00:00 2001
From: binlins <981921742@qq.com>
Date: Fri, 7 May 2021 12:56:58 +0000
Subject: [PATCH 5/5] add flt_data doc

---
 qlib/data/dataset/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index a8b10a258..2173d87ae 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -272,6 +272,11 @@ class TSDataSampler:
                 ffill with previous sample
             ffill+bfill:
                 ffill with previous samples first and fill with later samples second
+        flt_data : pd.Series
+            A column of boolean data (True or False) used to filter the data.
+            None:
+                keep all data
+
         """
         self.start = start
         self.end = end