From 45c6dfc5daed1c8e2678fa9ca68c35a514748d3f Mon Sep 17 00:00:00 2001 From: blin Date: Wed, 28 Apr 2021 07:25:19 +0000 Subject: [PATCH 1/5] filter --- qlib/data/dataset/__init__.py | 44 ++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index cd15a98c9..5485796ef 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -114,6 +114,7 @@ class DatasetH(Dataset): """ self.handler: DataHandler = init_instance_by_config(handler, accept_types=DataHandler) self.segments = segments.copy() + self.fetch_kwargs = {} super().__init__(**kwargs) def config(self, handler_kwargs: dict = None, **kwargs): @@ -171,7 +172,7 @@ class DatasetH(Dataset): ---------- slc : slice """ - return self.handler.fetch(slc, **kwargs) + return self.handler.fetch(slc, **kwargs, **self.fetch_kwargs) def prepare( self, @@ -288,13 +289,29 @@ class TSDataSampler: # the data type will be changed # The index of usable data is between start_idx and end_idx - self.start_idx, self.end_idx = self.data.index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end)) self.idx_df, self.idx_map = self.build_index(self.data) - self.idx_arr = np.array(self.idx_df.values, dtype=np.float64) # for better performance - self.data_idx = deepcopy(self.data.index) + self.data_index = deepcopy(self.data.index) + if flt_data is not None: + self.flt_data = np.array(flt_data).reshape(-1) + self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map) + self.data_index = self.data_index[np.where(self.flt_data == True)[0]] + + self.start_idx, self.end_idx = self.data_index.slice_locs(start=pd.Timestamp(start), end=pd.Timestamp(end)) + self.idx_arr = np.array(self.idx_df.values, dtype=np.float64) # for better performance + del self.data # save memory + @staticmethod + def flt_idx_map(flt_data, idx_map): + idx = 0 + new_idx_map = {} + for i, exist in enumerate(flt_data): + if exist: + 
new_idx_map[idx] = idx_map[i] + idx += 1 + return new_idx_map + def get_index(self): """ Get the pandas index of the data, it will be useful in following scenarios @@ -488,8 +505,19 @@ class TSDatasetH(DatasetH): """ split the _prepare_raw_seg is to leave a hook for data preprocessing before creating processing data """ - dtype = kwargs.pop("dtype", None) + dtype = kwargs.pop("dtype") start, end = slc.start, slc.stop - data = self._prepare_raw_seg(slc=slc, **kwargs) - tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype) - return tsds + flt_col = kwargs.pop('flt_col', None) + # TSDatasetH will retrieve more data for complete + data = self._prepare_raw_seg(slc, **kwargs) + + flt_kwargs = deepcopy(kwargs) + if flt_col is not None: + flt_kwargs['col_set'] = flt_col + flt_data = self._prepare_raw_seg(slc, **flt_kwargs) + assert len(flt_data.columns) == 1 + else: + flt_data = None + + tsds = TSDataSampler(data=data, start=start, end=end, step_len=self.step_len, dtype=dtype, flt_data=flt_data) + return tsds \ No newline at end of file From fa4511cb0a82f994b094df25f226e18bb8deb543 Mon Sep 17 00:00:00 2001 From: blin Date: Wed, 28 Apr 2021 07:30:22 +0000 Subject: [PATCH 2/5] filter --- qlib/data/dataset/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 5485796ef..0bdb5018b 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -505,7 +505,7 @@ class TSDatasetH(DatasetH): """ split the _prepare_raw_seg is to leave a hook for data preprocessing before creating processing data """ - dtype = kwargs.pop("dtype") + dtype = kwargs.pop("dtype", None) start, end = slc.start, slc.stop flt_col = kwargs.pop('flt_col', None) # TSDatasetH will retrieve more data for complete From 846c64f6c6e695a7fd13e805af57c68a5de887c9 Mon Sep 17 00:00:00 2001 From: blin <981921742@qq.com> Date: Thu, 6 May 2021 12:00:41 +0000 Subject: [PATCH 
3/5] fix param --- qlib/data/dataset/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 0bdb5018b..8bcd6419a 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -244,7 +244,7 @@ class TSDataSampler: """ - def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None): + def __init__(self, data: pd.DataFrame, start, end, step_len: int, fillna_type: str = "none", dtype=None, flt_data=None): """ Build a dataset which looks like torch.data.utils.Dataset. @@ -317,7 +317,7 @@ class TSDataSampler: Get the pandas index of the data, it will be useful in following scenarios - Special sampler will be used (e.g. user want to sample day by day) """ - return self.data_idx[self.start_idx : self.end_idx] + return self.data_index[self.start_idx : self.end_idx] def config(self, **kwargs): # Config the attributes From bec65ddf94a014349a16931833ac85c7f78ebc5d Mon Sep 17 00:00:00 2001 From: binlins <981921742@qq.com> Date: Fri, 7 May 2021 11:47:47 +0000 Subject: [PATCH 4/5] add document and reindex --- qlib/data/dataset/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 8bcd6419a..a8b10a258 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -200,6 +200,12 @@ class DatasetH(Dataset): The data to fetch: DK_* Default is DK_I, which indicate fetching data for **inference**. + kwargs : + The parameters that kwargs may contain: + flt_col : str + It only exists in TSDatasetH, can be used to add a column of data(True or False) to filter data. + This parameter is only supported when it is an instance of TSDatasetH. 
+ Returns ------- Union[List[pd.DataFrame], pd.DataFrame]: @@ -293,7 +299,7 @@ class TSDataSampler: self.data_index = deepcopy(self.data.index) if flt_data is not None: - self.flt_data = np.array(flt_data).reshape(-1) + self.flt_data = np.array(flt_data.reindex(self.data_index)).reshape(-1) self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map) self.data_index = self.data_index[np.where(self.flt_data == True)[0]] From 08edb92461779e7b2ba666b1bd2c14e27969f848 Mon Sep 17 00:00:00 2001 From: binlins <981921742@qq.com> Date: Fri, 7 May 2021 12:56:58 +0000 Subject: [PATCH 5/5] add flt_data doc --- qlib/data/dataset/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index a8b10a258..2173d87ae 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -272,6 +272,11 @@ class TSDataSampler: ffill with previous sample ffill+bfill: ffill with previous samples first and fill with later samples second + flt_data : pd.Series + a column of data(True or False) to filter data. + None: + keep all data + """ self.start = start self.end = end