diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 6e0c0ab60..7a65b3f3c 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -350,7 +350,7 @@ class TSDataSampler: flt_data = flt_data.reindex(self.data_index).fillna(False).astype(np.bool) self.flt_data = flt_data.values self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map) - self.data_index = self.data_index[np.where(self.flt_data is True)[0]] + self.data_index = self.data_index[np.where(self.flt_data)[0]] self.idx_map = self.idx_map2arr(self.idx_map) self.start_idx, self.end_idx = self.data_index.slice_locs( diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index ff05e11d6..26ff7e09d 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -187,7 +187,13 @@ class Fillna(Processor): df.fillna(self.fill_value, inplace=True) else: cols = get_group_columns(df, self.fields_group) - df.fillna({col: self.fill_value for col in cols}, inplace=True) + # this implementation is extremely slow + # df.fillna({col: self.fill_value for col in cols}, inplace=True) + + # So we use numpy to accelerate filling values + nan_select = np.isnan(df.values) + nan_select[:, ~df.columns.isin(cols)] = False + df.values[nan_select] = self.fill_value return df