fast fillna (#1074)

* fast fillna
* fix TSDataSampler bug
2026-07-21 11:17:34 +08:00 · 2022-04-24 23:24:32 +08:00
parent caea495f40
commit cd5e5d5235
2 changed files with 8 additions and 2 deletions
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -350,7 +350,7 @@ class TSDataSampler:
            flt_data = flt_data.reindex(self.data_index).fillna(False).astype(np.bool)
            self.flt_data = flt_data.values
            self.idx_map = self.flt_idx_map(self.flt_data, self.idx_map)
-            self.data_index = self.data_index[np.where(self.flt_data is True)[0]]
+            self.data_index = self.data_index[np.where(self.flt_data)[0]]
        self.idx_map = self.idx_map2arr(self.idx_map)

        self.start_idx, self.end_idx = self.data_index.slice_locs(
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -187,7 +187,13 @@ class Fillna(Processor):
            df.fillna(self.fill_value, inplace=True)
        else:
            cols = get_group_columns(df, self.fields_group)
-            df.fillna({col: self.fill_value for col in cols}, inplace=True)
+            # this implementation is extremely slow
+            # df.fillna({col: self.fill_value for col in cols}, inplace=True)
+
+            # So we use numpy to accelerate filling values
+            nan_select = np.isnan(df.values)
+            nan_select[:, ~df.columns.isin(cols)] = False
+            df.values[nan_select] = self.fill_value
        return df