diff --git a/docs/component/data.rst b/docs/component/data.rst index 4b0962d49..9e5d7de2f 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -218,6 +218,25 @@ Filter - `cross-sectional features filter` \: rule_expression = '$rank($close)<10' - `time-sequence features filter`: rule_expression = '$Ref($close, 3)>100' +Here is a simple example showing how to use filter in a basic ``Qlib`` workflow configuration file: + +.. code-block:: yaml + + filter: &filter + filter_type: ExpressionDFilter + rule_expression: "Ref($close, -2) / Ref($close, -1) > 1" + filter_start_time: 2010-01-01 + filter_end_time: 2010-01-07 + keep: False + + data_handler_config: &data_handler_config + start_time: 2010-01-01 + end_time: 2021-01-22 + fit_start_time: 2010-01-01 + fit_end_time: 2015-12-31 + instruments: *market + filter_pipe: [*filter] + To know more about ``Filter``, please refer to `Filter API <../reference/api.html#module-qlib.data.filter>`_. Reference diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 0272dd2e5..fe562fd1c 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -213,8 +213,12 @@ class ALSTM(Model): dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader - train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs) - valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs) + train_loader = DataLoader( + dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ) + valid_loader = DataLoader( + dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index 
7c8015437..369d1ca7f 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -261,8 +261,8 @@ class GATs(Model): sampler_train = DailyBatchSampler(dl_train) sampler_valid = DailyBatchSampler(dl_valid) - train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs) - valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs) + train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs, drop_last=True) + valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs, drop_last=True) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index da449c714..483f419ce 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -213,8 +213,12 @@ class GRU(Model): dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader - train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs) - valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs) + train_loader = DataLoader( + dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ) + valid_loader = DataLoader( + dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index 94d9a3f12..95476fedf 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -209,8 +209,12 @@ class LSTM(Model): dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader - 
train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs) - valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs) + train_loader = DataLoader( + dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ) + valid_loader = DataLoader( + dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ) save_path = get_or_create_path(save_path) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index ecbeebc95..690436ba9 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -413,7 +413,7 @@ class TSDataSampler: # 1) for better performance, use the last nan line for padding the lost date # 2) In case of precision problems. We use np.float64. # TODO: I'm not sure if whether np.float64 will result in # precision problems. It will not cause any problems in my tests at least - indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(np.int) + indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(int) data = self.data_arr[indices] if isinstance(idx, mtit): diff --git a/qlib/data/ops.py b/qlib/data/ops.py index 8bc7e1fa7..cbc101f47 100644 --- a/qlib/data/ops.py +++ b/qlib/data/ops.py @@ -74,7 +74,6 @@ class NpElemOperator(ElemOperator): """ def __init__(self, feature, func): - self.feature = feature self.func = func super(NpElemOperator, self).__init__(feature) @@ -289,8 +288,6 @@ class NpPairOperator(PairOperator): """ def __init__(self, feature_left, feature_right, func): - self.feature_left = feature_left - self.feature_right = feature_right self.func = func super(NpPairOperator, self).__init__(feature_left, feature_right) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index e5933cd9d..68d7d8f3f 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -64,7 +64,7 @@ def np_ffill(arr: np.array): 
arr : np.array Input numpy 1D array """ - mask = np.isnan(arr.astype(np.float)) # np.isnan only works on np.float + mask = np.isnan(arr.astype(float)) # np.isnan rejects object arrays; cast to float first # get fill index idx = np.where(~mask, np.arange(mask.shape[0]), 0) np.maximum.accumulate(idx, out=idx)