Merge branch 'main' of github.com:microsoft/qlib into fshare

2026-07-02 02:21:18 +08:00 · 2021-03-11 12:54:04 +00:00
parent b99de068f8 e8beaa5257
commit e626264d5a
8 changed files with 41 additions and 13 deletions
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -218,6 +218,25 @@ Filter
    - `cross-sectional features filter` \: rule_expression = '$rank($close)<10'
    - `time-sequence features filter`: rule_expression = '$Ref($close, 3)>100'

+Here is a simple example showing how to use filter in a basic ``Qlib`` workflow configuration file:
+
+.. code-block:: yaml
+
+    filter: &filter
+        filter_type: ExpressionDFilter
+        rule_expression: "Ref($close, -2) / Ref($close, -1) > 1"
+        filter_start_time: 2010-01-01
+        filter_end_time: 2010-01-07
+        keep: False
+
+    data_handler_config: &data_handler_config
+        start_time: 2010-01-01
+        end_time: 2021-01-22
+        fit_start_time: 2010-01-01
+        fit_end_time: 2015-12-31
+        instruments: *market
+        filter_pipe: [*filter]
+
 To know more about ``Filter``, please refer to `Filter API <../reference/api.html#module-qlib.data.filter>`_.

 Reference
--- a/qlib/contrib/model/pytorch_alstm_ts.py
+++ b/qlib/contrib/model/pytorch_alstm_ts.py
@@ -213,8 +213,12 @@ class ALSTM(Model):
        dl_train.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
        dl_valid.config(fillna_type="ffill+bfill")  # process nan brought by dataloader

-        train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
-        valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
+        train_loader = DataLoader(
+            dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True
+        )
+        valid_loader = DataLoader(
+            dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True
+        )

        save_path = get_or_create_path(save_path)

--- a/qlib/contrib/model/pytorch_gats_ts.py
+++ b/qlib/contrib/model/pytorch_gats_ts.py
@@ -261,8 +261,8 @@ class GATs(Model):
        sampler_train = DailyBatchSampler(dl_train)
        sampler_valid = DailyBatchSampler(dl_valid)

-        train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs)
-        valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs)
+        train_loader = DataLoader(dl_train, sampler=sampler_train, num_workers=self.n_jobs, drop_last=True)
+        valid_loader = DataLoader(dl_valid, sampler=sampler_valid, num_workers=self.n_jobs, drop_last=True)

        save_path = get_or_create_path(save_path)

--- a/qlib/contrib/model/pytorch_gru_ts.py
+++ b/qlib/contrib/model/pytorch_gru_ts.py
@@ -213,8 +213,12 @@ class GRU(Model):
        dl_train.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
        dl_valid.config(fillna_type="ffill+bfill")  # process nan brought by dataloader

-        train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
-        valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
+        train_loader = DataLoader(
+            dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True
+        )
+        valid_loader = DataLoader(
+            dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True
+        )

        save_path = get_or_create_path(save_path)

--- a/qlib/contrib/model/pytorch_lstm_ts.py
+++ b/qlib/contrib/model/pytorch_lstm_ts.py
@@ -209,8 +209,12 @@ class LSTM(Model):
        dl_train.config(fillna_type="ffill+bfill")  # process nan brought by dataloader
        dl_valid.config(fillna_type="ffill+bfill")  # process nan brought by dataloader

-        train_loader = DataLoader(dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs)
-        valid_loader = DataLoader(dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs)
+        train_loader = DataLoader(
+            dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True
+        )
+        valid_loader = DataLoader(
+            dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True
+        )

        save_path = get_or_create_path(save_path)

--- a/qlib/data/dataset/init.py
+++ b/qlib/data/dataset/init.py
@@ -413,7 +413,7 @@ class TSDataSampler:
        # 1) for better performance, use the last nan line for padding the lost date
        # 2) In case of precision problems. We use np.float64. # TODO: I'm not sure if whether np.float64 will result in
        # precision problems. It will not cause any problems in my tests at least
-        indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(np.int)
+        indices = np.nan_to_num(indices.astype(np.float64), nan=self.nan_idx).astype(int)

        data = self.data_arr[indices]
        if isinstance(idx, mtit):
--- a/qlib/data/ops.py
+++ b/qlib/data/ops.py
@@ -74,7 +74,6 @@ class NpElemOperator(ElemOperator):
    """

    def __init__(self, feature, func):
-        self.feature = feature
        self.func = func
        super(NpElemOperator, self).__init__(feature)

@@ -289,8 +288,6 @@ class NpPairOperator(PairOperator):
    """

    def __init__(self, feature_left, feature_right, func):
-        self.feature_left = feature_left
-        self.feature_right = feature_right
        self.func = func
        super(NpPairOperator, self).__init__(feature_left, feature_right)

--- a/qlib/utils/init.py
+++ b/qlib/utils/init.py
@@ -64,7 +64,7 @@ def np_ffill(arr: np.array):
    arr : np.array
        Input numpy 1D array
    """
-    mask = np.isnan(arr.astype(np.float))  # np.isnan only works on np.float
+    mask = np.isnan(arr.astype(float))  # np.isnan only works on np.float
    # get fill index
    idx = np.where(~mask, np.arange(mask.shape[0]), 0)
    np.maximum.accumulate(idx, out=idx)