From 756bd0f65b66e27847884b9e3382d081e5c750b1 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 30 Dec 2022 20:42:37 +0800 Subject: [PATCH] Fix ZScoreNorm processor bug (#1398) * fix_ZScoreNorm_bug * fix_CI_error * fix_CI_error * add_test_processor * fix_pylint_error * fix_some_error_and_optimize_code * modify_terrible_code * optimize_code * optimize_code --- .github/workflows/test_qlib_from_source.yml | 5 +- qlib/data/dataset/processor.py | 36 +++++----- setup.py | 9 ++- tests/test_processor.py | 75 +++++++++++++++++++++ 4 files changed, 105 insertions(+), 20 deletions(-) create mode 100644 tests/test_processor.py diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index aa8bf63e2..d3894f230 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -140,10 +140,7 @@ jobs: - name: Test workflow by config (install from source) run: | - # Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy. - python -m pip install numba==0.52.0 - # You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail. 
- python -m pip install --upgrade numpy + python -m pip install numba python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml - name: Unit tests with Pytest diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index a6445052f..cf4845af8 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -211,16 +211,19 @@ class MinMaxNorm(Processor): self.min_val = np.nanmin(df[cols].values, axis=0) self.max_val = np.nanmax(df[cols].values, axis=0) self.ignore = self.min_val == self.max_val + # To improve the speed, we set the value of `min_val` to `0` for the columns that do not need to be processed, + # and the value of `max_val` to `1`, when using `(x - min_val) / (max_val - min_val)` for uniform calculation, + # the columns that do not need to be processed will be calculated by `(x - 0) / (1 - 0)`, + # as you can see, the columns that do not need to be processed, will not be affected. + for _i, _con in enumerate(self.ignore): + if _con: + self.min_val[_i] = 0 + self.max_val[_i] = 1 self.cols = cols def __call__(self, df): - def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore): - if (~ignore).all(): - return (x - min_val) / (max_val - min_val) - for i in range(ignore.size): - if not ignore[i]: - x[i] = (x[i] - min_val) / (max_val - min_val) - return x + def normalize(x, min_val=self.min_val, max_val=self.max_val): + return (x - min_val) / (max_val - min_val) df.loc(axis=1)[self.cols] = normalize(df[self.cols].values) return df @@ -242,16 +245,19 @@ class ZScoreNorm(Processor): self.mean_train = np.nanmean(df[cols].values, axis=0) self.std_train = np.nanstd(df[cols].values, axis=0) self.ignore = self.std_train == 0 + # To improve the speed, we set the value of `std_train` to `1` for the columns that do not need to be processed, + # and the value of `mean_train` to `0`, when using `(x - mean_train) / std_train` for uniform calculation, + # the columns that 
do not need to be processed will be calculated by `(x - 0) / 1`, + # as you can see, the columns that do not need to be processed, will not be affected. + for _i, _con in enumerate(self.ignore): + if _con: + self.std_train[_i] = 1 + self.mean_train[_i] = 0 self.cols = cols def __call__(self, df): - def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore): - if (~ignore).all(): - return (x - mean_train) / std_train - for i in range(ignore.size): - if not ignore[i]: - x[i] = (x[i] - mean_train) / std_train - return x + def normalize(x, mean_train=self.mean_train, std_train=self.std_train): + return (x - mean_train) / std_train df.loc(axis=1)[self.cols] = normalize(df[self.cols].values) return df @@ -361,7 +367,7 @@ class CSZFillna(Processor): def __call__(self, df): cols = get_group_columns(df, self.fields_group) - df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean())) + df[cols] = df[cols].groupby("datetime", group_keys=False).apply(lambda x: x.fillna(x.mean())) return df diff --git a/setup.py b/setup.py index 9ff13d4a5..0cb6ac775 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,14 @@ setup( "baostock", "yahooquery", "beautifulsoup4", - "tianshou", + # In version 0.4.11 of tianshou, the code: + # logits, hidden = self.actor(batch.obs, state=state, info=batch.info) + # was changed in PR787, + # which causes pytest errors(AttributeError: 'dict' object has no attribute 'info') in CI, + # so we restricted the version of tianshou. + # References: + # https://github.com/thu-ml/tianshou/releases + "tianshou<=0.4.10", "gym>=0.24", # If you do not put gym at the end, gym will degrade causing pytest results to fail. ], "rl": [ diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 000000000..46453b316 --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,75 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+import unittest
+import numpy as np
+from qlib.data import D
+from qlib.tests import TestAutoData
+from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
+
+
+class TestProcessor(TestAutoData):
+    """Check dataset processors against reference results computed inside each test."""
+
+    TEST_INST = "SH600519"
+
+    def test_MinMaxNorm(self):
+        def normalize(df):
+            # Reference min-max scaling; constant columns get min=0 / max=1, i.e. (x - 0) / (1 - 0).
+            min_val = np.nanmin(df.values, axis=0)
+            max_val = np.nanmax(df.values, axis=0)
+            ignore = min_val == max_val
+            min_val[ignore] = 0
+            max_val[ignore] = 1
+            df.loc(axis=1)[df.columns] = (df.values - min_val) / (max_val - min_val)
+            return df
+
+        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
+        origin_df["test"] = 0  # constant column, so min == max for it
+        df = origin_df.copy()
+        mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
+        mmn.fit(df)
+        mmn(df)
+        origin_df = normalize(origin_df)
+        assert (df == origin_df).all().all()
+
+    def test_ZScoreNorm(self):
+        def normalize(df):
+            # Reference z-score; zero-std columns get mean=0 / std=1, i.e. (x - 0) / 1.
+            mean_train = np.nanmean(df.values, axis=0)
+            std_train = np.nanstd(df.values, axis=0)
+            ignore = std_train == 0
+            std_train[ignore] = 1
+            mean_train[ignore] = 0
+            df.loc(axis=1)[df.columns] = (df.values - mean_train) / std_train
+            return df
+
+        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
+        origin_df["test"] = 0  # constant column, so its std is 0
+        df = origin_df.copy()
+        zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
+        zsn.fit(df)
+        zsn(df)
+        origin_df = normalize(origin_df)
+        assert (df == origin_df).all().all()
+
+    def test_CSZFillna(self):
+        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
+        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[97:99])[228:238]
+        df = origin_df.copy()
+        CSZFillna(fields_group=None)(df)
+        assert not df[1:2].isna().all().all() and origin_df[1:2].isna().all().all()  # `not`: `~` is bitwise
+
+    def test_CSZScoreNorm(self):
+        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
+        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[10:12])[50:60]
+        df = origin_df.copy()
+        CSZScoreNorm(fields_group=None)(df)
+        # Applying the formula to the whole frame would not give the correct result, because the
+        # data is processed by `groupby`; so we slice out the 2nd group of the original data
+        # (rows 2:4) and compare the processed rows with the formula applied to that slice only.
+        assert (df[2:4] == ((origin_df[2:4] - origin_df[2:4].mean()).div(origin_df[2:4].std()))).all().all()
+
+
+if __name__ == "__main__":
+    unittest.main()