1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Fix ZScoreNorm processor bug (#1398)

* fix_ZScoreNorm_bug

* fix_CI_error

* fix_CI_error

* add_test_processor

* fix_pylint_error

* fix_some_error_and_optimize_code

* modify_terrible_code

* optimize_code

* optimize_code
This commit is contained in:
Linlang
2022-12-30 20:42:37 +08:00
committed by GitHub
parent 667fb0e4d9
commit 756bd0f65b
4 changed files with 105 additions and 20 deletions

View File

@@ -140,10 +140,7 @@ jobs:
- name: Test workflow by config (install from source)
run: |
# Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy.
python -m pip install numba==0.52.0
# You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail.
python -m pip install --upgrade numpy
python -m pip install numba
python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
- name: Unit tests with Pytest

View File

@@ -211,16 +211,19 @@ class MinMaxNorm(Processor):
self.min_val = np.nanmin(df[cols].values, axis=0)
self.max_val = np.nanmax(df[cols].values, axis=0)
self.ignore = self.min_val == self.max_val
# For speed, constant columns (where `min_val == max_val`) are not special-cased at call time.
# Instead, their `min_val` is set to `0` and their `max_val` to `1`, so that the uniform formula
# `(x - min_val) / (max_val - min_val)` reduces to `(x - 0) / (1 - 0)` for those columns
# and leaves their values unchanged.
for _i, _con in enumerate(self.ignore):
if _con:
self.min_val[_i] = 0
self.max_val[_i] = 1
self.cols = cols
def __call__(self, df):
def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore):
if (~ignore).all():
return (x - min_val) / (max_val - min_val)
for i in range(ignore.size):
if not ignore[i]:
x[i] = (x[i] - min_val) / (max_val - min_val)
return x
def normalize(x, min_val=self.min_val, max_val=self.max_val):
return (x - min_val) / (max_val - min_val)
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
return df
@@ -242,16 +245,19 @@ class ZScoreNorm(Processor):
self.mean_train = np.nanmean(df[cols].values, axis=0)
self.std_train = np.nanstd(df[cols].values, axis=0)
self.ignore = self.std_train == 0
# For speed, zero-deviation columns (where `std_train == 0`) are not special-cased at call time.
# Instead, their `std_train` is set to `1` and their `mean_train` to `0`, so that the uniform formula
# `(x - mean_train) / std_train` reduces to `(x - 0) / 1` for those columns
# and leaves their values unchanged.
for _i, _con in enumerate(self.ignore):
if _con:
self.std_train[_i] = 1
self.mean_train[_i] = 0
self.cols = cols
def __call__(self, df):
def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore):
if (~ignore).all():
return (x - mean_train) / std_train
for i in range(ignore.size):
if not ignore[i]:
x[i] = (x[i] - mean_train) / std_train
return x
def normalize(x, mean_train=self.mean_train, std_train=self.std_train):
return (x - mean_train) / std_train
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
return df
@@ -361,7 +367,7 @@ class CSZFillna(Processor):
def __call__(self, df):
cols = get_group_columns(df, self.fields_group)
df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean()))
df[cols] = df[cols].groupby("datetime", group_keys=False).apply(lambda x: x.fillna(x.mean()))
return df

View File

@@ -156,7 +156,14 @@ setup(
"baostock",
"yahooquery",
"beautifulsoup4",
"tianshou",
# In version 0.4.11 of tianshou, the code:
# logits, hidden = self.actor(batch.obs, state=state, info=batch.info)
# was changed in PR787.
# That change causes pytest errors in CI (AttributeError: 'dict' object has no attribute 'info'),
# so the version of tianshou is restricted here.
# Reference:
# https://github.com/thu-ml/tianshou/releases
"tianshou<=0.4.10",
"gym>=0.24", # If you do not put gym at the end, gym will degrade causing pytest results to fail.
],
"rl": [

75
tests/test_processor.py Normal file
View File

@@ -0,0 +1,75 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import unittest
import numpy as np
from qlib.data import D
from qlib.tests import TestAutoData
from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
class TestProcessor(TestAutoData):
    """Tests for the ``MinMaxNorm``, ``ZScoreNorm``, ``CSZFillna`` and ``CSZScoreNorm`` processors.

    Each test loads data through ``D.features``, runs a processor on a copy,
    and compares the outcome with a reference computed inside the test.
    """

    TEST_INST = "SH600519"

    def test_MinMaxNorm(self):
        """``MinMaxNorm`` must match a hand-written min-max scaling, including a constant column."""

        def normalize(df):
            # Reference implementation. Constant columns (min == max) get
            # min=0 / max=1 so the shared formula leaves them unchanged
            # instead of dividing by zero.
            min_val = np.nanmin(df.values, axis=0)
            max_val = np.nanmax(df.values, axis=0)
            ignore = min_val == max_val
            max_val[ignore] = 1
            min_val[ignore] = 0
            df.loc(axis=1)[df.columns] = (df.values - min_val) / (max_val - min_val)
            return df

        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
        # A constant column exercises the "ignore" path (min == max).
        origin_df["test"] = 0
        df = origin_df.copy()
        mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
        mmn.fit(df)
        mmn(df)
        origin_df = normalize(origin_df)
        assert (df == origin_df).all().all()

    def test_ZScoreNorm(self):
        """``ZScoreNorm`` must match a hand-written z-score, including a constant column."""

        def normalize(df):
            # Reference implementation. Zero-deviation columns get
            # std=1 / mean=0 so the shared formula leaves them unchanged
            # instead of dividing by zero.
            mean_train = np.nanmean(df.values, axis=0)
            std_train = np.nanstd(df.values, axis=0)
            ignore = std_train == 0
            std_train[ignore] = 1
            mean_train[ignore] = 0
            df.loc(axis=1)[df.columns] = (df.values - mean_train) / std_train
            return df

        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
        # A constant column exercises the "ignore" path (std == 0).
        origin_df["test"] = 0
        df = origin_df.copy()
        zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
        zsn.fit(df)
        zsn(df)
        origin_df = normalize(origin_df)
        assert (df == origin_df).all().all()

    def test_CSZFillna(self):
        """``CSZFillna`` must fill a row that is entirely NaN in the input."""
        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
        # Keep rows 97:99 of every datetime group, then rows 228:238 of the result.
        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[97:99])[228:238]
        df = origin_df.copy()
        CSZFillna(fields_group=None)(df)
        # Precondition: the row at position 1 is entirely NaN before filling.
        assert origin_df[1:2].isna().all().all()
        # `not` instead of `~`: bitwise inversion of a plain Python bool
        # yields -1 / -2, both truthy, so `~` could never fail the check.
        assert not df[1:2].isna().all().all()

    def test_CSZScoreNorm(self):
        """``CSZScoreNorm`` must match a z-score computed on a single slice of the data."""
        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
        # Keep rows 10:12 of every datetime group, then rows 50:60 of the result.
        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[10:12])[50:60]
        df = origin_df.copy()
        CSZScoreNorm(fields_group=None)(df)
        # Applying the formula to the whole frame would not reproduce the
        # processor's output, because the data is processed per `groupby` group.
        # The 2nd group (rows 2:4) is therefore sliced out of the original data
        # and the formula is applied to that slice only.
        expected = (origin_df[2:4] - origin_df[2:4].mean()).div(origin_df[2:4].std())
        assert (df[2:4] == expected).all().all()
# Allow this test module to be executed directly as a script.
if __name__ == "__main__":
    unittest.main()