mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
Fix ZScoreNorm processor bug (#1398)
* fix_ZScoreNorm_bug * fix_CI_error * fix_CI_error * add_test_processor * fix_pylint_error * fix_some_error_and_optimize_code * modify_terrible_code * optimize_code * optimize_code
This commit is contained in:
5
.github/workflows/test_qlib_from_source.yml
vendored
5
.github/workflows/test_qlib_from_source.yml
vendored
@@ -140,10 +140,7 @@ jobs:
|
||||
|
||||
- name: Test workflow by config (install from source)
|
||||
run: |
|
||||
# Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy.
|
||||
python -m pip install numba==0.52.0
|
||||
# You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail.
|
||||
python -m pip install --upgrade numpy
|
||||
python -m pip install numba
|
||||
python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
|
||||
- name: Unit tests with Pytest
|
||||
|
||||
@@ -211,16 +211,19 @@ class MinMaxNorm(Processor):
|
||||
self.min_val = np.nanmin(df[cols].values, axis=0)
|
||||
self.max_val = np.nanmax(df[cols].values, axis=0)
|
||||
self.ignore = self.min_val == self.max_val
|
||||
# To improve the speed, we set the value of `min_val` to `0` for the columns that do not need to be processed,
|
||||
# and the value of `max_val` to `1`, when using `(x - min_val) / (max_val - min_val)` for uniform calculation,
|
||||
# the columns that do not need to be processed will be calculated by `(x - 0) / (1 - 0)`,
|
||||
# as you can see, the columns that do not need to be processed, will not be affected.
|
||||
for _i, _con in enumerate(self.ignore):
|
||||
if _con:
|
||||
self.min_val[_i] = 0
|
||||
self.max_val[_i] = 1
|
||||
self.cols = cols
|
||||
|
||||
def __call__(self, df):
|
||||
def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore):
|
||||
if (~ignore).all():
|
||||
return (x - min_val) / (max_val - min_val)
|
||||
for i in range(ignore.size):
|
||||
if not ignore[i]:
|
||||
x[i] = (x[i] - min_val) / (max_val - min_val)
|
||||
return x
|
||||
def normalize(x, min_val=self.min_val, max_val=self.max_val):
|
||||
return (x - min_val) / (max_val - min_val)
|
||||
|
||||
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
|
||||
return df
|
||||
@@ -242,16 +245,19 @@ class ZScoreNorm(Processor):
|
||||
self.mean_train = np.nanmean(df[cols].values, axis=0)
|
||||
self.std_train = np.nanstd(df[cols].values, axis=0)
|
||||
self.ignore = self.std_train == 0
|
||||
# To improve the speed, we set the value of `std_train` to `1` for the columns that do not need to be processed,
|
||||
# and the value of `mean_train` to `0`, when using `(x - mean_train) / std_train` for uniform calculation,
|
||||
# the columns that do not need to be processed will be calculated by `(x - 0) / 1`,
|
||||
# as you can see, the columns that do not need to be processed, will not be affected.
|
||||
for _i, _con in enumerate(self.ignore):
|
||||
if _con:
|
||||
self.std_train[_i] = 1
|
||||
self.mean_train[_i] = 0
|
||||
self.cols = cols
|
||||
|
||||
def __call__(self, df):
|
||||
def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore):
|
||||
if (~ignore).all():
|
||||
return (x - mean_train) / std_train
|
||||
for i in range(ignore.size):
|
||||
if not ignore[i]:
|
||||
x[i] = (x[i] - mean_train) / std_train
|
||||
return x
|
||||
def normalize(x, mean_train=self.mean_train, std_train=self.std_train):
|
||||
return (x - mean_train) / std_train
|
||||
|
||||
df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
|
||||
return df
|
||||
@@ -361,7 +367,7 @@ class CSZFillna(Processor):
|
||||
|
||||
def __call__(self, df):
|
||||
cols = get_group_columns(df, self.fields_group)
|
||||
df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean()))
|
||||
df[cols] = df[cols].groupby("datetime", group_keys=False).apply(lambda x: x.fillna(x.mean()))
|
||||
return df
|
||||
|
||||
|
||||
|
||||
9
setup.py
9
setup.py
@@ -156,7 +156,14 @@ setup(
|
||||
"baostock",
|
||||
"yahooquery",
|
||||
"beautifulsoup4",
|
||||
"tianshou",
|
||||
# In version 0.4.11 of tianshou, the code:
|
||||
# logits, hidden = self.actor(batch.obs, state=state, info=batch.info)
|
||||
# was changed in PR787,
|
||||
# which causes pytest errors(AttributeError: 'dict' object has no attribute 'info') in CI,
|
||||
# so we restricted the version of tianshou.
|
||||
# References:
|
||||
# https://github.com/thu-ml/tianshou/releases
|
||||
"tianshou<=0.4.10",
|
||||
"gym>=0.24", # If you do not put gym at the end, gym will degrade causing pytest results to fail.
|
||||
],
|
||||
"rl": [
|
||||
|
||||
75
tests/test_processor.py
Normal file
75
tests/test_processor.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
from qlib.data import D
|
||||
from qlib.tests import TestAutoData
|
||||
from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
|
||||
|
||||
|
||||
class TestProcessor(TestAutoData):
    """Tests for processors imported from ``qlib.data.dataset.processor``.

    Each test loads a small frame through ``D.features``, runs a processor on a
    copy of it, and compares the outcome with a reference computed in the test.
    """

    TEST_INST = "SH600519"

    def test_MinMaxNorm(self):
        """MinMaxNorm output must equal ``(x - min) / (max - min)`` per column."""

        def normalize(df):
            # Reference implementation, applied in place to ``df``.
            min_val = np.nanmin(df.values, axis=0)
            max_val = np.nanmax(df.values, axis=0)
            # Constant columns (min == max) get min=0 / max=1, so the formula
            # below reduces to (x - 0) / (1 - 0) and leaves them unchanged.
            ignore = min_val == max_val
            min_val[ignore] = 0
            max_val[ignore] = 1
            df.loc(axis=1)[df.columns] = (df.values - min_val) / (max_val - min_val)
            return df

        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
        # Extra constant column to exercise the min == max branch.
        origin_df["test"] = 0
        df = origin_df.copy()
        mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
        mmn.fit(df)
        mmn(df)
        origin_df = normalize(origin_df)
        # Compare with a tolerance instead of exact float equality.
        assert np.allclose(df.values, origin_df.values)

    def test_ZScoreNorm(self):
        """ZScoreNorm output must equal ``(x - mean) / std`` per column."""

        def normalize(df):
            # Reference implementation, applied in place to ``df``.
            mean_train = np.nanmean(df.values, axis=0)
            std_train = np.nanstd(df.values, axis=0)
            # Zero-variance columns get mean=0 / std=1, so the formula below
            # reduces to (x - 0) / 1 and leaves them unchanged.
            ignore = std_train == 0
            std_train[ignore] = 1
            mean_train[ignore] = 0
            df.loc(axis=1)[df.columns] = (df.values - mean_train) / std_train
            return df

        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
        # Extra constant column to exercise the std == 0 branch.
        origin_df["test"] = 0
        df = origin_df.copy()
        zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
        zsn.fit(df)
        zsn(df)
        origin_df = normalize(origin_df)
        # Compare with a tolerance instead of exact float equality.
        assert np.allclose(df.values, origin_df.values)

    def test_CSZFillna(self):
        """CSZFillna must put values into a row that was entirely NaN beforehand."""
        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[97:99])[228:238]
        df = origin_df.copy()
        CSZFillna(fields_group=None)(df)
        # Precondition: the second row of the untouched frame is all NaN.
        assert origin_df[1:2].isna().all().all()
        # Use logical ``not`` rather than bitwise ``~``: on a plain Python bool,
        # ``~True`` is -2 (truthy), which would make this check impossible to fail.
        assert not df[1:2].isna().all().all()

    def test_CSZScoreNorm(self):
        """CSZScoreNorm must z-score each ``datetime`` group on its own."""
        origin_df = D.features(D.instruments(market="csi300"), fields=["$high", "$open", "$low", "$close"])
        origin_df = origin_df.groupby("datetime", group_keys=False).apply(lambda x: x[10:12])[50:60]
        df = origin_df.copy()
        CSZScoreNorm(fields_group=None)(df)
        # Applying the formula to the whole frame would not reproduce the result,
        # because the processor works per ``datetime`` group. Instead, slice out
        # the second group of the original data and compare that group only.
        group = origin_df[2:4]
        expected = (group - group.mean()).div(group.std())
        # Compare with a tolerance instead of exact float equality.
        assert np.allclose(df[2:4].values, expected.values)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user