diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py
index 8cce92907..07ef2267a 100644
--- a/qlib/contrib/data/handler.py
+++ b/qlib/contrib/data/handler.py
@@ -10,6 +10,28 @@ from inspect import getfullargspec
 import copy
 
 
+def check_transform_proc(proc_l, fit_start_time, fit_end_time):
+    new_l = []
+    for p in proc_l:
+        if not isinstance(p, Processor):
+            klass, pkwargs = get_cls_kwargs(p, processor_module)
+            args = getfullargspec(klass).args
+            if "fit_start_time" in args and "fit_end_time" in args:
+                assert (
+                    fit_start_time is not None and fit_end_time is not None
+                ), "Make sure `fit_start_time` and `fit_end_time` are not None."
+                # Build a new dict: `pkwargs` may alias a shared default config.
+                pkwargs = {
+                    **pkwargs,
+                    "fit_start_time": fit_start_time,
+                    "fit_end_time": fit_end_time,
+                }
+            new_l.append({"class": klass.__name__, "kwargs": pkwargs})
+        else:
+            new_l.append(p)
+    return new_l
+
+
 class ALPHA360_Denoise(DataHandlerLP):
     def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
         data_loader = {
@@ -83,8 +105,31 @@ class ALPHA360_Denoise(DataHandlerLP):
         return fields, names
 
 
+_DEFAULT_LEARN_PROCESSORS = [
+    {"class": "DropnaLabel"},
+    {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
+]
+_DEFAULT_INFER_PROCESSORS = [
+    {"class": "ProcessInf", "kwargs": {}},
+    {"class": "ZScoreNorm", "kwargs": {}},
+    {"class": "Fillna", "kwargs": {}},
+]
+
+
 class ALPHA360(DataHandlerLP):
-    def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None):
+    def __init__(
+        self,
+        instruments="csi500",
+        start_time=None,
+        end_time=None,
+        infer_processors=_DEFAULT_INFER_PROCESSORS,
+        learn_processors=_DEFAULT_LEARN_PROCESSORS,
+        fit_start_time=None,
+        fit_end_time=None,
+    ):
+        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
+        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
+
         data_loader = {
             "class": "QlibDataLoader",
             "kwargs": {
@@ -95,16 +140,6 @@ class ALPHA360(DataHandlerLP):
             },
         }
 
-        learn_processors = [
-            {"class": "DropnaLabel", "kwargs": {"fields_group": "label"}},
-            {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
-        ]
-        infer_processors = [
-            {"class": "ProcessInf", "kwargs": {}},
-            {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}},
-            {"class": "Fillna", "kwargs": {}},
-        ]
-
         super().__init__(
             instruments,
             start_time,
@@ -168,33 +203,12 @@ class Alpha158(DataHandlerLP):
         start_time=None,
         end_time=None,
         infer_processors=[],
-        learn_processors=["DropnaLabel", {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}],
+        learn_processors=_DEFAULT_LEARN_PROCESSORS,
         fit_start_time=None,
         fit_end_time=None,
     ):
-        def check_transform_proc(proc_l):
-            new_l = []
-            for p in proc_l:
-                if not isinstance(p, Processor):
-                    klass, pkwargs = get_cls_kwargs(p, processor_module)
-                    args = getfullargspec(klass).args
-                    if "fit_start_time" in args and "fit_end_time" in args:
-                        assert (
-                            fit_start_time is not None and fit_end_time is not None
-                        ), "Make sure `fit_start_time` and `fit_end_time` are not None."
-                        pkwargs.update(
-                            {
-                                "fit_start_time": fit_start_time,
-                                "fit_end_time": fit_end_time,
-                            }
-                        )
-                    new_l.append({"class": klass.__name__, "kwargs": pkwargs})
-                else:
-                    new_l.append(p)
-            return new_l
-
-        infer_processors = check_transform_proc(infer_processors)
-        learn_processors = check_transform_proc(learn_processors)
+        infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
+        learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
 
         data_loader = {
             "class": "QlibDataLoader",
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
index e4003a1f5..b764875ed 100755
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -166,7 +166,9 @@ class MinMaxNorm(Processor):
         return df
 
 
-class ZscoreNorm(Processor):
+class ZScoreNorm(Processor):
+    """ZScore Normalization"""
+
     def __init__(self, fit_start_time, fit_end_time, fields_group=None):
         self.fit_start_time = fit_start_time
         self.fit_end_time = fit_end_time
@@ -193,6 +195,40 @@ class ZscoreNorm(Processor):
         return df
 
 
+class RobustZScoreNorm(Processor):
+    """Robust ZScore Normalization
+
+    Use robust statistics for Z-Score normalization:
+        mean(x) = median(x)
+        std(x) = MAD(x) * 1.4826
+
+    Reference:
+        https://en.wikipedia.org/wiki/Median_absolute_deviation.
+    """
+
+    def __init__(self, fit_start_time, fit_end_time, fields_group=None, clip_outlier=True):
+        self.fit_start_time = fit_start_time
+        self.fit_end_time = fit_end_time
+        self.fields_group = fields_group
+        self.clip_outlier = clip_outlier
+
+    def fit(self, df):
+        df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime")
+        self.cols = get_group_columns(df, self.fields_group)
+        X = df[self.cols].values
+        self.mean_train = np.nanmedian(X, axis=0)
+        self.std_train = np.nanmedian(np.abs(X - self.mean_train), axis=0)
+        self.std_train += EPS
+        self.std_train *= 1.4826
+
+    def __call__(self, df):
+        df.loc(axis=1)[self.cols] -= self.mean_train
+        df.loc(axis=1)[self.cols] /= self.std_train
+        if self.clip_outlier:
+            df.loc(axis=1)[self.cols] = df.loc(axis=1)[self.cols].clip(-3, 3)  # clip only the normalized columns
+        return df
+
+
 class CSZScoreNorm(Processor):
     """Cross Sectional ZScore Normalization"""
 