From 21eb86e5cb2df4df95d36b75f8ed8931c953baa3 Mon Sep 17 00:00:00 2001 From: Jactus Date: Thu, 26 Nov 2020 11:54:06 +0800 Subject: [PATCH 1/6] Update run_all_model --- examples/benchmarks/TFT/README.md | 8 ++-- examples/run_all_model.py | 67 ++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/examples/benchmarks/TFT/README.md b/examples/benchmarks/TFT/README.md index a64ca0129..e9e44db1a 100644 --- a/examples/benchmarks/TFT/README.md +++ b/examples/benchmarks/TFT/README.md @@ -5,8 +5,10 @@ **GitHub**: https://github.com/google-research/google-research/tree/master/tft ## Run the Workflow -Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. Please be **aware** that this script can only support Python 3.5 - 3.8. +Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. ### Notes -1. The model must run in GPU, or an error will be raised. -2. New datasets should be registered in ``data_formatters``, for detail please visit the source. +1. Please be **aware** that this script can only support `Python 3.5 - 3.8`, and `Cuda 10.0 or 10.1`. +2. Please remember to install `cudatoolkit==10.1` and `cudnn==7.6` on your machine. +3. The model must run in GPU, or an error will be raised. +4. New datasets should be registered in ``data_formatters``, for detail please visit the source. 
diff --git a/examples/run_all_model.py b/examples/run_all_model.py index b448a1857..6f12434da 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -10,6 +10,7 @@ import shutil import tempfile import statistics from pathlib import Path +from operator import xor from subprocess import Popen, PIPE from threading import Thread from pprint import pprint @@ -161,6 +162,19 @@ class ExtendedEnvBuilder(venv.EnvBuilder): self.install_script(context, "pip", url) +# function to check cuda version on the machine, this case is for the model TFT +def check_cuda(folders): + path = "/usr/local/cuda/version.txt" + exclude_tft = True + if os.path.exists(path): + with open(path, "w") as f: + if "10.1" in str(f.read()) or "10.0" in str(f.read()): + exclude_tft = False + if exclude_tft and "TFT" in folders: + del folders["TFT"] + return folders + + # function to calculate the mean and std of a list in the results dictionary def cal_mean_std(results) -> dict: mean_std = dict() @@ -174,11 +188,23 @@ def cal_mean_std(results) -> dict: # function to get all the folders benchmark folder -def get_all_folders() -> dict: +def get_all_folders(models, exclude) -> dict: folders = dict() + if isinstance(models, str): + model_list = models.split(",") + models = [m.lower().strip("[ ]") for m in model_list] + elif isinstance(models, list): + models = [m.lower() for m in models] + elif models is None: + models = [f.name.lower() for f in os.scandir("benchmarks")] + else: + raise ValueError("Input models type is not supported. 
Please provide str or list without space.") for f in os.scandir("benchmarks"): - path = Path("benchmarks") / f.name - folders[f.name] = str(path.resolve()) + add = xor(bool(f.name.lower() in models), bool(exclude)) + if add: + path = Path("benchmarks") / f.name + folders[f.name] = str(path.resolve()) + folders = check_cuda(folders) return folders @@ -225,13 +251,44 @@ def gen_and_save_md_table(metrics): # function to run the all the models -def run(times=1): +def run(times=1, models=None, exclude=False): """ Please be aware that this function can only work under Linux. MacOS and Windows will be supported in the future. Any PR to enhance this method is highly welcomed. + + Parameters: + ----------- + times : int + determines how many times the model should be running. + models : str or list + determines the specific model or list of models to run or exclude. + exclude : boolean + determines whether the model being used is excluded or included. + + Usage: + ------- + Here are some use cases of the function in the bash: + + .. 
code-block:: bash + + # Case 1 - run all models multiple times + python run_all_model.py 3 + + # Case 2 - run specific models multiple times + python run_all_model.py 3 dnn + + # Case 3 - run other models except those are given as arguments for multiple times + python run_all_model.py 3 [dnn,tft,lstm] True + + # Case 4 - run specific models for one time + python run_all_model.py --models=[dnn,lightgbm] + + # Case 5 - run other models except those are given as arguments for one time + python run_all_model.py --models=[dnn,tft,sfm] --exclude=True + """ # get all folders - folders = get_all_folders() + folders = get_all_folders(models, exclude) # set up compatible = True if sys.version_info < (3, 3): From d0ca52f3fdd34ea246751b94da69df588a50e7d0 Mon Sep 17 00:00:00 2001 From: Dong Zhou Date: Thu, 26 Nov 2020 12:04:48 +0800 Subject: [PATCH 2/6] add robust zscore processor & ALPHA360 support custom processors --- qlib/contrib/data/handler.py | 84 ++++++++++++++++++++-------------- qlib/data/dataset/processor.py | 38 ++++++++++++++- 2 files changed, 86 insertions(+), 36 deletions(-) diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index 8cce92907..07ef2267a 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -10,6 +10,28 @@ from inspect import getfullargspec import copy +def check_transform_proc(proc_l, fit_start_time, fit_end_time): + new_l = [] + for p in proc_l: + if not isinstance(p, Processor): + klass, pkwargs = get_cls_kwargs(p, processor_module) + args = getfullargspec(klass).args + if "fit_start_time" in args and "fit_end_time" in args: + assert ( + fit_start_time is not None and fit_end_time is not None + ), "Make sure `fit_start_time` and `fit_end_time` are not None."
+ pkwargs.update( + { + "fit_start_time": fit_start_time, + "fit_end_time": fit_end_time, + } + ) + new_l.append({"class": klass.__name__, "kwargs": pkwargs}) + else: + new_l.append(p) + return new_l + + class ALPHA360_Denoise(DataHandlerLP): def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None): data_loader = { @@ -83,8 +105,31 @@ class ALPHA360_Denoise(DataHandlerLP): return fields, names +_DEFAULT_LEARN_PROCESSORS = [ + {"class": "DropnaLabel"}, + {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, +] +_DEFAULT_INFER_PROCESSORS = [ + {"class": "ProcessInf", "kwargs": {}}, + {"class": "ZScoreNorm", "kwargs": {}}, + {"class": "Fillna", "kwargs": {}}, +] + + class ALPHA360(DataHandlerLP): - def __init__(self, instruments="csi500", start_time=None, end_time=None, fit_start_time=None, fit_end_time=None): + def __init__( + self, + instruments="csi500", + start_time=None, + end_time=None, + infer_processors=_DEFAULT_INFER_PROCESSORS, + learn_processors=_DEFAULT_LEARN_PROCESSORS, + fit_start_time=None, + fit_end_time=None, + ): + infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) + learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) + data_loader = { "class": "QlibDataLoader", "kwargs": { @@ -95,16 +140,6 @@ class ALPHA360(DataHandlerLP): }, } - learn_processors = [ - {"class": "DropnaLabel", "kwargs": {"fields_group": "label"}}, - {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, - ] - infer_processors = [ - {"class": "ProcessInf", "kwargs": {}}, - {"class": "ZscoreNorm", "kwargs": {"fit_start_time": fit_start_time, "fit_end_time": fit_end_time}}, - {"class": "Fillna", "kwargs": {}}, - ] - super().__init__( instruments, start_time, @@ -168,33 +203,12 @@ class Alpha158(DataHandlerLP): start_time=None, end_time=None, infer_processors=[], - learn_processors=["DropnaLabel", {"class": "CSZScoreNorm", "kwargs": 
{"fields_group": "label"}}], + learn_processors=_DEFAULT_LEARN_PROCESSORS, fit_start_time=None, fit_end_time=None, ): - def check_transform_proc(proc_l): - new_l = [] - for p in proc_l: - if not isinstance(p, Processor): - klass, pkwargs = get_cls_kwargs(p, processor_module) - args = getfullargspec(klass).args - if "fit_start_time" in args and "fit_end_time" in args: - assert ( - fit_start_time is not None and fit_end_time is not None - ), "Make sure `fit_start_time` and `fit_end_time` are not None." - pkwargs.update( - { - "fit_start_time": fit_start_time, - "fit_end_time": fit_end_time, - } - ) - new_l.append({"class": klass.__name__, "kwargs": pkwargs}) - else: - new_l.append(p) - return new_l - - infer_processors = check_transform_proc(infer_processors) - learn_processors = check_transform_proc(learn_processors) + infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) + learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) data_loader = { "class": "QlibDataLoader", diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index e4003a1f5..b764875ed 100755 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -166,7 +166,9 @@ class MinMaxNorm(Processor): return df -class ZscoreNorm(Processor): +class ZScoreNorm(Processor): + """ZScore Normalization""" + def __init__(self, fit_start_time, fit_end_time, fields_group=None): self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time @@ -193,6 +195,40 @@ class ZscoreNorm(Processor): return df +class RobustZScoreNorm(Processor): + """Robust ZScore Normalization + + Use robust statistics for Z-Score normalization: + mean(x) = median(x) + std(x) = MAD(x) * 1.4826 + + Reference: + https://en.wikipedia.org/wiki/Median_absolute_deviation. 
+ """ + + def __init__(self, fit_start_time, fit_end_time, fields_group=None, clip_outlier=True): + self.fit_start_time = fit_start_time + self.fit_end_time = fit_end_time + self.fields_group = fields_group + self.clip_outlier = clip_outlier + + def fit(self, df): + df = fetch_df_by_index(df, slice(self.fit_start_time, self.fit_end_time), level="datetime") + self.cols = get_group_columns(df, self.fields_group) + X = df[self.cols].values + self.mean_train = np.nanmedian(X, axis=0) + self.std_train = np.nanmedian(np.abs(X - self.mean_train), axis=0) + self.std_train += EPS + self.std_train *= 1.4826 + + def __call__(self, df): + df.loc(axis=1)[self.cols] -= self.mean_train + df.loc(axis=1)[self.cols] /= self.std_train + if self.clip_outlier: + df.clip(-3, 3, inplace=True) + return df + + class CSZScoreNorm(Processor): """Cross Sectional ZScore Normalization""" From 37cc51465cb4ad55bf45f0740dfab9a949c94efc Mon Sep 17 00:00:00 2001 From: Dong Zhou Date: Thu, 26 Nov 2020 12:25:39 +0800 Subject: [PATCH 3/6] improve perf of robust zscore processor --- qlib/data/dataset/processor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index b764875ed..e2d251aa7 100755 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -222,8 +222,10 @@ class RobustZScoreNorm(Processor): self.std_train *= 1.4826 def __call__(self, df): - df.loc(axis=1)[self.cols] -= self.mean_train - df.loc(axis=1)[self.cols] /= self.std_train + X = df[self.cols] + X -= self.mean_train + X /= self.std_train + df[self.cols] = X if self.clip_outlier: df.clip(-3, 3, inplace=True) return df From 45192413f9abb5f89c6cc05d80cd980a4cd145db Mon Sep 17 00:00:00 2001 From: Jactus Date: Thu, 26 Nov 2020 13:11:56 +0800 Subject: [PATCH 4/6] Fix --- examples/run_all_model.py | 5 +++-- qlib/workflow/cli.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/run_all_model.py 
b/examples/run_all_model.py index 6f12434da..b09750674 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -164,13 +164,14 @@ class ExtendedEnvBuilder(venv.EnvBuilder): # function to check cuda version on the machine, this case is for the model TFT def check_cuda(folders): - path = "/usr/local/cuda/version.txt" + path = "/usr/local/cuda/version.txt" # TODO: FIX ME, this will not work on other os systems. exclude_tft = True if os.path.exists(path): - with open(path, "w") as f: + with open(path, "r") as f: if "10.1" in str(f.read()) or "10.0" in str(f.read()): exclude_tft = False if exclude_tft and "TFT" in folders: + sys.stderr.write("Compatible CUDA version not found! Removing TFT from the workflow...\n") del folders["TFT"] return folders diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index 2e087877b..ecec8d3d7 100644 --- a/qlib/workflow/cli.py +++ b/qlib/workflow/cli.py @@ -27,9 +27,9 @@ def sys_config(config, config_path): Parameters ---------- config : dict - configuration of the workflow + configuration of the workflow. config_path : str - configuration of the path + configuration of the path. """ sys_config = config.get("sys", {}) From 5102566aad27ef8e3f55aa022e2216f614394357 Mon Sep 17 00:00:00 2001 From: lwwang1995 Date: Thu, 26 Nov 2020 13:35:07 +0800 Subject: [PATCH 5/6] Update GRU and LSTM model. 
--- qlib/contrib/model/pytorch_gru.py | 17 ++++------------- qlib/contrib/model/pytorch_lstm.py | 18 +++++------------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 4cc7f9852..2dd8464e2 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -28,14 +28,10 @@ class GRU(Model): Parameters ---------- - input_dim : int - input dimension - output_dim : int - output dimension - layers : tuple - layer sizes - lr : float - learning rate + d_feat : int + input dimension for each time step + metric: str + the evaluate metric used in early stop optimizer : str optimizer name GPU : str @@ -112,10 +108,6 @@ class GRU(Model): ) ) - if loss not in {"mse", "binary"}: - raise NotImplementedError("loss {} is not supported!".format(loss)) - self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - self.gru_model = GRUModel( d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout ) @@ -251,7 +243,6 @@ class GRU(Model): # train self.logger.info("training...") self._fitted = True - # return for step in range(self.n_epochs): self.logger.info("Epoch%d:", step) diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index 8b8454380..adb895247 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -28,20 +28,17 @@ class LSTM(Model): Parameters ---------- - input_dim : int - input dimension - output_dim : int - output dimension - layers : tuple - layer sizes - lr : float - learning rate + d_feat : int + input dimension for each time step + metric: str + the evaluate metric used in early stop optimizer : str optimizer name GPU : str the GPU ID(s) used for training """ + def __init__( self, d_feat=6, @@ -112,10 +109,6 @@ class LSTM(Model): ) ) - if loss not in {"mse", "binary"}: - raise NotImplementedError("loss {} is not supported!".format(loss)) 
- self._scorer = mean_squared_error if loss == "mse" else roc_auc_score - self.lstm_model = LSTMModel( d_feat=self.d_feat, hidden_size=self.hidden_size, num_layers=self.num_layers, dropout=self.dropout ) @@ -251,7 +244,6 @@ class LSTM(Model): # train self.logger.info("training...") self._fitted = True - # return for step in range(self.n_epochs): self.logger.info("Epoch%d:", step) From 2a170624216c5344c039edec9173a74370d3de86 Mon Sep 17 00:00:00 2001 From: Jactus Date: Thu, 26 Nov 2020 13:49:51 +0800 Subject: [PATCH 6/6] Update run_all_model and format --- examples/benchmarks/TFT/README.md | 4 ++-- examples/run_all_model.py | 15 --------------- qlib/contrib/model/pytorch_lstm.py | 1 - 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/examples/benchmarks/TFT/README.md b/examples/benchmarks/TFT/README.md index e9e44db1a..5a6a9f153 100644 --- a/examples/benchmarks/TFT/README.md +++ b/examples/benchmarks/TFT/README.md @@ -8,7 +8,7 @@ Users can follow the ``workflow_by_code_tft.py`` to run the benchmark. ### Notes -1. Please be **aware** that this script can only support `Python 3.5 - 3.8`, and `Cuda 10.0 or 10.1`. -2. Please remember to install `cudatoolkit==10.1` and `cudnn==7.6` on your machine. +1. Please be **aware** that this script can only support `Python 3.5 - 3.8`. +2. If the CUDA version on your machine is not 10.0, please remember to run the following commands `conda install anaconda cudatoolkit=10.0` and `conda install cudnn` on your machine. 3. The model must run in GPU, or an error will be raised. 4. New datasets should be registered in ``data_formatters``, for detail please visit the source. 
diff --git a/examples/run_all_model.py b/examples/run_all_model.py index b09750674..2f6c4299e 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -162,20 +162,6 @@ class ExtendedEnvBuilder(venv.EnvBuilder): self.install_script(context, "pip", url) -# function to check cuda version on the machine, this case is for the model TFT -def check_cuda(folders): - path = "/usr/local/cuda/version.txt" # TODO: FIX ME, this will not work on other os systems. - exclude_tft = True - if os.path.exists(path): - with open(path, "r") as f: - if "10.1" in str(f.read()) or "10.0" in str(f.read()): - exclude_tft = False - if exclude_tft and "TFT" in folders: - sys.stderr.write("Compatible CUDA version not found! Removing TFT from the workflow...\n") - del folders["TFT"] - return folders - - # function to calculate the mean and std of a list in the results dictionary def cal_mean_std(results) -> dict: mean_std = dict() @@ -205,7 +191,6 @@ def get_all_folders(models, exclude) -> dict: if add: path = Path("benchmarks") / f.name folders[f.name] = str(path.resolve()) - folders = check_cuda(folders) return folders diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index adb895247..be43d3698 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -38,7 +38,6 @@ class LSTM(Model): the GPU ID(s) used for training """ - def __init__( self, d_feat=6,