From 44a7dc004d03d4e4a84e3e912c83a803711ea9b5 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 12 Mar 2021 07:50:17 +0000 Subject: [PATCH 1/3] update docs and fix duplicated pred bug --- qlib/data/dataset/handler.py | 7 +++++++ qlib/workflow/task/collect.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index 2889c4465..25d02fdf6 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -50,6 +50,9 @@ class DataHandler(Serializable): SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042 SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289 + + Tips for improving the performance of datahandler + - Fetching data with `col_set=CS_RAW` will return the raw data and may avoid pandas from copying the data when calling `loc` """ def __init__( @@ -257,6 +260,10 @@ class DataHandler(Serializable): class DataHandlerLP(DataHandler): """ DataHandler with **(L)earnable (P)rocessor** + + Tips to improving the performance of data handler + - To reduce the memory cost + - `drop_raw=True`: this will modify the data inplace on raw data; """ # data key diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py index 7cdca30fa..6c4e45c72 100644 --- a/qlib/workflow/task/collect.py +++ b/qlib/workflow/task/collect.py @@ -46,7 +46,12 @@ class RollingEnsemble: pred_l = [] for rec in rec_l: pred_l.append(rec.load_object("pred.pkl").iloc[:, 0]) - pred = pd.concat(pred_l).sort_index() + # Make sure the pred are sorted according to the rolling start time + pred_l.sort(key=lambda pred: pred.index.get_level_values("datetime").min()) + pred = pd.concat(pred_l) + # If there are duplicated predition, we use the latest perdiction + pred = pred[~pred.index.duplicated(keep="last")] + pred = pred.sort_index() reduce_group[k] = pred return reduce_group From e4e8a4abcdb72a2e27d4bc929996795b768381a2 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 12 Mar 2021 10:17:16 +0000 Subject: [PATCH 2/3] fix task name & add cur_path --- qlib/__init__.py | 7 ++++--- qlib/model/trainer.py | 2 +- qlib/workflow/task/collect.py | 5 +++-- qlib/workflow/task/update.py | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 816e5a585..4fd48f8c1 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -154,7 +154,7 @@ def init_from_yaml_conf(conf_path, **kwargs): init(default_conf, **config) -def get_project_path(config_name="config.yaml") -> Path: +def get_project_path(config_name="config.yaml", cur_path=None) -> Path: """ If users are building a project follow the following pattern. - Qlib is a sub folder in project path @@ -181,7 +181,8 @@ def get_project_path(config_name="config.yaml") -> Path: FileNotFoundError: If project path is not found """ - cur_path = Path(__file__).absolute().resolve() + if cur_path is None: + cur_path = Path(__file__).absolute().resolve() while True: if (cur_path / config_name).exists(): return cur_path @@ -199,7 +200,7 @@ def auto_init(**kwargs): """ try: - pp = get_project_path() + pp = get_project_path(cur_path=kwargs.pop("cur_path", None)) except FileNotFoundError: init(**kwargs) else: diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 5e62a141c..a4df92218 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -34,7 +34,7 @@ def task_train(task_config: dict, experiment_name: str) -> str: model.fit(dataset) recorder = R.get_recorder() R.save_objects(**{"params.pkl": model}) - R.save_objects(**{"task.pkl": task_config}) # keep the original format and datatype + R.save_objects(task=task_config) # keep the original format and datatype # generate records: prediction, backtest, and analysis records = task_config.get("record", []) diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py index b16312ff7..b4a584494 100644 --- a/qlib/workflow/task/collect.py +++ b/qlib/workflow/task/collect.py @@ -1,5 +1,6 @@ from qlib.workflow import R import pandas as pd +import tqdm.auto import tqdm from typing import Union from qlib import get_module_logger @@ -35,7 +36,7 @@ class TaskCollector: recs_flt = {} for rid, rec in recs.items(): - params = rec.load_object("task.pkl") + params = rec.load_object("task") if rec.status == rec.STATUS_FI: if filter_func is None or filter_func(params): rec.params = params @@ -83,7 +84,7 @@ class RollingCollector: recs_flt = {} for rid, rec in tqdm(recs.items(), desc="Loading data"): - params = rec.load_object("task.pkl") + params = rec.load_object("task") if rec.status == rec.STATUS_FI: if self.flt_func is None or self.flt_func(params): rec.params = params diff --git a/qlib/workflow/task/update.py b/qlib/workflow/task/update.py index f9d03efbc..628225a20 100644 --- a/qlib/workflow/task/update.py +++ b/qlib/workflow/task/update.py @@ -74,7 +74,7 @@ class ModelUpdater: rec = self.exp.get_recorder(recorder_id=rid) old_pred = rec.load_object("pred.pkl") last_end = old_pred.index.get_level_values("datetime").max() - task_config = rec.load_object("task.pkl") + task_config = rec.load_object("task") # updated to the latest trading day cal = D.calendar(start_time=last_end + pd.Timedelta(days=1), end_time=None) @@ -107,7 +107,7 @@ class ModelUpdater: .. code-block:: python def record_filter(record): - task_config = record.load_object("task.pkl") + task_config = record.load_object("task") if task_config["model"]["class"]=="LGBModel": return True return False From 8362780e22c2e576f78c8acf30f53dc0ee723e5c Mon Sep 17 00:00:00 2001 From: Young Date: Sun, 14 Mar 2021 15:16:38 +0000 Subject: [PATCH 3/3] fix import bug --- qlib/workflow/task/collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py index b4a584494..552456e62 100644 --- a/qlib/workflow/task/collect.py +++ b/qlib/workflow/task/collect.py @@ -1,6 +1,6 @@ from qlib.workflow import R import pandas as pd -import tqdm.auto import tqdm +from tqdm.auto import tqdm from typing import Union from qlib import get_module_logger