diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py index 2889c4465..25d02fdf6 100644 --- a/qlib/data/dataset/handler.py +++ b/qlib/data/dataset/handler.py @@ -50,6 +50,9 @@ class DataHandler(Serializable): SH600004 13.313329 11800983.0 13.313329 13.317701 0.183632 0.0042 SH600005 37.796539 12231662.0 38.258602 37.919757 0.970325 0.0289 + + Tips for improving the performance of data handler + - Fetching data with `col_set=CS_RAW` will return the raw data and may prevent pandas from copying the data when calling `loc` """ def __init__( @@ -257,6 +260,10 @@ class DataHandlerLP(DataHandler): """ DataHandler with **(L)earnable (P)rocessor** + + Tips for improving the performance of data handler + - To reduce the memory cost + - `drop_raw=True`: this will modify the data inplace on raw data; """ # data key diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py index 7cdca30fa..6c4e45c72 100644 --- a/qlib/workflow/task/collect.py +++ b/qlib/workflow/task/collect.py @@ -46,7 +46,12 @@ class RollingEnsemble: pred_l = [] for rec in rec_l: pred_l.append(rec.load_object("pred.pkl").iloc[:, 0]) - pred = pd.concat(pred_l).sort_index() + # Make sure the preds are sorted according to the rolling start time + pred_l.sort(key=lambda pred: pred.index.get_level_values("datetime").min()) + pred = pd.concat(pred_l) + # If there are duplicated predictions, we use the latest prediction + pred = pred[~pred.index.duplicated(keep="last")] + pred = pred.sort_index() reduce_group[k] = pred return reduce_group