From 370477288dcf8fb2abbb7e7bba69cf6f841d959b Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Wed, 24 May 2023 15:49:58 +0800 Subject: [PATCH 01/15] fix_DDG-DA_workflow_bug (#1516) * 1.specify group_keys=False to avoid FutureWarning; 2.fix get train_start from dict unexpected problem; * fix black * Add comments * Add make file --------- Co-authored-by: Young --- examples/benchmarks_dynamic/DDG-DA/Makefile | 4 ++++ examples/benchmarks_dynamic/DDG-DA/workflow.py | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 examples/benchmarks_dynamic/DDG-DA/Makefile diff --git a/examples/benchmarks_dynamic/DDG-DA/Makefile b/examples/benchmarks_dynamic/DDG-DA/Makefile new file mode 100644 index 000000000..c6cf5206e --- /dev/null +++ b/examples/benchmarks_dynamic/DDG-DA/Makefile @@ -0,0 +1,4 @@ +.PHONY: clean + +clean: + -rm -r *.pkl mlruns || true diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index b69107549..f57080055 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -116,7 +116,9 @@ class DDGDA: feature_selected = feature_df.loc[:, col_selected.index] - feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std())) + feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( + lambda df: (df - df.mean()).div(df.std()) + ) feature_selected = feature_selected.fillna(0.0) df_all = { @@ -168,7 +170,8 @@ class DDGDA: # - Only the dataset part is important, in current version of meta model will integrate the rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) sim_task = rb.basic_task() - train_start = self.rb_kwargs.get("train_start", "2008-01-01") + # the train_start for training meta model does not necessarily align with final rolling + train_start = "2008-01-01" if 
self.rb_kwargs.get("train_start") is None else self.rb_kwargs.get("train_start") train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d") proxy_forecast_model_task = { From 19a0eb78bc641485cfa6d67fe8f934fbfa116b5c Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Fri, 26 May 2023 14:44:34 +0800 Subject: [PATCH 02/15] Fix TCN model input dimension mismatch (#1520) * transpose dimension 1 and 2 to match nn.Conv1d input * 1.update TCN benchmarks; 2.Emphasize updating the benchmark table; * replace specific version with main --------- Co-authored-by: lijinhui <362237642@qq.com> --- examples/benchmarks/README.md | 4 ++-- qlib/contrib/model/pytorch_tcn_ts.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index 24d3f5902..af4403bbb 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -26,7 +26,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------| -| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 | +| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0279±0.00 | 0.2181±0.01 | 0.0421±0.00 | 0.3429±0.01 | 0.0262±0.02 | 0.4133±0.25 | -0.1090±0.03 | | TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 | | Transformer(Ashish Vaswani, et al.) 
| Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 | | GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 | @@ -134,7 +134,7 @@ If you want to contribute your new models, you can follow the steps below. - `README.md`: a brief introduction to your models - `workflow_config__.yaml`: a configuration which can read by `qrun`. You are encouraged to run your model in all datasets. 3. You can integrate your model as a module [in this folder](https://github.com/microsoft/qlib/tree/main/qlib/contrib/model). -4. Please updated your results in the benchmark tables, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on 20 runs with different random seeds, if you don't have enough computational resource, you can ask for help in the PR). +4. Please update your results in the above **Benchmark Tables**, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on **20 Runs** with different random seeds. You can accomplish the above operations through the automated [script](https://github.com/microsoft/qlib/blob/main/examples/run_all_model.py#LL286C22-L286C22) provided by Qlib, and get the final result in the .md file. if you don't have enough computational resource, you can ask for help in the PR). 5. Update the info in the index page in the [news list](https://github.com/microsoft/qlib#newspaper-whats-new----sparkling_heart) and [model list](https://github.com/microsoft/qlib#quant-model-paper-zoo). Finally, you can send PR for review. 
([here is an example](https://github.com/microsoft/qlib/pull/1040)) diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index 4972a3065..bb2e5ea5b 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -168,7 +168,8 @@ class TCN(Model): self.TCN_model.train() for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) + data = torch.transpose(data, 1, 2) + feature = data[:, 0:-1, :].to(self.device) label = data[:, -1, -1].to(self.device) pred = self.TCN_model(feature.float()) @@ -187,8 +188,8 @@ class TCN(Model): losses = [] for data in data_loader: - - feature = data[:, :, 0:-1].to(self.device) + data = torch.transpose(data, 1, 2) + feature = data[:, 0:-1, :].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) From efffb2819acd056302b7e7facb2d2a78aaa64f08 Mon Sep 17 00:00:00 2001 From: yaxuan999 <96709511+yaxuan999@users.noreply.github.com> Date: Fri, 26 May 2023 18:42:58 +0800 Subject: [PATCH 03/15] added KRNN and Sandwich models and their example results based on Alpha360 (#1414) * Update README.md updated the result of KRNN and Sandwich models based on Alpha360 * Update README.md * Update README.md * Add files via upload * Update README.md * Update README.md * Update README.md * Add files via upload * Delete pytorch_krnn.py * Delete pytorch_sandwich.py * Add files via upload * Update pytorch_sandwich.py * Update pytorch_krnn.py * Update pytorch_sandwich.py * Update pytorch_krnn.py * Update README.md * Update README.md * Update requirements.txt * Update requirements.txt * Update README.md * Update README.md * Update pytorch_sandwich.py * Update link on index --------- Co-authored-by: Young --- README.md | 3 + examples/benchmarks/KRNN/README.md | 8 + examples/benchmarks/KRNN/requirements.txt | 2 + .../KRNN/workflow_config_krnn_Alpha360.yaml | 91 ++++ examples/benchmarks/README.md | 2 + examples/benchmarks/Sandwich/README.md | 8 + 
examples/benchmarks/Sandwich/requirements.txt | 2 + .../workflow_config_sandwich_Alpha360.yaml | 93 ++++ qlib/contrib/model/pytorch_krnn.py | 511 ++++++++++++++++++ qlib/contrib/model/pytorch_sandwich.py | 376 +++++++++++++ 10 files changed, 1096 insertions(+) create mode 100644 examples/benchmarks/KRNN/README.md create mode 100644 examples/benchmarks/KRNN/requirements.txt create mode 100644 examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml create mode 100644 examples/benchmarks/Sandwich/README.md create mode 100644 examples/benchmarks/Sandwich/requirements.txt create mode 100644 examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml create mode 100644 qlib/contrib/model/pytorch_krnn.py create mode 100644 qlib/contrib/model/pytorch_sandwich.py diff --git a/README.md b/README.md index cedfdc348..c09e1276e 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | +| KRNN and Sandwich models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1414/) on May 26, 2023 | | Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 | | RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. [#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)| | HIST and IGMTF models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1040) on Apr 10, 2022 | @@ -353,6 +354,8 @@ Here is a list of models built on `Qlib`. 
- [ADD based on pytorch (Hongshun Tang, et al.2020)](examples/benchmarks/ADD/) - [IGMTF based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/IGMTF/) - [HIST based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/HIST/) +- [KRNN based on pytorch](examples/benchmarks/KRNN/) +- [Sandwich based on pytorch](examples/benchmarks/Sandwich/) Your PR of new Quant models is highly welcomed. diff --git a/examples/benchmarks/KRNN/README.md b/examples/benchmarks/KRNN/README.md new file mode 100644 index 000000000..31af523e6 --- /dev/null +++ b/examples/benchmarks/KRNN/README.md @@ -0,0 +1,8 @@ +# KRNN +* Code: [https://github.com/microsoft/FOST/blob/main/fostool/model/krnn.py](https://github.com/microsoft/FOST/blob/main/fostool/model/krnn.py) + + +# Introductions about the settings/configs. +* Torch_geometric is used in the original model in FOST, but we didn't use it. +* Make sure your CUDA version matches the torch version to allow the usage of GPU, we use CUDA==10.2 and torch.__version__==1.12.1 + diff --git a/examples/benchmarks/KRNN/requirements.txt b/examples/benchmarks/KRNN/requirements.txt new file mode 100644 index 000000000..87d3b2dda --- /dev/null +++ b/examples/benchmarks/KRNN/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.23.4 +pandas==1.5.2 diff --git a/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml new file mode 100644 index 000000000..691607ad1 --- /dev/null +++ b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml @@ -0,0 +1,91 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: 
feature + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + - <MODEL> + - <DATASET> + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: KRNN + module_path: qlib.contrib.model.pytorch_krnn + kwargs: + fea_dim: 6 + cnn_dim: 8 + cnn_kernel_size: 3 + rnn_dim: 8 + rnn_dups: 2 + rnn_layers: 2 + n_epochs: 200 + lr: 0.001 + early_stop: 20 + batch_size: 2000 + metric: loss + GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: <MODEL> + dataset: <DATASET> + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config + diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index af4403bbb..41799205e 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -68,6 +68,8 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 | | IGMTF(Wentao Xu, et al.) 
| Alpha360 | 0.0480±0.00 | 0.3589±0.02 | 0.0606±0.00 | 0.4773±0.01 | 0.0946±0.02 | 1.3509±0.25 | -0.0716±0.02 | | HIST(Wentao Xu, et al.) | Alpha360 | 0.0522±0.00 | 0.3530±0.01 | 0.0667±0.00 | 0.4576±0.01 | 0.0987±0.02 | 1.3726±0.27 | -0.0681±0.01 | +| KRNN | Alpha360 | 0.0173±0.01 | 0.1210±0.06 | 0.0270±0.01 | 0.2018±0.04 | -0.0465±0.05 | -0.5415±0.62 | -0.2919±0.13 | +| Sandwich | Alpha360 | 0.0258±0.00 | 0.1924±0.04 | 0.0337±0.00 | 0.2624±0.03 | 0.0005±0.03 | 0.0001±0.33 | -0.1752±0.05 | - The selected 20 features are based on the feature importance of a lightgbm-based model. diff --git a/examples/benchmarks/Sandwich/README.md b/examples/benchmarks/Sandwich/README.md new file mode 100644 index 000000000..26f189a39 --- /dev/null +++ b/examples/benchmarks/Sandwich/README.md @@ -0,0 +1,8 @@ +# Sandwich +* Code: [https://github.com/microsoft/FOST/blob/main/fostool/model/sandwich.py](https://github.com/microsoft/FOST/blob/main/fostool/model/sandwich.py) + + +# Introductions about the settings/configs. +* Torch_geometric is used in the original model in FOST, but we didn't use it. 
+* Make sure your CUDA version matches the torch version to allow the usage of GPU, we use CUDA==10.2 and torch.__version__==1.12.1 + diff --git a/examples/benchmarks/Sandwich/requirements.txt b/examples/benchmarks/Sandwich/requirements.txt new file mode 100644 index 000000000..87d3b2dda --- /dev/null +++ b/examples/benchmarks/Sandwich/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.23.4 +pandas==1.5.2 diff --git a/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml new file mode 100644 index 000000000..717a03471 --- /dev/null +++ b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml @@ -0,0 +1,93 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + - <MODEL> + - <DATASET> + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: Sandwich + module_path: qlib.contrib.model.pytorch_sandwich + kwargs: + fea_dim: 6 + cnn_dim_1: 16 + cnn_dim_2: 16 + cnn_kernel_size: 3 + rnn_dim_1: 8 + rnn_dim_2: 8 + rnn_dups: 2 + rnn_layers: 2 + n_epochs: 200 + lr: 0.001 + early_stop: 20 + batch_size: 2000 + metric: loss + 
GPU: 0 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config + diff --git a/qlib/contrib/model/pytorch_krnn.py b/qlib/contrib/model/pytorch_krnn.py new file mode 100644 index 000000000..7c252672d --- /dev/null +++ b/qlib/contrib/model/pytorch_krnn.py @@ -0,0 +1,511 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP + +######################################################################## +######################################################################## +######################################################################## + + +class CNNEncoderBase(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size, device): + """Build a basic CNN encoder + + Parameters + ---------- + input_dim : int + The input dimension + output_dim : int + The output dimension + kernel_size : int + The size of convolutional kernels + """ + super().__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.kernel_size = kernel_size + 
self.device = device + + # set padding to ensure the same length + # it is correct only when kernel_size is odd, dilation is 1, stride is 1 + self.conv = nn.Conv1d(input_dim, output_dim, kernel_size, padding=(kernel_size - 1) // 2) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + input data + + Returns + ------- + torch.Tensor + Updated representations + """ + + # input shape: [batch_size, seq_len*input_dim] + # output shape: [batch_size, seq_len, input_dim] + x = x.view(x.shape[0], -1, self.input_dim).permute(0, 2, 1).to(self.device) + y = self.conv(x) # [batch_size, output_dim, conved_seq_len] + y = y.permute(0, 2, 1) # [batch_size, conved_seq_len, output_dim] + + return y + + +class KRNNEncoderBase(nn.Module): + def __init__(self, input_dim, output_dim, dup_num, rnn_layers, dropout, device): + """Build K parallel RNNs + + Parameters + ---------- + input_dim : int + The input dimension + output_dim : int + The output dimension + dup_num : int + The number of parallel RNNs + rnn_layers: int + The number of RNN layers + """ + super().__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.dup_num = dup_num + self.rnn_layers = rnn_layers + self.dropout = dropout + self.device = device + + self.rnn_modules = nn.ModuleList() + for _ in range(dup_num): + self.rnn_modules.append(nn.GRU(input_dim, output_dim, num_layers=self.rnn_layers, dropout=dropout)) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + Input data + n_id : torch.Tensor + Node indices + + Returns + ------- + torch.Tensor + Updated representations + """ + + # input shape: [batch_size, seq_len, input_dim] + # output shape: [batch_size, seq_len, output_dim] + # [seq_len, batch_size, input_dim] + batch_size, seq_len, input_dim = x.shape + x = x.permute(1, 0, 2).to(self.device) + + hids = [] + for rnn in self.rnn_modules: + h, _ = rnn(x) # [seq_len, batch_size, output_dim] + hids.append(h) + # [seq_len, batch_size, output_dim, 
num_dups] + hids = torch.stack(hids, dim=-1) + hids = hids.view(seq_len, batch_size, self.output_dim, self.dup_num) + hids = hids.mean(dim=3) + hids = hids.permute(1, 0, 2) + + return hids + + +class CNNKRNNEncoder(nn.Module): + def __init__( + self, cnn_input_dim, cnn_output_dim, cnn_kernel_size, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device + ): + """Build an encoder composed of CNN and KRNN + + Parameters + ---------- + cnn_input_dim : int + The input dimension of CNN + cnn_output_dim : int + The output dimension of CNN + cnn_kernel_size : int + The size of convolutional kernels + rnn_output_dim : int + The output dimension of KRNN + rnn_dup_num : int + The number of parallel duplicates for KRNN + rnn_layers : int + The number of RNN layers + """ + super().__init__() + + self.cnn_encoder = CNNEncoderBase(cnn_input_dim, cnn_output_dim, cnn_kernel_size, device) + self.krnn_encoder = KRNNEncoderBase(cnn_output_dim, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device) + + def forward(self, x): + """ + Parameters + ---------- + x : torch.Tensor + Input data + n_id : torch.Tensor + Node indices + + Returns + ------- + torch.Tensor + Updated representations + """ + cnn_out = self.cnn_encoder(x) + krnn_out = self.krnn_encoder(cnn_out) + + return krnn_out + + +class KRNNModel(nn.Module): + def __init__(self, fea_dim, cnn_dim, cnn_kernel_size, rnn_dim, rnn_dups, rnn_layers, dropout, device, **params): + """Build a KRNN model + + Parameters + ---------- + fea_dim : int + The feature dimension + cnn_dim : int + The hidden dimension of CNN + cnn_kernel_size : int + The size of convolutional kernels + rnn_dim : int + The hidden dimension of KRNN + rnn_dups : int + The number of parallel duplicates + rnn_layers: int + The number of RNN layers + """ + super().__init__() + + self.encoder = CNNKRNNEncoder( + cnn_input_dim=fea_dim, + cnn_output_dim=cnn_dim, + cnn_kernel_size=cnn_kernel_size, + rnn_output_dim=rnn_dim, + rnn_dup_num=rnn_dups, + rnn_layers=rnn_layers, 
+ dropout=dropout, + device=device, + ) + + self.out_fc = nn.Linear(rnn_dim, 1) + self.device = device + + def forward(self, x): + # x: [batch_size, node_num, seq_len, input_dim] + encode = self.encoder(x) + out = self.out_fc(encode[:, -1, :]).squeeze().to(self.device) + + return out + + +class KRNN(Model): + """KRNN Model + + Parameters + ---------- + d_feat : int + input dimension for each time step + metric: str + the evaluation metric used in early stop + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + fea_dim=6, + cnn_dim=64, + cnn_kernel_size=3, + rnn_dim=64, + rnn_dups=3, + rnn_layers=2, + dropout=0, + n_epochs=200, + lr=0.001, + metric="", + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + GPU=0, + seed=None, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("KRNN") + self.logger.info("KRNN pytorch version...") + + # set hyper-parameters. + self.fea_dim = fea_dim + self.cnn_dim = cnn_dim + self.cnn_kernel_size = cnn_kernel_size + self.rnn_dim = rnn_dim + self.rnn_dups = rnn_dups + self.rnn_layers = rnn_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.seed = seed + + self.logger.info( + "KRNN parameters setting:" + "\nfea_dim : {}" + "\ncnn_dim : {}" + "\ncnn_kernel_size : {}" + "\nrnn_dim : {}" + "\nrnn_dups : {}" + "\nrnn_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size: {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + fea_dim, + cnn_dim, + cnn_kernel_size, + rnn_dim, + rnn_dups, + rnn_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + 
early_stop, + optimizer.lower(), + loss, + GPU, + self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.krnn_model = KRNNModel( + fea_dim=self.fea_dim, + cnn_dim=self.cnn_dim, + cnn_kernel_size=self.cnn_kernel_size, + rnn_dim=self.rnn_dim, + rnn_dups=self.rnn_dups, + rnn_layers=self.rnn_layers, + dropout=self.dropout, + device=self.device, + ) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.krnn_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.krnn_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.krnn_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily batches + daily_count = df.groupby(level=0).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + self.krnn_model.train() + + indices = np.arange(len(x_train_values)) + 
np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.krnn_model(feature) + loss = self.loss_fn(pred, label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.krnn_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.krnn_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.krnn_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + 
self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.krnn_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.krnn_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + index = x_test.index + self.krnn_model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) + with torch.no_grad(): + pred = self.krnn_model(x_batch).detach().cpu().numpy() + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) diff --git a/qlib/contrib/model/pytorch_sandwich.py b/qlib/contrib/model/pytorch_sandwich.py new file mode 100644 index 000000000..4a61be5e1 --- /dev/null +++ b/qlib/contrib/model/pytorch_sandwich.py @@ -0,0 +1,376 @@ +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT License. + + +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +from typing import Text, Union +import copy +from ...utils import get_or_create_path +from ...log import get_module_logger + +import torch +import torch.nn as nn +import torch.optim as optim + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from .pytorch_krnn import CNNKRNNEncoder + + +class SandwichModel(nn.Module): + def __init__( + self, + fea_dim, + cnn_dim_1, + cnn_dim_2, + cnn_kernel_size, + rnn_dim_1, + rnn_dim_2, + rnn_dups, + rnn_layers, + dropout, + device, + **params + ): + """Build a Sandwich model + + Parameters + ---------- + fea_dim : int + The feature dimension + cnn_dim_1 : int + The hidden dimension of the first CNN + cnn_dim_2 : int + The hidden dimension of the second CNN + cnn_kernel_size : int + The size of convolutional kernels + rnn_dim_1 : int + The hidden dimension of the first KRNN + rnn_dim_2 : int + The hidden dimension of the second KRNN + rnn_dups : int + The number of parallel duplicates + rnn_layers: int + The number of RNN layers + """ + super().__init__() + + self.first_encoder = CNNKRNNEncoder( + cnn_input_dim=fea_dim, + cnn_output_dim=cnn_dim_1, + cnn_kernel_size=cnn_kernel_size, + rnn_output_dim=rnn_dim_1, + rnn_dup_num=rnn_dups, + rnn_layers=rnn_layers, + dropout=dropout, + device=device, + ) + + self.second_encoder = CNNKRNNEncoder( + cnn_input_dim=rnn_dim_1, + cnn_output_dim=cnn_dim_2, + cnn_kernel_size=cnn_kernel_size, + rnn_output_dim=rnn_dim_2, + rnn_dup_num=rnn_dups, + rnn_layers=rnn_layers, + dropout=dropout, + device=device, + ) + + self.out_fc = nn.Linear(rnn_dim_2, 1) + self.device = device + + def forward(self, x): + # x: [batch_size, node_num, seq_len, input_dim] + encode = self.first_encoder(x) + encode = self.second_encoder(encode) + out = self.out_fc(encode[:, -1, 
:]).squeeze().to(self.device) + + return out + + +class Sandwich(Model): + """Sandwich Model + + Parameters + ---------- + d_feat : int + input dimension for each time step + metric: str + the evaluation metric used in early stop + optimizer : str + optimizer name + GPU : str + the GPU ID(s) used for training + """ + + def __init__( + self, + fea_dim=6, + cnn_dim_1=64, + cnn_dim_2=32, + cnn_kernel_size=3, + rnn_dim_1=16, + rnn_dim_2=8, + rnn_dups=3, + rnn_layers=2, + dropout=0, + n_epochs=200, + lr=0.001, + metric="", + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + GPU=0, + seed=None, + **kwargs + ): + # Set logger. + self.logger = get_module_logger("Sandwich") + self.logger.info("Sandwich pytorch version...") + + # set hyper-parameters. + self.fea_dim = fea_dim + self.cnn_dim_1 = cnn_dim_1 + self.cnn_dim_2 = cnn_dim_2 + self.cnn_kernel_size = cnn_kernel_size + self.rnn_dim_1 = rnn_dim_1 + self.rnn_dim_2 = rnn_dim_2 + self.rnn_dups = rnn_dups + self.rnn_layers = rnn_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.lr = lr + self.metric = metric + self.batch_size = batch_size + self.early_stop = early_stop + self.optimizer = optimizer.lower() + self.loss = loss + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu") + self.seed = seed + + self.logger.info( + "Sandwich parameters setting:" + "\nfea_dim : {}" + "\ncnn_dim_1 : {}" + "\ncnn_dim_2 : {}" + "\ncnn_kernel_size : {}" + "\nrnn_dim_1 : {}" + "\nrnn_dim_2 : {}" + "\nrnn_dups : {}" + "\nrnn_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nlr : {}" + "\nmetric : {}" + "\nbatch_size: {}" + "\nearly_stop : {}" + "\noptimizer : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + fea_dim, + cnn_dim_1, + cnn_dim_2, + cnn_kernel_size, + rnn_dim_1, + rnn_dim_2, + rnn_dups, + rnn_layers, + dropout, + n_epochs, + lr, + metric, + batch_size, + early_stop, + optimizer.lower(), + loss, + GPU, 
+ self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.sandwich_model = SandwichModel( + fea_dim=self.fea_dim, + cnn_dim_1=self.cnn_dim_1, + cnn_dim_2=self.cnn_dim_2, + cnn_kernel_size=self.cnn_kernel_size, + rnn_dim_1=self.rnn_dim_1, + rnn_dim_2=self.rnn_dim_2, + rnn_dups=self.rnn_dups, + rnn_layers=self.rnn_layers, + dropout=self.dropout, + device=self.device, + ) + if optimizer.lower() == "adam": + self.train_optimizer = optim.Adam(self.sandwich_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.train_optimizer = optim.SGD(self.sandwich_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.sandwich_model.to(self.device) + + @property + def use_gpu(self): + return self.device != torch.device("cpu") + + def mse(self, pred, label): + loss = (pred - label) ** 2 + return torch.mean(loss) + + def loss_fn(self, pred, label): + mask = ~torch.isnan(label) + + if self.loss == "mse": + return self.mse(pred[mask], label[mask]) + + raise ValueError("unknown loss `%s`" % self.loss) + + def metric_fn(self, pred, label): + mask = torch.isfinite(label) + + if self.metric in ("", "loss"): + return -self.loss_fn(pred[mask], label[mask]) + + raise ValueError("unknown metric `%s`" % self.metric) + + def train_epoch(self, x_train, y_train): + x_train_values = x_train.values + y_train_values = np.squeeze(y_train.values) + self.sandwich_model.train() + + indices = np.arange(len(x_train_values)) + np.random.shuffle(indices) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.sandwich_model(feature) + loss = self.loss_fn(pred, 
label) + + self.train_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_value_(self.sandwich_model.parameters(), 3.0) + self.train_optimizer.step() + + def test_epoch(self, data_x, data_y): + # prepare training data + x_values = data_x.values + y_values = np.squeeze(data_y.values) + + self.sandwich_model.eval() + + scores = [] + losses = [] + + indices = np.arange(len(x_values)) + + for i in range(len(indices))[:: self.batch_size]: + if len(indices) - i < self.batch_size: + break + + feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device) + label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device) + + pred = self.sandwich_model(feature) + loss = self.loss_fn(pred, label) + losses.append(loss.item()) + + score = self.metric_fn(pred, label) + scores.append(score.item()) + + return np.mean(losses), np.mean(scores) + + def fit( + self, dataset: DatasetH, evals_result=dict(), save_path=None, + ): + df_train, df_valid, df_test = dataset.prepare( + ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L, + ) + if df_train.empty or df_valid.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + + x_train, y_train = df_train["feature"], df_train["label"] + x_valid, y_valid = df_valid["feature"], df_valid["label"] + + save_path = get_or_create_path(save_path) + stop_steps = 0 + train_loss = 0 + best_score = -np.inf + best_epoch = 0 + evals_result["train"] = [] + evals_result["valid"] = [] + + # train + self.logger.info("training...") + self.fitted = True + + for step in range(self.n_epochs): + self.logger.info("Epoch%d:", step) + self.logger.info("training...") + self.train_epoch(x_train, y_train) + self.logger.info("evaluating...") + train_loss, train_score = self.test_epoch(x_train, y_train) + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) 
+ evals_result["train"].append(train_score) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.sandwich_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break + + self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) + self.sandwich_model.load_state_dict(best_param) + torch.save(best_param, save_path) + + if self.use_gpu: + torch.cuda.empty_cache() + + def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): + if not self.fitted: + raise ValueError("model is not fitted yet!") + + x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) + index = x_test.index + self.sandwich_model.eval() + x_values = x_test.values + sample_num = x_values.shape[0] + preds = [] + + for begin in range(sample_num)[:: self.batch_size]: + if sample_num - begin < self.batch_size: + end = sample_num + else: + end = begin + self.batch_size + x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device) + with torch.no_grad(): + pred = self.sandwich_model(x_batch).detach().cpu().numpy() + preds.append(pred) + + return pd.Series(np.concatenate(preds), index=index) From 0e9ac9dce738223beaa8e8667dd8a38cbe59b962 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 31 May 2023 08:39:52 +0800 Subject: [PATCH 04/15] Fix CI (#1529) --- qlib/contrib/model/pytorch_sandwich.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/qlib/contrib/model/pytorch_sandwich.py b/qlib/contrib/model/pytorch_sandwich.py index 4a61be5e1..020c736fd 100644 --- a/qlib/contrib/model/pytorch_sandwich.py +++ b/qlib/contrib/model/pytorch_sandwich.py @@ -300,10 +300,15 @@ class Sandwich(Model): return np.mean(losses), np.mean(scores) def fit( - self, dataset: DatasetH, evals_result=dict(), save_path=None, + self, + dataset: DatasetH, + 
evals_result=dict(), + save_path=None, ): df_train, df_valid, df_test = dataset.prepare( - ["train", "valid", "test"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L, + ["train", "valid", "test"], + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, ) if df_train.empty or df_valid.empty: raise ValueError("Empty data from dataset, please check your dataset config.") From cd4ab998fbba39cff507694bc6159fb68f11b1d5 Mon Sep 17 00:00:00 2001 From: Wendi Li Date: Sat, 3 Jun 2023 08:42:24 +0800 Subject: [PATCH 05/15] Update on Dynamic Benchmark (#1539) * move config file to benchmark_dynamic & switch default sim task model to GBDT * Update benchmark_dynamic results * Change the default value of alpha of DDG-DA --- .../benchmarks_dynamic/DDG-DA/workflow.py | 6 +- examples/benchmarks_dynamic/README.md | 12 +-- .../baseline/rolling_benchmark.py | 5 +- .../workflow_config_lightgbm_Alpha158.yaml | 72 +++++++++++++++++ .../workflow_config_linear_Alpha158.yaml | 79 +++++++++++++++++++ 5 files changed, 164 insertions(+), 10 deletions(-) create mode 100644 examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml create mode 100644 examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index f57080055..fef86726d 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -34,14 +34,14 @@ class DDGDA: def __init__( self, - sim_task_model: Literal["linear", "gbdt"] = "linear", + sim_task_model: Literal["linear", "gbdt"] = "gbdt", forecast_model: Literal["linear", "gbdt"] = "linear", h_path: Optional[str] = None, test_end: Optional[str] = None, train_start: Optional[str] = None, meta_1st_train_end: Optional[str] = None, task_ext_conf: Optional[dict] = None, - alpha: float = 0.0, + alpha: float = 0.01, proxy_hd: str = "handler_proxy.pkl", ): """ @@ -215,7 +215,7 @@ 
class DDGDA: with R.start(experiment_name=self.meta_exp_name): R.log_params(**kwargs) mm = MetaModelDS( - step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=100, seed=43, alpha=self.alpha + step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha ) mm.fit(md) R.save_objects(model=mm) diff --git a/examples/benchmarks_dynamic/README.md b/examples/benchmarks_dynamic/README.md index 261fcc035..6f78fa71a 100644 --- a/examples/benchmarks_dynamic/README.md +++ b/examples/benchmarks_dynamic/README.md @@ -8,15 +8,17 @@ The table below shows the performances of different solutions on different forec Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases ```bash wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz +mkdir -p ~/.qlib/qlib_data/cn_data tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2 +rm -f qlib_bin.tar.gz ``` | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | -|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------| -| RR[Linear] |Alpha158 |0.089|0.577|0.102 |0.627 |0.093 |1.458 |-0.073 | -| DDG-DA[Linear] |Alpha158 |0.096|0.636|0.107 |0.677 |0.067 |0.996 |-0.091 | -| RR[LightGBM] |Alpha158 |0.082|0.589|0.091 |0.626 |0.077 |1.320 |-0.091 | -| DDG-DA[LightGBM] |Alpha158 |0.085|0.658|0.094 |0.686 |0.115 |1.792 |-0.068 | +|------------------|---------|------|------|---------|-----------|-------------------|-------------------|--------------| +| RR[Linear] |Alpha158 |0.0945|0.5989|0.1069 |0.6495 |0.0857 |1.3682 |-0.0986 | +| DDG-DA[Linear] |Alpha158 |0.0983|0.6157|0.1108 |0.6646 |0.0764 |1.1904 |-0.0769 | +| RR[LightGBM] |Alpha158 |0.0816|0.5887|0.0912 |0.6263 |0.0771 |1.3196 |-0.0909 | +| DDG-DA[LightGBM] |Alpha158 
|0.0878|0.6185|0.0975 |0.6524 |0.1261 |2.0096 |-0.0744 | - The label horizon of the `Alpha158` dataset is set to 20. - The rolling time intervals are set to 20 trading days. diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index d452957d4..b0c7aea4f 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -67,11 +67,12 @@ class RollingBenchmark: def basic_task(self): """For fast training rolling""" if self.model_type == "gbdt": - conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml" + conf_path = DIRNAME / "workflow_config_lightgbm_Alpha158.yaml" # dump the processed data on to disk for later loading to speed up the processing h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon) elif self.model_type == "linear": - conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml" + # We use ridge regression to stabilize the performance + conf_path = DIRNAME / "workflow_config_linear_Alpha158.yaml" h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon) else: raise AssertionError("Model type is not supported!") diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml new file mode 100644 index 000000000..2d441dea9 --- /dev/null +++ b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml @@ -0,0 +1,72 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + 
strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + model: + dataset: + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LGBModel + module_path: qlib.contrib.model.gbdt + kwargs: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml new file mode 100644 index 000000000..78ec4e612 --- /dev/null +++ b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml @@ -0,0 +1,79 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature 
+ learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + - + - + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: LinearModel + module_path: qlib.contrib.model.linear + kwargs: + estimator: ridge + alpha: 0.05 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: True + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config From 21f0b394e7d133bbf35ceed8cc2c888ad95e34ef Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Sun, 25 Jun 2023 23:39:11 +0800 Subject: [PATCH 06/15] change get_data url (#1558) * change_url * fix_CI * fix_CI_2 * fix_CI_3 * fix_CI_4 * fix_CI_5 * fix_CI_6 * fix_CI_7 * fix_CI_8 * fix_CI_9 * fix_CI_10 * fix_CI_11 * fix_CI_12 * fix_CI_13 * fix_CI_13 * fix_CI_14 * fix_CI_15 * fix_CI_16 * fix_CI_17 * fix_CI_18 * fix_CI_19 * fix_CI_20 * fix_CI_21 * fix_CI_22 * fix_CI_23 * fix_CI_24 * fix_CI_25 * fix_CI_26 * fix_CI_27 * fix_get_data_error * fix_get_data_error2 * modify_get_data * modify_get_data2 * modify_get_data3 * modify_get_data4 * fix_CI_28 * fix_CI_29 * fix_CI_30 --------- Co-authored-by: Linlang --- 
.github/workflows/test_qlib_from_source.yml | 21 +++-- .../workflows/test_qlib_from_source_slow.yml | 18 +++- docs/component/data.rst | 2 +- qlib/tests/data.py | 90 ++++++++++--------- setup.py | 1 + tests/test_dump_data.py | 2 +- tests/test_get_data.py | 2 +- 7 files changed, 82 insertions(+), 54 deletions(-) diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 68dfe5b3f..0bd3517d5 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Update pip to the latest version # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. 
- # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 - name: Installing pytorch for macos if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} @@ -129,8 +139,7 @@ jobs: - name: Test data downloads run: | python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive - mv /tmp/qlibpublic/data tests/.data + python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl - name: Install Lightgbm for MacOS if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index f8e43fa17..1dfcc0179 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source slow - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. 
+ # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Set up Python tools # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 pip install --upgrade cython numpy pip install -e .[dev] diff --git a/docs/component/data.rst b/docs/component/data.rst index 60e8d4fa1..5a2d458f6 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -119,7 +119,7 @@ Here are some example: for daily data: .. code-block:: bash - python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data + python scripts/get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data for 1min data: .. code-block:: bash diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 2163b4bf7..8de32f3f6 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
+import os import re import sys import qlib @@ -11,13 +12,15 @@ import datetime from tqdm import tqdm from pathlib import Path from loguru import logger +from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - DATASET_VERSION = "v2" REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip" + # "?" is not included in the token. + TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" + KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" def __init__(self, delete_zip_file=False): """ @@ -29,24 +32,44 @@ class GetData: """ self.delete_zip_file = delete_zip_file - def normalize_dataset_version(self, dataset_version: str = None): - if dataset_version is None: - dataset_version = self.DATASET_VERSION - return dataset_version + def merge_remote_url(self, file_name: str): + fernet = Fernet(self.KEY) + token = fernet.decrypt(self.TOKEN).decode() + return f"{self.REMOTE_URL}/{file_name}?{token}" - def merge_remote_url(self, file_name: str, dataset_version: str = None): - return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}" + def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): + """ + Download the specified file to the target folder. - def _download_data( - self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None - ): + Parameters + ---------- + target_dir: str + data save directory + file_name: str + dataset name, needs to endwith .zip, value from [rl_data.zip, csv_data_cn.zip, ...] 
+ may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip + delete_old: bool + delete an existing directory, by default True + + Examples + --------- + # get rl data + python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token} + + # get cn csv data + python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token} + ------- + + """ target_dir = Path(target_dir).expanduser() target_dir.mkdir(exist_ok=True, parents=True) # saved file name - _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name + _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name) target_path = target_dir.joinpath(_target_file_name) - url = self.merge_remote_url(file_name, dataset_version) + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) resp.raise_for_status() if resp.status_code != 200: @@ -56,7 +79,7 @@ class GetData: logger.warning( f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. 
(You can refer to the original data source: https://finance.yahoo.com/lookup.)" ) - logger.info(f"{file_name} downloading......") + logger.info(f"{os.path.basename(file_name)} downloading......") with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: with target_path.open("wb") as fp: for chunk in resp.iter_content(chunk_size=chunk_size): @@ -67,8 +90,8 @@ class GetData: if self.delete_zip_file: target_path.unlink() - def check_dataset(self, file_name: str, dataset_version: str = None): - url = self.merge_remote_url(file_name, dataset_version) + def check_dataset(self, file_name: str): + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) status = True if resp.status_code == 404: @@ -140,9 +163,11 @@ class GetData: --------- # get 1d data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token} # get 1min data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token} ------- """ @@ -155,29 +180,12 @@ class GetData: qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__)) - def _get_file_name(v): - return self.QLIB_DATA_NAME.format( - dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v - ) + def _get_file_name_with_version(qlib_version, dataset_version): + dataset_version = "v2" if dataset_version is None else dataset_version + file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip" + return file_name_with_version - file_name = _get_file_name(qlib_version) - if not 
self.check_dataset(file_name, version): - file_name = _get_file_name("latest") - self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version) - - def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"): - """download cn csv data from remote - - Parameters - ---------- - target_dir: str - data save directory - - Examples - --------- - python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data - ------- - - """ - file_name = "csv_data_cn.zip" - self._download_data(file_name, target_dir) + file_name = _get_file_name_with_version(qlib_version, dataset_version=version) + if not self.check_dataset(file_name): + file_name = _get_file_name_with_version("latest", dataset_version=version) + self.download_data(file_name.lower(), target_dir, delete_old) diff --git a/setup.py b/setup.py index 109fed213..9d7c185ab 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ REQUIRED = [ "gym", # Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail. 
"protobuf<=3.20.1;python_version<='3.8'", + "cryptography", ] # Numpy include diff --git a/tests/test_dump_data.py b/tests/test_dump_data.py index dfa7f8556..33cae4e80 100644 --- a/tests/test_dump_data.py +++ b/tests/test_dump_data.py @@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS) TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) provider_uri = str(QLIB_DIR.resolve()) diff --git a/tests/test_get_data.py b/tests/test_get_data.py index 93a852f55..94e685e1f 100644 --- a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -42,7 +42,7 @@ class TestGetData(unittest.TestCase): self.assertFalse(df.dropna().empty, "get qlib data failed") def test_1_csv_data(self): - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) self.assertEqual(len(stock_name), 85, "get csv data failed") From 0e61cac6a892a9b63520858d4a37fa4e2e8ac593 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Sun, 25 Jun 2023 23:48:37 +0800 Subject: [PATCH 07/15] Update release-drafter.yml (#1569) * Update release-drafter.yml * Update release-drafter.yml --- .github/release-drafter.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index ec8ea5d69..488419d52 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -14,6 +14,9 @@ categories: label: - 'doc' - 'documentation' + - title: '🧹 Maintenance' + label: + - 'maintenance' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code 
blocks. version-resolver: @@ -30,4 +33,4 @@ version-resolver: template: | ## Changes - $CHANGES \ No newline at end of file + $CHANGES From 27f476b31198f0f04ad1a61175c1c0fc9c6a486b Mon Sep 17 00:00:00 2001 From: you-n-g Date: Mon, 26 Jun 2023 00:00:46 +0800 Subject: [PATCH 08/15] Update __init__.py --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 11d22cc23..96daaad1a 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.1.99" +__version__ = "0.9.2" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From b1e7b19a3d57339e9e020ab82759f97bf2f5e27e Mon Sep 17 00:00:00 2001 From: you-n-g Date: Tue, 27 Jun 2023 11:55:40 +0800 Subject: [PATCH 09/15] Update __init__.py --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 96daaad1a..a963a8c28 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.2" +__version__ = "0.9.2.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 4db30b122533aa0f48cd3f9dbd1ef9b63a10c4d1 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 28 Jun 2023 10:53:58 +0800 Subject: [PATCH 10/15] Update README.md for RL (#1573) * Update README.md * Update README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index c09e1276e..539700a91 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
  • Adapting to Market Dynamics
  • +
  • Reinforcement Learning: modeling continuous decisions
  • @@ -392,6 +393,17 @@ Here is a list of solutions built on `Qlib`. - [Rolling Retraining](examples/benchmarks_dynamic/baseline/) - [DDG-DA on pytorch (Wendi, et al. AAAI 2022)](examples/benchmarks_dynamic/DDG-DA/) +## Reinforcement Learning: modeling continuous decisions +Qlib now supports reinforcement learning, a feature designed to model continuous investment decisions. This functionality assists investors in optimizing their trading strategies by learning from interactions with the environment to maximize some notion of cumulative reward. + +Here is a list of solutions built on `Qlib` categorized by scenarios. + +### [RL for order execution](examples/rl_order_execution) +[Here](https://qlib.readthedocs.io/en/latest/component/rl/overall.html#order-execution) is the introduction of this scenario. All the methods below are compared [here](examples/rl_order_execution). +- [TWAP](examples/rl_order_execution/exp_configs/backtest_twap.yml) +- [PPO: "An End-to-End Optimal Trade Execution Framework based on Proximal Policy Optimization", IJCAI 2020](examples/rl_order_execution/exp_configs/backtest_ppo.yml) +- [OPDS: "Universal Trading for Order Execution with Oracle Policy Distillation", AAAI 2021](examples/rl_order_execution/exp_configs/backtest_opds.yml) + # Quant Dataset Zoo Dataset plays a very important role in Quant. 
Here is a list of the datasets built on `Qlib`: From b7e5f63a07b7897408a0eda6c8860b9d59f0e921 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 5 Jul 2023 21:23:15 +0800 Subject: [PATCH 11/15] fix_pip_ci (#1584) * fix_pip_ci * fix_ci_get_data_error --------- Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index e6202e57e..346dd4960 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -19,10 +19,20 @@ jobs: steps: - name: Test qlib from pip - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -50,7 +60,9 @@ jobs: - name: Downloads dependencies data run: | - python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + cd .. 
+ python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn + cd qlib - name: Test workflow by config run: | From 3e074c84353da97305a0b8ea17904921891ca583 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Thu, 6 Jul 2023 12:38:52 +0800 Subject: [PATCH 12/15] fix download token (#1577) --- qlib/tests/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 8de32f3f6..f6bd78090 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -19,7 +19,7 @@ from qlib.utils import exists_qlib_data class GetData: REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" # "?" is not included in the token. - TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" + TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" def __init__(self, delete_zip_file=False): From b1dfc77ad78ace9374051c597e5d732cd3402305 Mon Sep 17 00:00:00 2001 From: Lewen Wang <49936435+lwwang1995@users.noreply.github.com> Date: Fri, 7 Jul 2023 15:40:03 +0800 Subject: [PATCH 13/15] Update qlibrl docs. (#1588) * Update qlibrl docs. 
* Update docs/component/rl/guidance.rst * Update docs/component/rl/guidance.rst * Update docs/component/rl/guidance.rst --------- Co-authored-by: Litzy Co-authored-by: you-n-g --- docs/component/rl/guidance.rst | 32 ++++++++++++++++++++ docs/component/rl/overall.rst | 54 +++++++++++++++++++++++----------- docs/component/rl/toctree.rst | 1 + 3 files changed, 70 insertions(+), 17 deletions(-) create mode 100644 docs/component/rl/guidance.rst diff --git a/docs/component/rl/guidance.rst b/docs/component/rl/guidance.rst new file mode 100644 index 000000000..7f917d559 --- /dev/null +++ b/docs/component/rl/guidance.rst @@ -0,0 +1,32 @@ + +======== +Guidance +======== +.. currentmodule:: qlib + +QlibRL can help users quickly get started and conveniently implement quantitative strategies based on reinforcement learning(RL) algorithms. For different user groups, we recommend the following guidance to use QlibRL. + +Beginners to Reinforcement Learning Algorithms +============================================== +Whether you are a quantitative researcher who wants to understand what RL can do in trading or a learner who wants to get started with RL algorithms in trading scenarios, if you have limited knowledge of RL and want to shield various detailed settings to quickly get started with RL algorithms, we recommend the following sequence to learn qlibrl: + - Learn the fundamentals of RL in `part1 `_. + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Run the examples in `part3 `_ to solve trading problems using RL. + - If you want to further explore QlibRL and make some customizations, you need to first understand the framework of QlibRL in `part4 `_ and rewrite specific components according to your needs. 
+ +Reinforcement Learning Algorithm Researcher +============================================== +If you are already familiar with existing RL algorithms and dedicated to researching RL algorithms but lack domain knowledge in the financial field, and you want to validate the effectiveness of your algorithms in financial trading scenarios, we recommend the following steps to get started with QlibRL: + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Choose an RL application scenario (currently, QlibRL has implemented two scenario examples: order execution and algorithmic trading). Run the example in `part3 `_ to get it working. + - Modify the `policy `_ part to incorporate your own RL algorithm. + +Quantitative Researcher +======================= +If you have a certain level of financial domain knowledge and coding skills, and you want to explore the application of RL algorithms in the investment field, we recommend the following steps to explore QlibRL: + - Learn the fundamentals of RL in `part1 `_. + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Run the examples in `part3 `_ to solve trading problems using RL. + - Understand the framework of QlibRL in `part4 `_. + - Choose a suitable RL algorithm based on the characteristics of the problem you want to solve (currently, QlibRL supports PPO and DQN algorithms based on tianshou). + - Design the MDP (Markov Decision Process) process based on market trading rules and the problem you want to solve. Refer to the example in order execution and make corresponding modifications to the following modules: `State `_, `Metrics `_, `ActionInterpreter `_, `StateInterpreter `_, `Reward `_, `Observation `_, `Simulator `_. 
\ No newline at end of file diff --git a/docs/component/rl/overall.rst b/docs/component/rl/overall.rst index 4f59dd17a..f586a07e2 100644 --- a/docs/component/rl/overall.rst +++ b/docs/component/rl/overall.rst @@ -4,7 +4,7 @@ Reinforcement Learning in Quantitative Trading Reinforcement Learning ====================== -Different from supervised learning tasks such as classification tasks and regression tasks. Another important paradigm in machine learning is Reinforcement Learning, +Different from supervised learning tasks such as classification tasks and regression tasks. Another important paradigm in machine learning is Reinforcement Learning(RL), which attempts to optimize an accumulative numerical reward signal by directly interacting with the environment under a few assumptions such as Markov Decision Process(MDP). As demonstrated in the following figure, an RL system consists of four elements, 1)the agent 2) the environment the agent interacts with 3) the policy that the agent follows to take actions on the environment and 4)the reward signal from the environment to the agent. @@ -25,26 +25,46 @@ The Qlib Reinforcement Learning toolkit (QlibRL) is an RL platform for quantitat Potential Application Scenarios in Quantitative Trading ======================================================= -RL methods have already achieved outstanding achievement in many applications, such as game playing, resource allocating, recommendation, marketing and advertising, etc. -Investment is always a continuous process, taking the stock market as an example, investors need to control their positions and stock holdings by one or more buying and selling behaviors, to maximize the investment returns. -Besides, each buy and sell decision is made by investors after fully considering the overall market information and stock information. 
-From the view of an investor, the process could be described as a continuous decision-making process generated according to interaction with the market, such problems could be solved by the RL algorithms. -Following are some scenarios where RL can potentially be used in quantitative investment. - -Portfolio Construction ----------------------- -Portfolio construction is a process of selecting securities optimally by taking a minimum risk to achieve maximum returns. With an RL-based solution, an agent allocates stocks at every time step by obtaining information for each stock and the market. The key is to develop of policy for building a portfolio and make the policy able to pick the optimal portfolio. +RL methods have demonstrated remarkable achievements in various applications, including game playing, resource allocation, recommendation systems, marketing, and advertising. +In the context of investment, which involves continuous decision-making, let's consider the example of the stock market. Investors strive to optimize their investment returns by effectively managing their positions and stock holdings through various buying and selling behaviors. +Furthermore, investors carefully evaluate market conditions and stock-specific information before making each buying or selling decision. From an investor's perspective, this process can be viewed as a continuous decision-making process driven by interactions with the market. RL algorithms offer a promising approach to tackle such challenges. +Here are several scenarios where RL holds potential for application in quantitative investment. Order Execution --------------- -As a fundamental problem in algorithmic trading, order execution aims at fulfilling a specific trading order, either liquidation or acquirement, for a given instrument. 
Essentially, the goal of order execution is twofold: it not only requires to fulfill the whole order but also targets a more economical execution with maximizing profit gain (or minimizing capital loss). The order execution with only one order of liquidation or acquirement is called single-asset order execution. +The order execution task is to execute orders efficiently while considering multiple factors, including optimal prices, minimizing trading costs, reducing market impact, maximizing order fulfill rates, and achieving execution within a specified time frame. RL can be applied to such tasks by incorporating these objectives into the reward function and action selection process. Specifically, the RL agent interacts with the market environment, observes the state from market information, and makes decisions on next step execution. The RL algorithm learns an optimal execution strategy through trial and error, aiming to maximize the expected cumulative reward, which incorporates the desired objectives. -Considering stock investment always aim to pursue long-term maximized profits, it usually manifests as a sequential process of continuously adjusting the asset portfolios, execution for multiple orders, including order of liquidation and acquirement, brings more constraints and makes the sequence of execution for different orders should be considered, e.g. before executing an order to buy some stocks, we have to sell at least one stock. The order execution with multiple assets is called multi-asset order execution. + - General Setting + - Environment: The environment represents the financial market where order execution takes place. It encompasses variables such as the order book dynamics, liquidity, price movements, and market conditions. -According to the order execution’s trait of sequential decision-making, an RL-based solution could be applied to solve the order execution. 
With an RL-based solution, an agent optimizes execution strategy by interacting with the market environment. + - State: The state refers to the information available to the RL agent at a given time step. It typically includes features such as the current order book state (bid-ask spread, order depth), historical price data, historical trading volume, market volatility, and any other relevant information that can aid in decision-making. -With QlibRL, the RL algorithm in the above scenarios can be easily implemented. + - Action: The action is the decision made by the RL agent based on the observed state. In order execution, actions can include selecting the order size, price, and timing of execution. -Nested Portfolio Construction and Order Executor ------------------------------------------------- -QlibRL makes it possible to jointly optimize different levels of strategies/models/agents. Take `Nested Decision Execution Framework `_ as an example, the optimization of order execution strategy and portfolio management strategies can interact with each other to maximize returns. + - Reward: The reward is a scalar signal that indicates the performance of the RL agent's action in the environment. The reward function is designed to encourage actions that lead to efficient and cost-effective order execution. It typically considers multiple objectives, such as maximizing price advantages, minimizing trading costs (including transaction fees and slippage), reducing market impact (the effect of the order on the market price) and maximizing order fulfill rates. + + - Scenarios + - Single-asset order execution: Single-asset order execution focuses on the task of executing a single order for a specific asset, such as a stock or a cryptocurrency. The primary objective is to execute the order efficiently while considering factors such as maximizing price advantages, minimizing trading costs, reducing market impact, and achieving a high fulfill rate. 
The RL agent interacts with the market environment and makes decisions on order size, price, and timing of execution for that particular asset. The goal is to learn an optimal execution strategy for the single asset, maximizing the expected cumulative reward while considering the specific dynamics and characteristics of that asset. + + - Multi-asset order execution: Multi-asset order execution expands the order execution task to involve multiple assets or securities. It typically involves executing a portfolio of orders across different assets simultaneously or sequentially. Unlike single-asset order execution, the focus is not only on the execution of individual orders but also on managing the interactions and dependencies between different assets within the portfolio. The RL agent needs to make decisions on the order sizes, prices, and timings for each asset in the portfolio, considering their interdependencies, cash constraints, market conditions, and transaction costs. The goal is to learn an optimal execution strategy that balances the execution efficiency for each asset while considering the overall performance and objectives of the portfolio as a whole. + +The choice of settings and RL algorithm depends on the specific requirements of the task, available data, and desired performance objectives. + +Portfolio Construction +---------------------- +Portfolio construction is a process of selecting and allocating assets in an investment portfolio. RL provides a framework to optimize portfolio management decisions by learning from interactions with the market environment and maximizing long-term returns while considering risk management. + - General Setting + - State: The state represents the current information about the market and the portfolio. It typically includes historical prices and volumes, technical indicators, and other relevant data. + + - Action: The action corresponds to the decision of allocating capital to different assets in the portfolio. 
It determines the weights or proportions of investments in each asset. + + - Reward: The reward is a metric that evaluates the performance of the portfolio. It can be defined in various ways, such as total return, risk-adjusted return, or other objectives like maximizing Sharpe ratio or minimizing drawdown. + + - Scenarios + - Stock market: RL can be used to construct portfolios of stocks, where the agent learns to allocate capital among different stocks. + + - Cryptocurrency market: RL can be applied to construct portfolios of cryptocurrencies, where the agent learns to make allocation decisions. + + - Foreign exchange (Forex) market: RL can be used to construct portfolios of currency pairs, where the agent learns to allocate capital across different currencies based on exchange rate data, economic indicators, and other factors. + +Similarly, the choice of basic setting and algorithm depends on the specific requirements of the problem and the characteristics of the market. \ No newline at end of file diff --git a/docs/component/rl/toctree.rst b/docs/component/rl/toctree.rst index d79d5e060..4b88de06e 100644 --- a/docs/component/rl/toctree.rst +++ b/docs/component/rl/toctree.rst @@ -5,6 +5,7 @@ Reinforcement Learning in Quantitative Trading ======================================================================== .. toctree:: + Guidance Overall Quick Start Framework From 8d3adf34ac406677f2ceec885acaf76b369c42c0 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 12 Jul 2023 09:59:09 +0800 Subject: [PATCH 14/15] Postpone PR stale. 
(#1591) --- .github/workflows/stale.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index b07bdf1e7..6ce457dfd 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,8 @@ jobs: stale-issue-label: 'stale' stale-pr-label: 'stale' days-before-stale: 90 + days-before-pr-stale: 365 days-before-close: 5 operations-per-run: 100 exempt-issue-labels: 'bug,enhancement' - remove-stale-when-updated: true \ No newline at end of file + remove-stale-when-updated: true From be4646b4b7ea062095fb702eb3de654107d5bbd5 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Fri, 14 Jul 2023 12:16:12 +0800 Subject: [PATCH 15/15] Adjust rolling api (#1594) * Intermediate version * Fix yaml template & Successfully run rolling * Be compatible with benchmark * Get same results with previous linear model * Black formatting * Update black * Update the placeholder mechanism * Update CI * Update CI * Upgrade Black * Fix CI and simplify code * Fix CI * Move the data processing caching mechanism into utils. 
* Adjusting DDG-DA * Organize import --- .github/workflows/python-publish.yml | 2 +- .github/workflows/test_qlib_from_pip.yml | 1 + .github/workflows/test_qlib_from_source.yml | 3 + .pre-commit-config.yaml | 4 +- docs/component/workflow.rst | 8 +- .../workflow_config_adarnn_Alpha360.yaml | 3 +- .../ADD/workflow_config_add_Alpha360.yaml | 4 +- .../ALSTM/workflow_config_alstm_Alpha158.yaml | 4 +- .../ALSTM/workflow_config_alstm_Alpha360.yaml | 4 +- .../workflow_config_catboost_Alpha158.yaml | 4 +- ...kflow_config_catboost_Alpha158_csi500.yaml | 4 +- .../workflow_config_catboost_Alpha360.yaml | 4 +- ...kflow_config_catboost_Alpha360_csi500.yaml | 4 +- ...rkflow_config_doubleensemble_Alpha158.yaml | 4 +- ...config_doubleensemble_Alpha158_csi500.yaml | 4 +- ...rkflow_config_doubleensemble_Alpha360.yaml | 4 +- ...config_doubleensemble_Alpha360_csi500.yaml | 4 +- ...ig_doubleensemble_early_stop_Alpha158.yaml | 4 +- .../GATs/workflow_config_gats_Alpha158.yaml | 4 +- .../GATs/workflow_config_gats_Alpha360.yaml | 4 +- .../GRU/workflow_config_gru_Alpha158.yaml | 4 +- .../GRU/workflow_config_gru_Alpha360.yaml | 4 +- .../HIST/workflow_config_hist_Alpha360.yaml | 6 +- .../IGMTF/workflow_config_igmtf_Alpha360.yaml | 3 +- .../KRNN/workflow_config_krnn_Alpha360.yaml | 4 +- .../LSTM/workflow_config_lstm_Alpha158.yaml | 4 +- .../LSTM/workflow_config_lstm_Alpha360.yaml | 4 +- .../benchmarks/LightGBM/multi_freq_handler.py | 1 - .../workflow_config_lightgbm_Alpha158.yaml | 3 +- ...kflow_config_lightgbm_Alpha158_csi500.yaml | 3 +- ...w_config_lightgbm_Alpha158_multi_freq.yaml | 4 +- .../workflow_config_lightgbm_Alpha360.yaml | 4 +- ...kflow_config_lightgbm_Alpha360_csi500.yaml | 4 +- ..._config_lightgbm_configurable_dataset.yaml | 4 +- .../workflow_config_lightgbm_multi_freq.yaml | 4 +- .../workflow_config_linear_Alpha158.yaml | 4 +- ...orkflow_config_linear_Alpha158_csi500.yaml | 4 +- .../workflow_config_localformer_Alpha158.yaml | 4 +- .../workflow_config_localformer_Alpha360.yaml | 4 
+- .../MLP/workflow_config_mlp_Alpha158.yaml | 4 +- .../workflow_config_mlp_Alpha158_csi500.yaml | 4 +- .../MLP/workflow_config_mlp_Alpha360.yaml | 4 +- .../workflow_config_mlp_Alpha360_csi500.yaml | 4 +- .../SFM/workflow_config_sfm_Alpha360.yaml | 4 +- .../workflow_config_sandwich_Alpha360.yaml | 4 +- .../TCN/workflow_config_tcn_Alpha158.yaml | 3 +- .../TCN/workflow_config_tcn_Alpha360.yaml | 3 +- .../TCTS/workflow_config_tcts_Alpha360.yaml | 6 +- .../benchmarks/TFT/data_formatters/base.py | 1 - .../benchmarks/TFT/expt_settings/configs.py | 1 - .../benchmarks/TFT/libs/hyperparam_opt.py | 2 - examples/benchmarks/TFT/libs/tft_model.py | 3 - .../TFT/workflow_config_tft_Alpha158.yaml | 4 +- examples/benchmarks/TRA/example.py | 2 - examples/benchmarks/TRA/src/dataset.py | 2 - examples/benchmarks/TRA/src/model.py | 7 - .../TRA/workflow_config_tra_Alpha158.yaml | 4 +- .../workflow_config_tra_Alpha158_full.yaml | 4 +- .../TRA/workflow_config_tra_Alpha360.yaml | 4 +- .../workflow_config_TabNet_Alpha158.yaml | 4 +- .../workflow_config_TabNet_Alpha360.yaml | 4 +- .../workflow_config_transformer_Alpha158.yaml | 4 +- .../workflow_config_transformer_Alpha360.yaml | 4 +- .../workflow_config_xgboost_Alpha158.yaml | 4 +- .../workflow_config_xgboost_Alpha360.yaml | 4 +- examples/benchmarks_dynamic/DDG-DA/README.md | 4 +- .../benchmarks_dynamic/DDG-DA/workflow.py | 307 ++-------------- .../benchmarks_dynamic/baseline/README.md | 7 +- .../baseline/rolling_benchmark.py | 164 +-------- .../workflow_config_lightgbm_Alpha158.yaml | 3 +- .../workflow_config_linear_Alpha158.yaml | 4 +- examples/highfreq/highfreq_handler.py | 1 - examples/highfreq/workflow.py | 1 - .../LightGBM/hyperparameter_158.py | 1 - .../LightGBM/hyperparameter_360.py | 1 - examples/model_interpreter/feature.py | 1 - examples/portfolio/prepare_riskdata.py | 3 - examples/rolling_process_data/workflow.py | 2 - examples/workflow_by_code.py | 1 - qlib/__init__.py | 1 - qlib/backtest/__init__.py | 1 - 
qlib/backtest/exchange.py | 1 - qlib/config.py | 1 - qlib/contrib/data/dataset.py | 5 - qlib/contrib/data/highfreq_handler.py | 2 - qlib/contrib/data/highfreq_processor.py | 1 - qlib/contrib/meta/data_selection/dataset.py | 2 + qlib/contrib/model/pytorch_adarnn.py | 2 - qlib/contrib/model/pytorch_alstm.py | 7 - qlib/contrib/model/pytorch_alstm_ts.py | 9 +- qlib/contrib/model/pytorch_gats.py | 4 - qlib/contrib/model/pytorch_gats_ts.py | 8 - qlib/contrib/model/pytorch_gru.py | 7 - qlib/contrib/model/pytorch_gru_ts.py | 9 +- qlib/contrib/model/pytorch_hist.py | 3 - qlib/contrib/model/pytorch_igmtf.py | 4 - qlib/contrib/model/pytorch_localformer.py | 8 - qlib/contrib/model/pytorch_localformer_ts.py | 6 - qlib/contrib/model/pytorch_lstm.py | 6 - qlib/contrib/model/pytorch_lstm_ts.py | 9 +- qlib/contrib/model/pytorch_sfm.py | 6 - qlib/contrib/model/pytorch_tabnet.py | 4 - qlib/contrib/model/pytorch_tcn.py | 6 - qlib/contrib/model/pytorch_tcn_ts.py | 4 - qlib/contrib/model/pytorch_tcts.py | 7 - qlib/contrib/model/pytorch_tra.py | 10 - qlib/contrib/model/pytorch_transformer.py | 8 - qlib/contrib/model/pytorch_transformer_ts.py | 6 - qlib/contrib/model/xgboost.py | 1 - qlib/contrib/report/data/ana.py | 1 - qlib/contrib/report/data/base.py | 1 - qlib/contrib/report/graph.py | 1 - qlib/contrib/rolling/__init__.py | 7 + qlib/contrib/rolling/__main__.py | 16 + qlib/contrib/rolling/base.py | 246 +++++++++++++ qlib/contrib/rolling/ddgda.py | 343 ++++++++++++++++++ qlib/contrib/strategy/optimizer/optimizer.py | 1 - qlib/contrib/strategy/rule_strategy.py | 1 - qlib/contrib/strategy/signal_strategy.py | 2 - qlib/contrib/tuner/config.py | 2 - qlib/contrib/tuner/pipeline.py | 4 - qlib/contrib/tuner/tuner.py | 7 - qlib/data/cache.py | 5 - qlib/data/data.py | 1 - qlib/data/dataset/processor.py | 1 - qlib/data/dataset/utils.py | 8 +- qlib/data/pit.py | 1 - qlib/data/storage/file_storage.py | 3 - qlib/log.py | 1 - qlib/model/riskmodel/poet.py | 1 - qlib/tests/__init__.py | 2 - 
qlib/utils/__init__.py | 230 ++---------- qlib/utils/index_data.py | 1 - qlib/utils/mod.py | 235 ++++++++++++ qlib/workflow/record_temp.py | 1 - qlib/workflow/task/gen.py | 1 - qlib/workflow/task/utils.py | 34 +- scripts/check_dump_bin.py | 1 - scripts/data_collector/base.py | 2 - scripts/data_collector/br_index/collector.py | 1 - scripts/data_collector/us_index/collector.py | 1 - scripts/dump_pit.py | 1 - tests/backtest/test_high_freq_trading.py | 1 - .../test_handler_storage.py | 4 - tests/misc/test_sepdf.py | 1 - tests/rolling_tests/test_update_pred.py | 1 - tests/storage_tests/test_storage.py | 1 - tests/test_get_data.py | 1 - 148 files changed, 1035 insertions(+), 1028 deletions(-) create mode 100644 qlib/contrib/rolling/__init__.py create mode 100644 qlib/contrib/rolling/__main__.py create mode 100644 qlib/contrib/rolling/base.py create mode 100644 qlib/contrib/rolling/ddgda.py create mode 100644 qlib/utils/mod.py diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index db14fbf3b..e95a9e88c 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -38,7 +38,7 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload dist/* - + deploy_with_manylinux: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 346dd4960..f5db06ccb 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -8,6 +8,7 @@ on: jobs: build: + if: ${{ false }} # FIXME: temporarily disable... 
Due to we are rushing a feature timeout-minutes: 120 runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 0bd3517d5..7271287dc 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -64,7 +64,10 @@ jobs: python -m pip install -e .[dev] - name: Lint with Black + # Python 3.7 will use a black with low level. So we use python with higher version for black check + if: (matrix.python-version != '3.7') run: | + pip install -U black # follow the latest version of black, previous Qlib dependency will downgrade black black . -l 120 --check --diff - name: Make html with sphinx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea57aeb0e..15f00414c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.7.0 hooks: - id: black args: ["qlib", "-l 120"] @@ -9,4 +9,4 @@ repos: rev: 4.0.1 hooks: - id: flake8 - args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] \ No newline at end of file + args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] diff --git a/docs/component/workflow.rst b/docs/component/workflow.rst index 9b84ae4ca..19ba980a1 100644 --- a/docs/component/workflow.rst +++ b/docs/component/workflow.rst @@ -53,9 +53,7 @@ Below is a typical config file of ``qrun``. 
kwargs: topk: 50 n_drop: 5 - signal: - - - - + signal: backtest: limit_threshold: 0.095 account: 100000000 @@ -281,9 +279,7 @@ The following script is the configuration of `backtest` and the `strategy` used kwargs: topk: 50 n_drop: 5 - signal: - - - - + signal: backtest: limit_threshold: 0.095 account: 100000000 diff --git a/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml index ac49d0145..ae2bad5cc 100644 --- a/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml +++ b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml b/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml index 033d4d22e..b2168a1b8 100644 --- a/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml +++ b/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml index a8e89e360..568505ee3 100755 --- a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml +++ b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml index 3aa8147fc..b345cacd9 100644 --- 
a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml +++ b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml index 2eb642741..635611ffa 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml index bb7c42fd0..c40f0f81a 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml index 982963eea..136ab7e6f 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git 
a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml index da4962b54..448140702 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index 85cc0a270..58a01d63a 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml index b2358c6bf..ea92fbc7c 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index 74db1f362..edb5e960f 100644 --- 
a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml index f10355f22..ec8afefb4 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml index b3c38870e..3960aca15 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml b/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml index e056bc845..0710f3181 100644 --- a/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml @@ -35,9 +35,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy 
kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml b/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml index 2effecd61..095e0bade 100644 --- a/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml b/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml index 7c525c12a..a2f03a230 100755 --- a/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml +++ b/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml b/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml index 2daaa0136..f5d837a06 100644 --- a/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml +++ b/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml b/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml index b3e96f485..cd50b3387 100644 --- a/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml +++ b/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 
backtest: @@ -89,4 +87,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config diff --git a/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml b/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml index 1fc908ea9..838e66064 100644 --- a/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml +++ b/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml index 691607ad1..b5a3e3bc0 100644 --- a/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml +++ b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml index bf3738bc0..522f6443c 100755 --- a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml +++ b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml index d550cacb2..e4f9b2fe9 100644 --- a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml +++ b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml @@ -28,9 +28,7 @@ 
port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index b3e138192..1d4ba2b82 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -48,7 +48,6 @@ class Avg15minHandler(DataHandlerLP): ) def loader_config(self): - # Results for dataset: df: pd.DataFrame # len(df.columns) == 6 + 6 * 16, len(df.index.get_level_values(level="datetime").unique()) == T # df.columns: close0, close1, ..., close16, open0, ..., open16, ..., vwap16 diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml index 2d441dea9..5ae316801 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml index 327e7fffa..aa017bc9b 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index 
6b58ea4bd..0e63b23f8 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -33,9 +33,7 @@ port_analysis_config: &port_analysis_config kwargs: topk: 50 n_drop: 5 - signal: - - - - + signal: backtest: verbose: False limit_threshold: 0.095 diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml index 053c5bd29..e43a390a2 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml index 767050919..aa3ac8b5e 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml index f1ffc45da..7a784a5c8 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml @@ -29,9 +29,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 
backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml index 11b277ce6..af867a24e 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml @@ -31,9 +31,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml index 290a8bc42..e65dae250 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml index 53e12b999..bff2e6a74 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml index 7f5a78e74..e3200f129 100644 --- a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml +++ b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config 
class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml index 9de80a350..39c0093ac 100644 --- a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml +++ b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml index b2012ba8c..6c85546ca 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml @@ -41,9 +41,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml index 8628898d3..745c9b017 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml @@ -41,9 +41,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml index 359e79202..b9cccd52e 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml @@ -29,9 +29,7 @@ port_analysis_config: 
&port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml index 3862295f6..215633463 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml @@ -29,9 +29,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml b/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml index d750a9980..d992af342 100644 --- a/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml +++ b/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml index 717a03471..29e67d67e 100644 --- a/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml +++ b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml b/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml index c6f663f94..dcb7508a4 100755 --- a/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml +++ b/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml @@ -36,8 +36,7 @@ port_analysis_config: 
&port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml b/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml index e383662fc..4756a93b2 100644 --- a/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml +++ b/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml index 460a470bb..7adf97582 100644 --- a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml +++ b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml @@ -30,9 +30,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: @@ -95,4 +93,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config diff --git a/examples/benchmarks/TFT/data_formatters/base.py b/examples/benchmarks/TFT/data_formatters/base.py index 9df0448ba..9cdce6382 100644 --- a/examples/benchmarks/TFT/data_formatters/base.py +++ b/examples/benchmarks/TFT/data_formatters/base.py @@ -139,7 +139,6 @@ class GenericDataFormatter(abc.ABC): # Sanity checks first. 
# Ensure only one ID and time column exist def _check_single_column(input_type): - length = len([tup for tup in column_definition if tup[2] == input_type]) if length != 1: diff --git a/examples/benchmarks/TFT/expt_settings/configs.py b/examples/benchmarks/TFT/expt_settings/configs.py index 62aa68c38..55eb32a0b 100644 --- a/examples/benchmarks/TFT/expt_settings/configs.py +++ b/examples/benchmarks/TFT/expt_settings/configs.py @@ -78,7 +78,6 @@ class ExperimentConfig: @property def hyperparam_iterations(self): - return 240 if self.experiment == "volatility" else 60 def make_data_formatter(self): diff --git a/examples/benchmarks/TFT/libs/hyperparam_opt.py b/examples/benchmarks/TFT/libs/hyperparam_opt.py index e18f5b716..86f587d7d 100644 --- a/examples/benchmarks/TFT/libs/hyperparam_opt.py +++ b/examples/benchmarks/TFT/libs/hyperparam_opt.py @@ -88,7 +88,6 @@ class HyperparamOptManager: params_file = os.path.join(self.hyperparam_folder, "params.csv") if os.path.exists(results_file) and os.path.exists(params_file): - self.results = pd.read_csv(results_file, index_col=0) self.saved_params = pd.read_csv(params_file, index_col=0) @@ -178,7 +177,6 @@ class HyperparamOptManager: return parameters for _ in range(self._max_tries): - parameters = _get_next() name = self._get_name(parameters) diff --git a/examples/benchmarks/TFT/libs/tft_model.py b/examples/benchmarks/TFT/libs/tft_model.py index aa055e294..2a1a2fa15 100644 --- a/examples/benchmarks/TFT/libs/tft_model.py +++ b/examples/benchmarks/TFT/libs/tft_model.py @@ -475,7 +475,6 @@ class TemporalFusionTransformer: embeddings = [] for i in range(num_categorical_variables): - embedding = tf.keras.Sequential( [ tf.keras.layers.InputLayer([time_steps]), @@ -680,7 +679,6 @@ class TemporalFusionTransformer: data_map = {} for _, sliced in data.groupby(id_col): - col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols} for k in col_mappings: @@ -954,7 +952,6 @@ class 
TemporalFusionTransformer: """ with tf.variable_scope(self.name): - transformer_layer, all_inputs, attention_components = self._build_base_graph() outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.output_size * len(self.quantiles)))( diff --git a/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml b/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml index d83878e3e..e925fb772 100644 --- a/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml +++ b/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml @@ -16,9 +16,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/example.py b/examples/benchmarks/TRA/example.py index defacf412..0d52c8775 100644 --- a/examples/benchmarks/TRA/example.py +++ b/examples/benchmarks/TRA/example.py @@ -6,7 +6,6 @@ from qlib.utils import init_instance_by_config def main(seed, config_file="configs/config_alstm.yaml"): - # set random seed with open(config_file) as f: config = yaml.safe_load(f) @@ -30,7 +29,6 @@ def main(seed, config_file="configs/config_alstm.yaml"): if __name__ == "__main__": - # set params from cmd parser = argparse.ArgumentParser(allow_abbrev=False) parser.add_argument("--seed", type=int, default=1000, help="random seed") diff --git a/examples/benchmarks/TRA/src/dataset.py b/examples/benchmarks/TRA/src/dataset.py index 6740b1cbd..de4b2ad41 100644 --- a/examples/benchmarks/TRA/src/dataset.py +++ b/examples/benchmarks/TRA/src/dataset.py @@ -96,7 +96,6 @@ class MTSDatasetH(DatasetH): drop_last=False, **kwargs, ): - assert horizon > 0, "please specify `horizon` to avoid data leakage" self.seq_len = seq_len @@ -111,7 +110,6 @@ class MTSDatasetH(DatasetH): super().__init__(handler, segments, **kwargs) def setup_data(self, handler_kwargs: dict = None, **kwargs): - super().setup_data() # change index to diff --git 
a/examples/benchmarks/TRA/src/model.py b/examples/benchmarks/TRA/src/model.py index cff94388e..affb115a1 100644 --- a/examples/benchmarks/TRA/src/model.py +++ b/examples/benchmarks/TRA/src/model.py @@ -45,7 +45,6 @@ class TRAModel(Model): avg_params=True, **kwargs, ): - np.random.seed(seed) torch.manual_seed(seed) @@ -93,7 +92,6 @@ class TRAModel(Model): self.global_step = -1 def train_epoch(self, data_set): - self.model.train() self.tra.train() @@ -146,7 +144,6 @@ class TRAModel(Model): return total_loss def test_epoch(self, data_set, return_pred=False): - self.model.eval() self.tra.eval() data_set.eval() @@ -204,7 +201,6 @@ class TRAModel(Model): return metrics, preds def fit(self, dataset, evals_result=dict()): - train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"]) best_score = -1 @@ -380,7 +376,6 @@ class LSTM(nn.Module): self.output_size = hidden_size def forward(self, x): - x = self.input_drop(x) if self.training and self.noise_level > 0: @@ -464,7 +459,6 @@ class Transformer(nn.Module): self.output_size = hidden_size def forward(self, x): - x = self.input_drop(x) if self.training and self.noise_level > 0: @@ -514,7 +508,6 @@ class TRA(nn.Module): self.predictors = nn.Linear(input_size, num_states) def forward(self, hidden, hist_loss): - preds = self.predictors(hidden) if self.num_states == 1: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml index c86f87fc6..02c4ecac3 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml @@ -57,9 +57,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml index 75f18f3ee..9ccf56e86 
100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml @@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml index 9ab5b904b..29686d7da 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml @@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml index d9b94e86c..7549688b9 100644 --- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml +++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml index 830943d6b..7155d25b1 100644 --- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml +++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml index 
e36d44c43..ce5105108 100644 --- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml +++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml index cab46a4d4..35342de94 100644 --- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml +++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml index 5ee38cf70..0c7f55d02 100644 --- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml index 7c98bd40c..8e7b54372 100644 --- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git 
a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 4d49315bd..ac4349d91 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -16,12 +16,12 @@ Though the dataset is different, the conclusion remains the same. By applying `D # Run the Code Users can try `DDG-DA` by running the following command: ```bash - python workflow.py run_all + python workflow.py run ``` The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `forecast_model` parameter when `DDG-DA` initializes. For example, users can try `LightGBM` forecasting models by running the following command: ```bash - python workflow.py --forecast_model="gbdt" run_all + python workflow.py --conf_path=../workflow_config_lightgbm_Alpha158.yaml run ``` # Results diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index fef86726d..7593fe374 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -1,305 +1,40 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
from pathlib import Path -from qlib.model.meta.task import MetaTask -from qlib.contrib.meta.data_selection.model import MetaModelDS -from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS -from qlib.data.dataset.handler import DataHandlerLP +from typing import Union -import pandas as pd import fire -import sys -import pickle -from typing import Optional + from qlib import auto_init -from qlib.model.trainer import TrainerR -from qlib.typehint import Literal -from qlib.utils import init_instance_by_config -from qlib.workflow import R +from qlib.contrib.rolling.ddgda import DDGDA from qlib.tests.data import GetData DIRNAME = Path(__file__).absolute().resolve().parent -sys.path.append(str(DIRNAME.parent / "baseline")) -from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark +BENCH_DIR = DIRNAME.parent / "baseline" -class DDGDA: - """ - please run `python workflow.py run_all` to run the full workflow of the experiment +class DDGDABench(DDGDA): + # The config in the README.md + CONF_LIST = [ + BENCH_DIR / "workflow_config_linear_Alpha158.yaml", + BENCH_DIR / "workflow_config_lightgbm_Alpha158.yaml", + ] - **NOTE** - before running the example, please clean your previous results with following command - - `rm -r mlruns` - """ + DEFAULT_CONF = CONF_LIST[0] # Linear by default due to efficiency - def __init__( - self, - sim_task_model: Literal["linear", "gbdt"] = "gbdt", - forecast_model: Literal["linear", "gbdt"] = "linear", - h_path: Optional[str] = None, - test_end: Optional[str] = None, - train_start: Optional[str] = None, - meta_1st_train_end: Optional[str] = None, - task_ext_conf: Optional[dict] = None, - alpha: float = 0.01, - proxy_hd: str = "handler_proxy.pkl", - ): - """ + def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None: + # This code is for being compatible with the previous old code + conf_path = Path(conf_path) + 
super().__init__(conf_path=conf_path, horizon=horizon, working_dir=DIRNAME, **kwargs) - Parameters - ---------- - - train_start: Optional[str] - the start datetime for data. It is used in training start time (for both tasks & meta learing) - test_end: Optional[str] - the end datetime for data. It is used in test end time - meta_1st_train_end: Optional[str] - the datetime of training end of the first meta_task - alpha: float - Setting the L2 regularization for ridge - The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..) - """ - self.step = 20 - # NOTE: - # the horizon must match the meaning in the base task template - self.horizon = 20 - self.meta_exp_name = "DDG-DA" - self.sim_task_model = sim_task_model # The model to capture the distribution of data. - self.forecast_model = forecast_model # downstream forecasting models' type - self.rb_kwargs = { - "h_path": h_path, - "test_end": test_end, - "train_start": train_start, - "task_ext_conf": task_ext_conf, - } - self.alpha = alpha - self.meta_1st_train_end = meta_1st_train_end - self.proxy_hd = proxy_hd - - def get_feature_importance(self): - # this must be lightGBM, because it needs to get the feature importance - rb = RollingBenchmark(model_type="gbdt", **self.rb_kwargs) - task = rb.basic_task() - - with R.start(experiment_name="feature_importance"): - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - model.fit(dataset) - - fi = model.get_feature_importance() - - # Because the model use numpy instead of dataframe for training lightgbm - # So the we must use following extra steps to get the right feature importance - df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R) - cols = df.columns - fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()} - - return pd.Series(fi_named) - - def dump_data_for_proxy_model(self): - """ - Dump data for training meta model. 
- The meta model will be trained upon the proxy forecasting model. - This dataset is for the proxy forecasting model. - """ - topk = 30 - fi = self.get_feature_importance() - col_selected = fi.nlargest(topk) - - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - task = rb.basic_task() - dataset = init_instance_by_config(task["dataset"]) - prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - - feature_df = prep_ds["feature"] - label_df = prep_ds["label"] - - feature_selected = feature_df.loc[:, col_selected.index] - - feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( - lambda df: (df - df.mean()).div(df.std()) - ) - feature_selected = feature_selected.fillna(0.0) - - df_all = { - "label": label_df.reindex(feature_selected.index), - "feature": feature_selected, - } - df_all = pd.concat(df_all, axis=1) - df_all.to_pickle(DIRNAME / "fea_label_df.pkl") - - # dump data in handler format for aligning the interface - handler = DataHandlerLP( - data_loader={ - "class": "qlib.data.dataset.loader.StaticDataLoader", - "kwargs": {"config": DIRNAME / "fea_label_df.pkl"}, - } - ) - handler.to_pickle(DIRNAME / self.proxy_hd, dump_all=True) - - @property - def _internal_data_path(self): - return DIRNAME / f"internal_data_s{self.step}.pkl" - - def dump_meta_ipt(self): - """ - Dump data for training meta model. 
- This function will dump the input data for meta model - """ - # According to the experiments, the choice of the model type is very important for achieving good results - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - sim_task = rb.basic_task() - - if self.sim_task_model == "gbdt": - sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150}) - - exp_name_sim = f"data_sim_s{self.step}" - - internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim) - internal_data.setup(trainer=TrainerR) - - with self._internal_data_path.open("wb") as f: - pickle.dump(internal_data, f) - - def train_meta_model(self, fill_method="max"): - """ - training a meta model based on a simplified linear proxy model; - """ - - # 1) leverage the simplified proxy forecasting model to train meta model. - # - Only the dataset part is important, in current version of meta model will integrate the - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - sim_task = rb.basic_task() - # the train_start for training meta model does not necessarily align with final rolling - train_start = "2008-01-01" if self.rb_kwargs.get("train_start") is None else self.rb_kwargs.get("train_start") - train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end - test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d") - proxy_forecast_model_task = { - # "model": "qlib.contrib.model.linear.LinearModel", - "dataset": { - "class": "qlib.data.dataset.DatasetH", - "kwargs": { - "handler": f"file://{(DIRNAME / self.proxy_hd).absolute()}", - "segments": { - "train": (train_start, train_end), - "test": (test_start, sim_task["dataset"]["kwargs"]["segments"]["test"][1]), - }, - }, - }, - # "record": ["qlib.workflow.record_temp.SignalRecord"] - } - # the proxy_forecast_model_task will be used to create meta tasks. - # The test date of first task will be 2011-01-01. 
Each test segment will be about 20days - # The tasks include all training tasks and test tasks. - - # 2) preparing meta dataset - kwargs = dict( - task_tpl=proxy_forecast_model_task, - step=self.step, - segments=0.62, # keep test period consistent with the dataset yaml - trunc_days=1 + self.horizon, - hist_step_n=30, - fill_method=fill_method, - rolling_ext_days=0, - ) - # NOTE: - # the input of meta model (internal data) are shared between proxy model and final forecasting model - # but their task test segment are not aligned! It worked in my previous experiment. - # So the misalignment will not affect the effectiveness of the method. - with self._internal_data_path.open("rb") as f: - internal_data = pickle.load(f) - - md = MetaDatasetDS(exp_name=internal_data, **kwargs) - - # 3) train and logging meta model - with R.start(experiment_name=self.meta_exp_name): - R.log_params(**kwargs) - mm = MetaModelDS( - step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha - ) - mm.fit(md) - R.save_objects(model=mm) - - @property - def _task_path(self): - return DIRNAME / f"tasks_s{self.step}.pkl" - - def meta_inference(self): - """ - Leverage meta-model for inference: - - Given - - baseline tasks - - input for meta model(internal data) - - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model) - """ - # 1) get meta model - exp = R.get_exp(experiment_name=self.meta_exp_name) - rec = exp.list_recorders(rtype=exp.RT_L)[0] - meta_model: MetaModelDS = rec.load_object("model") - - # 2) - # we are transfer to knowledge of meta model to final forecasting tasks. 
- # Create MetaTaskDataset for the final forecasting tasks - # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary - - # 2.1) get previous config - param = rec.list_params() - trunc_days = int(param["trunc_days"]) - step = int(param["step"]) - hist_step_n = int(param["hist_step_n"]) - fill_method = param.get("fill_method", "max") - - rb = RollingBenchmark(model_type=self.forecast_model, **self.rb_kwargs) - task_l = rb.create_rolling_tasks() - - # 2.2) create meta dataset for final dataset - kwargs = dict( - task_tpl=task_l, - step=step, - segments=0.0, # all the tasks are for testing - trunc_days=trunc_days, - hist_step_n=hist_step_n, - fill_method=fill_method, - task_mode=MetaTask.PROC_MODE_TRANSFER, - ) - - with self._internal_data_path.open("rb") as f: - internal_data = pickle.load(f) - mds = MetaDatasetDS(exp_name=internal_data, **kwargs) - - # 3) meta model make inference and get new qlib task - new_tasks = meta_model.inference(mds) - with self._task_path.open("wb") as f: - pickle.dump(new_tasks, f) - - def train_and_eval_tasks(self): - """ - Training the tasks generated by meta model - Then evaluate it - """ - with self._task_path.open("rb") as f: - tasks = pickle.load(f) - rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.forecast_model, **self.rb_kwargs) - rb.train_rolling_tasks(tasks) - rb.ens_rolling() - rb.update_rolling_rec() - - def run_all(self): - # 1) file: handler_proxy.pkl (self.proxy_hd) - self.dump_data_for_proxy_model() - # 2) - # file: internal_data_s20.pkl - # mlflow: data_sim_s20, models for calculating meta_ipt - self.dump_meta_ipt() - # 3) meta model will be stored in `DDG-DA` - self.train_meta_model() - # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added) - self.meta_inference() - # 5) load the saved tasks and train model - self.train_and_eval_tasks() + for f in self.CONF_LIST: + if conf_path.samefile(f): + break + else: + self.logger.warning("Model type is not in the 
benchmark!") if __name__ == "__main__": GetData().qlib_data(exists_skip=True) auto_init() - fire.Fire(DDGDA) + fire.Fire(DDGDABench) diff --git a/examples/benchmarks_dynamic/baseline/README.md b/examples/benchmarks_dynamic/baseline/README.md index 17e10482d..f17651412 100644 --- a/examples/benchmarks_dynamic/baseline/README.md +++ b/examples/benchmarks_dynamic/baseline/README.md @@ -5,11 +5,12 @@ This is the framework of periodically Rolling Retrain (RR) forecasting models. R ## Run the Code Users can try RR by running the following command: ```bash - python rolling_benchmark.py run_all + python rolling_benchmark.py run ``` The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `model_type` parameter. For example, users can try `LightGBM` forecasting models by running the following command: ```bash - python rolling_benchmark.py --model_type="gbdt" run_all -``` \ No newline at end of file + python rolling_benchmark.py --conf_path=workflow_config_lightgbm_Alpha158.yaml run + +``` diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index b0c7aea4f..1ce30ef8a 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -1,161 +1,33 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from typing import Optional -from qlib.model.ens.ensemble import RollingEnsemble -from qlib.utils import init_instance_by_config -import fire -import yaml -import pandas as pd -from qlib import auto_init from pathlib import Path -from tqdm.auto import tqdm -from qlib.model.trainer import TrainerR -from qlib.log import get_module_logger -from qlib.utils.data import update_config -from qlib.workflow import R +from typing import Union + +import fire + +from qlib import auto_init +from qlib.contrib.rolling.base import Rolling from qlib.tests.data import GetData DIRNAME = Path(__file__).absolute().resolve().parent -from qlib.workflow.task.gen import task_generator, RollingGen -from qlib.workflow.task.collect import RecorderCollector -from qlib.workflow.record_temp import PortAnaRecord, SigAnaRecord -class RollingBenchmark: - """ - **NOTE** - before running the example, please clean your previous results with following command - - `rm -r mlruns` +class RollingBenchmark(Rolling): + # The config in the README.md + CONF_LIST = [DIRNAME / "workflow_config_linear_Alpha158.yaml", DIRNAME / "workflow_config_lightgbm_Alpha158.yaml"] - """ + DEFAULT_CONF = CONF_LIST[0] - def __init__( - self, - rolling_exp: str = "rolling_models", - model_type: str = "linear", - h_path: Optional[str] = None, - train_start: Optional[str] = None, - test_end: Optional[str] = None, - task_ext_conf: Optional[dict] = None, - ) -> None: - """ - Parameters - ---------- - rolling_exp : str - The name for the experiments for rolling - model_type : str - The model to be boosted. - h_path : Optional[str] - the dumped data handler; - test_end : Optional[str] - the test end for the data. It is typically used together with the handler - train_start : Optional[str] - the train start for the data. It is typically used together with the handler. 
- task_ext_conf : Optional[dict] - some option to update the - """ - self.step = 20 - self.horizon = 20 - self.rolling_exp = rolling_exp - self.model_type = model_type - self.h_path = h_path - self.train_start = train_start - self.test_end = test_end - self.logger = get_module_logger("RollingBenchmark") - self.task_ext_conf = task_ext_conf + def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None: + # This code is for being compatible with the previous old code + conf_path = Path(conf_path) + super().__init__(conf_path=conf_path, horizon=horizon, **kwargs) - def basic_task(self): - """For fast training rolling""" - if self.model_type == "gbdt": - conf_path = DIRNAME / "workflow_config_lightgbm_Alpha158.yaml" - # dump the processed data on to disk for later loading to speed up the processing - h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon) - elif self.model_type == "linear": - # We use ridge regression to stabilize the performance - conf_path = DIRNAME / "workflow_config_linear_Alpha158.yaml" - h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon) + for f in self.CONF_LIST: + if conf_path.samefile(f): + break else: - raise AssertionError("Model type is not supported!") - - if self.h_path is not None: - h_path = Path(self.h_path) - - with conf_path.open("r") as f: - conf = yaml.safe_load(f) - - # modify dataset horizon - conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ - "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) - ] - - task = conf["task"] - - if self.task_ext_conf is not None: - task = update_config(task, self.task_ext_conf) - - if not h_path.exists(): - h_conf = task["dataset"]["kwargs"]["handler"] - h = init_instance_by_config(h_conf) - h.to_pickle(h_path, dump_all=True) - - task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" - task["record"] = ["qlib.workflow.record_temp.SignalRecord"] - - if self.train_start is not 
None: - seg = task["dataset"]["kwargs"]["segments"]["train"] - task["dataset"]["kwargs"]["segments"]["train"] = pd.Timestamp(self.train_start), seg[1] - - if self.test_end is not None: - seg = task["dataset"]["kwargs"]["segments"]["test"] - task["dataset"]["kwargs"]["segments"]["test"] = seg[0], pd.Timestamp(self.test_end) - self.logger.info(task) - return task - - def create_rolling_tasks(self): - task = self.basic_task() - task_l = task_generator( - task, RollingGen(step=self.step, trunc_days=self.horizon + 1) - ) # the last two days should be truncated to avoid information leakage - return task_l - - def train_rolling_tasks(self, task_l=None): - if task_l is None: - task_l = self.create_rolling_tasks() - trainer = TrainerR(experiment_name=self.rolling_exp) - trainer(task_l) - - COMB_EXP = "rolling" - - def ens_rolling(self): - rc = RecorderCollector( - experiment=self.rolling_exp, - artifacts_key=["pred", "label"], - process_list=[RollingEnsemble()], - # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]), - artifacts_path={"pred": "pred.pkl", "label": "label.pkl"}, - ) - res = rc() - with R.start(experiment_name=self.COMB_EXP): - R.log_params(exp_name=self.rolling_exp) - R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]}) - - def update_rolling_rec(self): - """ - Evaluate the combined rolling results - """ - for _, rec in R.list_recorders(experiment_name=self.COMB_EXP).items(): - for rt_cls in SigAnaRecord, PortAnaRecord: - rt = rt_cls(recorder=rec, skip_existing=True) - rt.generate() - print(f"Your evaluation results can be found in the experiment named `{self.COMB_EXP}`.") - - def run_all(self): - # the results will be save in mlruns. 
- # 1) each rolling task is saved in rolling_models - self.train_rolling_tasks() - # 2) combined rolling tasks and evaluation results are saved in rolling - self.ens_rolling() - self.update_rolling_rec() + self.logger.warning("Model type is not in the benchmark!") if __name__ == "__main__": diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml index 2d441dea9..5ae316801 100644 --- a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml index 78ec4e612..a5c272f28 100644 --- a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/highfreq/highfreq_handler.py b/examples/highfreq/highfreq_handler.py index c15c3ec41..7df564b7b 100644 --- a/examples/highfreq/highfreq_handler.py +++ b/examples/highfreq/highfreq_handler.py @@ -14,7 +14,6 @@ class HighFreqHandler(DataHandlerLP): fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index c631d72e7..02948c5a1 100644 --- 
a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -18,7 +18,6 @@ from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Se class HighfreqWorkflow: - SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull, Cut], "expression_cache": None} MARKET = "all" diff --git a/examples/hyperparameter/LightGBM/hyperparameter_158.py b/examples/hyperparameter/LightGBM/hyperparameter_158.py index 8c3e9f3e8..7520390a6 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_158.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_158.py @@ -35,7 +35,6 @@ def objective(trial): if __name__ == "__main__": - provider_uri = "~/.qlib/qlib_data/cn_data" GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region="cn") diff --git a/examples/hyperparameter/LightGBM/hyperparameter_360.py b/examples/hyperparameter/LightGBM/hyperparameter_360.py index 322c0fa42..7ba28c78f 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_360.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_360.py @@ -38,7 +38,6 @@ def objective(trial): if __name__ == "__main__": - provider_uri = "~/.qlib/qlib_data/cn_data" GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region=REG_CN) diff --git a/examples/model_interpreter/feature.py b/examples/model_interpreter/feature.py index bfc58fc84..8ad673d0e 100644 --- a/examples/model_interpreter/feature.py +++ b/examples/model_interpreter/feature.py @@ -11,7 +11,6 @@ from qlib.tests.config import CSI300_GBDT_TASK if __name__ == "__main__": - # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) diff --git a/examples/portfolio/prepare_riskdata.py b/examples/portfolio/prepare_riskdata.py index 3168e2f37..e502a1ff7 100644 --- a/examples/portfolio/prepare_riskdata.py +++ 
b/examples/portfolio/prepare_riskdata.py @@ -9,7 +9,6 @@ from qlib.model.riskmodel import StructuredCovEstimator def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): - universe = D.features(D.instruments("csi300"), ["$close"], start_time=start_time).swaplevel().sort_index() price_all = ( @@ -20,7 +19,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): riskmodel = StructuredCovEstimator() for i in range(T - 1, len(price_all)): - date = price_all.index[i] ref_date = price_all.index[i - T + 1] @@ -47,7 +45,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): if __name__ == "__main__": - import qlib qlib.init(provider_uri="~/.qlib/qlib_data/cn_data") diff --git a/examples/rolling_process_data/workflow.py b/examples/rolling_process_data/workflow.py index 434d365e5..d1c03866a 100644 --- a/examples/rolling_process_data/workflow.py +++ b/examples/rolling_process_data/workflow.py @@ -13,7 +13,6 @@ from qlib.tests.data import GetData class RollingDataWorkflow: - MARKET = "csi300" start_time = "2010-01-01" end_time = "2019-12-31" @@ -93,7 +92,6 @@ class RollingDataWorkflow: dataset = init_instance_by_config(dataset_config) for rolling_offset in range(self.rolling_cnt): - print(f"===========rolling{rolling_offset} start===========") if rolling_offset: dataset.config( diff --git a/examples/workflow_by_code.py b/examples/workflow_by_code.py index 0c4d73a51..94de5c082 100644 --- a/examples/workflow_by_code.py +++ b/examples/workflow_by_code.py @@ -17,7 +17,6 @@ from qlib.tests.config import CSI300_BENCH, CSI300_GBDT_TASK if __name__ == "__main__": - # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) diff --git a/qlib/__init__.py b/qlib/__init__.py index a963a8c28..3355ac04f 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -77,7 +77,6 @@ def init(default_conf="client", **kwargs): def 
_mount_nfs_uri(provider_uri, mount_path, auto_mount: bool = False): - LOG = get_module_logger("mount nfs", level=logging.INFO) if mount_path is None: raise ValueError(f"Invalid mount path: {mount_path}!") diff --git a/qlib/backtest/__init__.py b/qlib/backtest/__init__.py index bb8ca731b..d784aed57 100644 --- a/qlib/backtest/__init__.py +++ b/qlib/backtest/__init__.py @@ -182,7 +182,6 @@ def get_strategy_executor( exchange_kwargs: dict = {}, pos_type: str = "Position", ) -> Tuple[BaseStrategy, BaseExecutor]: - # NOTE: # - for avoiding recursive import # - typing annotations is not reliable diff --git a/qlib/backtest/exchange.py b/qlib/backtest/exchange.py index a752a9f8c..1ab0d07a7 100644 --- a/qlib/backtest/exchange.py +++ b/qlib/backtest/exchange.py @@ -638,7 +638,6 @@ class Exchange: random.seed(0) random.shuffle(sorted_ids) for stock_id in sorted_ids: - # Do not generate order for the non-tradable stocks if not self.is_stock_tradable(stock_id=stock_id, start_time=start_time, end_time=end_time): continue diff --git a/qlib/config.py b/qlib/config.py index 7b726c658..7910dab73 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -293,7 +293,6 @@ class QlibConfig(Config): """ def __init__(self, provider_uri: Union[str, Path, dict], mount_path: Union[str, Path, dict]): - """ The relation of `provider_uri` and `mount_path` - `mount_path` is used only if provider_uri is an NFS path diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 9ce522cc0..8b40dba1f 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -130,7 +130,6 @@ class MTSDatasetH(DatasetH): input_size=None, **kwargs, ): - assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage" assert memory_mode in ["sample", "daily"], "unsupported memory mode" assert memory_mode == "sample" or batch_size < 0, "daily memory requires daily sampling (`batch_size < 0`)" @@ -153,7 +152,6 @@ class MTSDatasetH(DatasetH): super().__init__(handler, 
segments, **kwargs) def setup_data(self, handler_kwargs: dict = None, **kwargs): - super().setup_data(**kwargs) if handler_kwargs is not None: @@ -288,7 +286,6 @@ class MTSDatasetH(DatasetH): daily_count = [] # store number of samples for each day for j in indices[i : i + batch_size]: - # normal sampling: self.batch_size > 0 => slices is a list => slices_subset is a slice # daily sampling: self.batch_size < 0 => slices is a nested list => slices_subset is a list slices_subset = slices[j] @@ -297,7 +294,6 @@ class MTSDatasetH(DatasetH): # each slices_subset contains a list of slices for multiple stocks # NOTE: daily sampling is used in 1) eval mode, 2) train mode with self.batch_size < 0 if self.batch_size < 0: - # store daily index idx = self._daily_index.index[j] # daily_index.index is the index of the original data daily_index.append(idx) @@ -320,7 +316,6 @@ class MTSDatasetH(DatasetH): slices_subset = [slices_subset] for slc in slices_subset: - # legacy support for Alpha360 data by `input_size` if self.input_size: data.append(self._data[slc.stop - 1].reshape(self.input_size, -1).T) diff --git a/qlib/contrib/data/highfreq_handler.py b/qlib/contrib/data/highfreq_handler.py index 638fbf0e8..8eed4814f 100644 --- a/qlib/contrib/data/highfreq_handler.py +++ b/qlib/contrib/data/highfreq_handler.py @@ -17,7 +17,6 @@ class HighFreqHandler(DataHandlerLP): fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -318,7 +317,6 @@ class HighFreqOrderHandler(DataHandlerLP): inst_processors=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) diff --git a/qlib/contrib/data/highfreq_processor.py b/qlib/contrib/data/highfreq_processor.py index f7041e9f4..db2a6e39b 100644 --- 
a/qlib/contrib/data/highfreq_processor.py +++ b/qlib/contrib/data/highfreq_processor.py @@ -29,7 +29,6 @@ class HighFreqNorm(Processor): feature_save_dir: str, norm_groups: Dict[str, int], ): - self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time self.feature_save_dir = feature_save_dir diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index e3689d964..9349a12fe 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -49,6 +49,8 @@ class InternalData: # 1) prepare the prediction of proxy models perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects + # The only thing we want to save is the prediction + perf_task_tpl["record"] = ["qlib.workflow.record_temp.SignalRecord"] trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs) # NOTE: diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py index 4b0db7f4b..ca5e8ba86 100644 --- a/qlib/contrib/model/pytorch_adarnn.py +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -246,7 +246,6 @@ class ADARNN(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], @@ -318,7 +317,6 @@ class ADARNN(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index b0770e2bd..2fe7cce3b 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -146,7 +146,6 @@ class ALSTM(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -155,7 +154,6 @@ class ALSTM(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, 
x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -165,7 +163,6 @@ class ALSTM(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -181,7 +178,6 @@ class ALSTM(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -194,7 +190,6 @@ class ALSTM(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -217,7 +212,6 @@ class ALSTM(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -282,7 +276,6 @@ class ALSTM(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 3ab8ed8ab..008d78940 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -156,7 +156,6 @@ class ALSTM(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -165,10 +164,9 @@ class ALSTM(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.ALSTM_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -181,14 +179,12 @@ class ALSTM(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.ALSTM_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # 
feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -295,7 +291,6 @@ class ALSTM(Model): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 127408877..63ebd480a 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -154,7 +154,6 @@ class GATs(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -175,7 +174,6 @@ class GATs(Model): return daily_index, daily_count def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) self.GAT_model.train() @@ -197,7 +195,6 @@ class GATs(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -230,7 +227,6 @@ class GATs(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index 1b75efe89..b1239f78e 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -32,7 +32,6 @@ class DailyBatchSampler(Sampler): self.daily_index[0] = 0 def __iter__(self): - for idx, count in zip(self.daily_index, self.daily_count): yield np.arange(idx, idx + count) @@ -173,7 +172,6 @@ class GATs(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -194,11 +192,9 @@ class GATs(Model): return daily_index, daily_count def train_epoch(self, data_loader): - self.GAT_model.train() for data in data_loader: - data = data.squeeze() feature = data[:, :, 
0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -212,14 +208,12 @@ class GATs(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.GAT_model.eval() scores = [] losses = [] for data in data_loader: - data = data.squeeze() feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 @@ -240,7 +234,6 @@ class GATs(Model): evals_result=dict(), save_path=None, ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) if dl_train.empty or dl_valid.empty: @@ -329,7 +322,6 @@ class GATs(Model): preds = [] for data in test_loader: - data = data.squeeze() feature = data[:, :, 0:-1].to(self.device) diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 10998236b..2a476a657 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -146,7 +146,6 @@ class GRU(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -155,7 +154,6 @@ class GRU(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -165,7 +163,6 @@ class GRU(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -181,7 +178,6 @@ class GRU(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -194,7 +190,6 @@ class GRU(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -217,7 +212,6 @@ class GRU(Model): evals_result=dict(), save_path=None, ): - df_train, 
df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -282,7 +276,6 @@ class GRU(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index b588392a2..2e5076ea6 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -154,7 +154,6 @@ class GRU(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -163,10 +162,9 @@ class GRU(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.GRU_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -179,14 +177,12 @@ class GRU(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.GRU_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -293,7 +289,6 @@ class GRU(Model): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_hist.py b/qlib/contrib/model/pytorch_hist.py index f7b565dc5..5c3cd66a3 100644 --- a/qlib/contrib/model/pytorch_hist.py +++ b/qlib/contrib/model/pytorch_hist.py @@ -160,7 +160,6 @@ class HIST(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric == "ic": @@ -189,7 +188,6 @@ class HIST(Model): return daily_index, daily_count def train_epoch(self, x_train, y_train, stock_index): - stock2concept_matrix = 
np.load(self.stock2concept) x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -214,7 +212,6 @@ class HIST(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y, stock_index): - # prepare training data stock2concept_matrix = np.load(self.stock2concept) x_values = data_x.values diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py index d38ef9ad4..46a25c00f 100644 --- a/qlib/contrib/model/pytorch_igmtf.py +++ b/qlib/contrib/model/pytorch_igmtf.py @@ -153,7 +153,6 @@ class IGMTF(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric == "ic": @@ -201,7 +200,6 @@ class IGMTF(Model): return train_hidden, train_hidden_day def train_epoch(self, x_train, y_train, train_hidden, train_hidden_day): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -222,7 +220,6 @@ class IGMTF(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y, train_hidden, train_hidden_day): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -254,7 +251,6 @@ class IGMTF(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_localformer.py b/qlib/contrib/model/pytorch_localformer.py index 6e7d91180..830bc59f0 100644 --- a/qlib/contrib/model/pytorch_localformer.py +++ b/qlib/contrib/model/pytorch_localformer.py @@ -46,7 +46,6 @@ class LocalformerModel(Model): seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -96,7 +95,6 @@ class LocalformerModel(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -105,7 +103,6 @@ class LocalformerModel(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -115,7 +112,6 @@ class LocalformerModel(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -131,7 +127,6 @@ class LocalformerModel(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -144,7 +139,6 @@ class LocalformerModel(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -167,7 +161,6 @@ class LocalformerModel(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -232,7 +225,6 @@ class LocalformerModel(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_localformer_ts.py b/qlib/contrib/model/pytorch_localformer_ts.py index 18ef7f112..b05c2d311 100644 --- a/qlib/contrib/model/pytorch_localformer_ts.py +++ b/qlib/contrib/model/pytorch_localformer_ts.py @@ -44,7 +44,6 @@ class LocalformerModel(Model): seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -96,7 +95,6 @@ class LocalformerModel(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -105,7 +103,6 @@ class LocalformerModel(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.model.train() for data in data_loader: @@ -121,14 +118,12 @@ class LocalformerModel(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.model.eval() scores = [] losses = [] for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -148,7 +143,6 @@ class LocalformerModel(Model): evals_result=dict(), save_path=None, ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) if dl_train.empty or dl_valid.empty: diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index a68cf5eac..168be6ca5 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -142,7 +142,6 @@ class LSTM(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -151,7 +150,6 @@ class LSTM(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -161,7 +159,6 @@ class LSTM(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -177,7 +174,6 @@ class LSTM(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -190,7 +186,6 @@ class 
LSTM(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -212,7 +207,6 @@ class LSTM(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index f1a3c55e8..8ecafc2d5 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -150,7 +150,6 @@ class LSTM(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -159,10 +158,9 @@ class LSTM(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.LSTM_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -175,14 +173,12 @@ class LSTM(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.LSTM_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -288,7 +284,6 @@ class LSTM(Model): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index 29bae94a3..e79f475d6 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -306,7 +306,6 @@ class SFM(Model): return self.device != torch.device("cpu") def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -319,7 +318,6 @@ class SFM(Model): indices = 
np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -336,7 +334,6 @@ class SFM(Model): return np.mean(losses), np.mean(scores) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -346,7 +343,6 @@ class SFM(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -367,7 +363,6 @@ class SFM(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], @@ -431,7 +426,6 @@ class SFM(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index adc7354fe..3c698edad 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -256,7 +256,6 @@ class TabnetModel(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break feature = x_values[indices[i : i + self.batch_size]].float().to(self.device) @@ -283,7 +282,6 @@ class TabnetModel(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -308,7 +306,6 @@ class TabnetModel(Model): self.tabnet_decoder.train() for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -339,7 +336,6 @@ class TabnetModel(Model): losses = [] for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py index 2af7a04ea..38e289342 100755 --- a/qlib/contrib/model/pytorch_tcn.py +++ b/qlib/contrib/model/pytorch_tcn.py @@ 
-154,7 +154,6 @@ class TCN(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -163,7 +162,6 @@ class TCN(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -173,7 +171,6 @@ class TCN(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -200,7 +197,6 @@ class TCN(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -223,7 +219,6 @@ class TCN(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -286,7 +281,6 @@ class TCN(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index bb2e5ea5b..605da62c4 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -155,7 +155,6 @@ class TCN(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -164,7 +163,6 @@ class TCN(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.TCN_model.train() for data in data_loader: @@ -181,7 +179,6 @@ class TCN(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.TCN_model.eval() scores = [] @@ -277,7 +274,6 @@ class TCN(Model): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_tcts.py 
b/qlib/contrib/model/pytorch_tcts.py index b46835cb6..651bd03d2 100644 --- a/qlib/contrib/model/pytorch_tcts.py +++ b/qlib/contrib/model/pytorch_tcts.py @@ -119,7 +119,6 @@ class TCTS(Model): ) def loss_fn(self, pred, label, weight): - if self.mode == "hard": loc = torch.argmax(weight, 1) loss = (pred - label[np.arange(weight.shape[0]), loc]) ** 2 @@ -157,7 +156,6 @@ class TCTS(Model): for i in range(self.steps): for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -191,7 +189,6 @@ class TCTS(Model): # fix forecasting model and valid weight model for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -212,7 +209,6 @@ class TCTS(Model): self.weight_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -224,7 +220,6 @@ class TCTS(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -282,7 +277,6 @@ class TCTS(Model): verbose=True, save_path=None, ): - self.fore_model = GRUModel( d_feat=self.d_feat, hidden_size=self.hidden_size, @@ -366,7 +360,6 @@ class TCTS(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_tra.py b/qlib/contrib/model/pytorch_tra.py index 46d362c68..964febf11 100644 --- a/qlib/contrib/model/pytorch_tra.py +++ b/qlib/contrib/model/pytorch_tra.py @@ -84,7 +84,6 @@ class TRAModel(Model): transport_method="none", memory_mode="sample", ): - self.logger = get_module_logger("TRA") assert memory_mode in ["sample", "daily"], "invalid memory mode" @@ -136,7 +135,6 @@ class TRAModel(Model): self._init_model() def _init_model(self): - self.logger.info("init TRAModel...") self.model = eval(self.model_type)(**self.model_config).to(device) @@ -176,7 +174,6 @@ class 
TRAModel(Model): self.global_step = -1 def train_epoch(self, epoch, data_set, is_pretrain=False): - self.model.train() self.tra.train() data_set.train() @@ -274,7 +271,6 @@ class TRAModel(Model): return total_loss def test_epoch(self, epoch, data_set, return_pred=False, prefix="test", is_pretrain=False): - self.model.eval() self.tra.eval() data_set.eval() @@ -360,7 +356,6 @@ class TRAModel(Model): return metrics, preds, probs, P_all def _fit(self, train_set, valid_set, test_set, evals_result, is_pretrain=True): - best_score = -1 best_epoch = 0 stop_rounds = 0 @@ -419,7 +414,6 @@ class TRAModel(Model): return best_score def fit(self, dataset, evals_result=dict()): - assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`" train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"]) @@ -503,7 +497,6 @@ class TRAModel(Model): json.dump(info, f) def predict(self, dataset, segment="test"): - assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`" if not self.fitted: @@ -571,7 +564,6 @@ class RNN(nn.Module): self.output_size = hidden_size def forward(self, x): - if self.input_proj is not None: x = self.input_proj(x) @@ -647,7 +639,6 @@ class Transformer(nn.Module): self.output_size = hidden_size def forward(self, x): - x = x.permute(1, 0, 2).contiguous() # the first dim need to be time x = self.pe(x) @@ -713,7 +704,6 @@ class TRA(nn.Module): child.reset_parameters() def forward(self, hidden, hist_loss): - preds = self.predictors(hidden) if self.num_states == 1: # no need for router when having only one prediction diff --git a/qlib/contrib/model/pytorch_transformer.py b/qlib/contrib/model/pytorch_transformer.py index 66e5b2c4e..f4b7a06eb 100644 --- a/qlib/contrib/model/pytorch_transformer.py +++ b/qlib/contrib/model/pytorch_transformer.py @@ -45,7 +45,6 @@ class TransformerModel(Model): seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -95,7 +94,6 @@ class TransformerModel(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -104,7 +102,6 @@ class TransformerModel(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -114,7 +111,6 @@ class TransformerModel(Model): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -130,7 +126,6 @@ class TransformerModel(Model): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -143,7 +138,6 @@ class TransformerModel(Model): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -166,7 +160,6 @@ class TransformerModel(Model): evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -231,7 +224,6 @@ class TransformerModel(Model): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_transformer_ts.py b/qlib/contrib/model/pytorch_transformer_ts.py index 6cffded9c..84b093805 100644 --- a/qlib/contrib/model/pytorch_transformer_ts.py +++ b/qlib/contrib/model/pytorch_transformer_ts.py @@ -43,7 +43,6 @@ class TransformerModel(Model): seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -93,7 +92,6 @@ class TransformerModel(Model): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -102,7 +100,6 @@ class TransformerModel(Model): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.model.train() for data in data_loader: @@ -118,14 +115,12 @@ class TransformerModel(Model): self.train_optimizer.step() def test_epoch(self, data_loader): - self.model.eval() scores = [] losses = [] for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -145,7 +140,6 @@ class TransformerModel(Model): evals_result=dict(), save_path=None, ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index d38655ebd..67bedafa8 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -30,7 +30,6 @@ class XGBModel(Model, FeatureInt): reweighter=None, **kwargs ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], diff --git a/qlib/contrib/report/data/ana.py b/qlib/contrib/report/data/ana.py index 782a92d5a..567ef311d 100644 --- a/qlib/contrib/report/data/ana.py +++ b/qlib/contrib/report/data/ana.py @@ -30,7 +30,6 @@ class CombFeaAna(FeaAnalyser): """The statistics of features are finished in the underlying analysers""" def plot_all(self, *args, **kwargs): - ax_gen = iter(sub_fig_generator(row_n=len(self._fea_ana_l), *args, **kwargs)) for col in self._dataset: diff --git a/qlib/contrib/report/data/base.py b/qlib/contrib/report/data/base.py index 1e7e092af..a91eda48e 100644 --- a/qlib/contrib/report/data/base.py +++ b/qlib/contrib/report/data/base.py @@ -28,7 +28,6 
@@ class FeaAnalyser: return False def plot_all(self, *args, **kwargs): - ax_gen = iter(sub_fig_generator(*args, **kwargs)) for col in self._dataset: if not self.skip(col): diff --git a/qlib/contrib/report/graph.py b/qlib/contrib/report/graph.py index c5f932978..f9cf517ea 100644 --- a/qlib/contrib/report/graph.py +++ b/qlib/contrib/report/graph.py @@ -15,7 +15,6 @@ from plotly.figure_factory import create_distplot class BaseGraph: - _name = None def __init__( diff --git a/qlib/contrib/rolling/__init__.py b/qlib/contrib/rolling/__init__.py new file mode 100644 index 000000000..b940486fd --- /dev/null +++ b/qlib/contrib/rolling/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +The difference between me and the scripts in examples/benchmarks/benchmarks_dynamic +- This module only focus provide a general rolling implementation. + Anything specific that benchmark is placed in examples/benchmarks/benchmarks_dynamic +""" diff --git a/qlib/contrib/rolling/__main__.py b/qlib/contrib/rolling/__main__.py new file mode 100644 index 000000000..461c0e777 --- /dev/null +++ b/qlib/contrib/rolling/__main__.py @@ -0,0 +1,16 @@ +import fire +from qlib import auto_init +from qlib.contrib.rolling.base import Rolling +from qlib.utils.mod import find_all_classes + +if __name__ == "__main__": + sub_commands = {} + for cls in find_all_classes("qlib.contrib.rolling", Rolling): + sub_commands[cls.__module__.split(".")[-1]] = cls + # The sub_commands will be like + # {'base': , ...} + # So the you can run it with commands like command below + # - `python -m qlib.contrib.rolling base --conf_path run` + # - base can be replace with other module names + auto_init() + fire.Fire(sub_commands) diff --git a/qlib/contrib/rolling/base.py b/qlib/contrib/rolling/base.py new file mode 100644 index 000000000..d179efb38 --- /dev/null +++ b/qlib/contrib/rolling/base.py @@ -0,0 +1,246 @@ +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT License. +from copy import deepcopy +from pathlib import Path +from typing import List, Optional, Union + +import fire +import pandas as pd +import yaml + +from qlib import auto_init +from qlib.log import get_module_logger +from qlib.model.ens.ensemble import RollingEnsemble +from qlib.model.trainer import TrainerR +from qlib.utils import get_cls_kwargs, init_instance_by_config +from qlib.utils.data import update_config +from qlib.workflow import R +from qlib.workflow.record_temp import SignalRecord +from qlib.workflow.task.collect import RecorderCollector +from qlib.workflow.task.gen import RollingGen, task_generator +from qlib.workflow.task.utils import replace_task_handler_with_cache + + +class Rolling: + """ + The motivation of Rolling Module + - It only focus **offlinely** turn a specific task to rollinng + - To make the implementation easier, following factors are ignored. + - The tasks is dependent (e.g. time series). + + Related modules and difference from me: + - MetaController: It is learning how to handle a task (e.g. learning to learn). + - But rolling is about how to split a single task into tasks in time series and run them. + - OnlineStrategy: It is focusing on serving a model, the model can be updated time dependently in time. + - Rolling is much simpler and is only for testing rolling models offline. It does not want to share the interface with OnlineStrategy. + + The code about rolling is shared in `task_generator` & `RollingGen` level between me and the above modules + But it is for different purpose, so other parts are not shared. + + + .. code-block:: shell + + # here is an typical use case of the module. 
+ python -m qlib.contrib.rolling.base --conf_path run + + **NOTE** + before running the example, please clean your previous results with following command + - `rm -r mlruns` + - Because it is very hard to permanently delete a experiment (it will be moved into .trash and raise error when creating experiment with same name). + + """ + + def __init__( + self, + conf_path: Union[str, Path], + exp_name: Optional[str] = None, + horizon: Optional[int] = 20, + step: int = 20, + h_path: Optional[str] = None, + train_start: Optional[str] = None, + test_end: Optional[str] = None, + task_ext_conf: Optional[dict] = None, + rolling_exp: Optional[str] = None, + ) -> None: + """ + Parameters + ---------- + conf_path : str + Path to the config for rolling. + exp_name : Optional[str] + The exp name of the outputs (Output is a record which contains the concatenated predictions of rolling records). + horizon: Optional[int] = 20, + The horizon of the prediction target. + This is used to override the prediction horizon of the file. + h_path : Optional[str] + the dumped data handler; + It may come from other data source. It will override the data handler in the config. + test_end : Optional[str] + the test end for the data. It is typically used together with the handler + You can do the same thing with task_ext_conf in a more complicated way + train_start : Optional[str] + the train start for the data. It is typically used together with the handler. + You can do the same thing with task_ext_conf in a more complicated way + task_ext_conf : Optional[dict] + some option to update the task config. + rolling_exp : Optional[str] + The name for the experiments for rolling. + It will contains a lot of record in an experiment. Each record corresponds to a specific rolling. 
+ Please note that it is different from the final experiments + """ + self.logger = get_module_logger("Rolling") + self.conf_path = Path(conf_path) + self.exp_name = exp_name + self._rid = None # the final combined recorder id in `exp_name` + + self.step = step + assert horizon is not None, "Current version does not support extracting horizon from the underlying dataset" + self.horizon = horizon + if rolling_exp is None: + datetime_suffix = pd.Timestamp.now().strftime("%Y%m%d%H%M%S") + self.rolling_exp = f"rolling_models_{datetime_suffix}" + else: + self.rolling_exp = rolling_exp + self.logger.warning( + "Using user specifiied name for rolling models. So the experiment names duplicateds. " + "Please manually remove your experiment for rolling model with command like `rm -r mlruns`." + " Otherwise it will prevents the creating of experimen with same name" + ) + self.train_start = train_start + self.test_end = test_end + self.task_ext_conf = task_ext_conf + self.h_path = h_path + + # FIXME: + # - the qlib_init section will be ignored by me. + # - So we have to design a priority mechanism to solve this issue. + + def _raw_conf(self) -> dict: + with self.conf_path.open("r") as f: + return yaml.safe_load(f) + + def _replace_hanler_with_cache(self, task: dict): + """ + Due to the data processing part in original rolling is slow. 
So we have to + This class tries to add more feature + """ + if self.h_path is not None: + h_path = Path(self.h_path) + task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" + else: + task = replace_task_handler_with_cache(task, self.conf_path.parent) + return task + + def _update_start_end_time(self, task: dict): + if self.train_start is not None: + seg = task["dataset"]["kwargs"]["segments"]["train"] + task["dataset"]["kwargs"]["segments"]["train"] = pd.Timestamp(self.train_start), seg[1] + + if self.test_end is not None: + seg = task["dataset"]["kwargs"]["segments"]["test"] + task["dataset"]["kwargs"]["segments"]["test"] = seg[0], pd.Timestamp(self.test_end) + return task + + def basic_task(self, enable_handler_cache: Optional[bool] = True): + """ + The basic task may not be the exactly same as the config from `conf_path` from __init__ due to + - some parameters could be overriding by some parameters from __init__ + - user could implementing sublcass to change it for higher performance + """ + task: dict = self._raw_conf()["task"] + task = deepcopy(task) + + # modify dataset horizon + # NOTE: + # It assumpts that the label can be modifiled in the handler's kwargs + # But is not always a valid. It is only valid in the predefined dataset `Alpha158` & `Alpha360` + if self.horizon is None: + # TODO: + # - get horizon automatically from the expression!!!! 
+ raise NotImplementedError(f"This type of input is not supported") + else: + self.logger.info("The prediction horizon is overrided") + task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ + "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) + ] + + if enable_handler_cache: + task = self._replace_hanler_with_cache(task) + task = self._update_start_end_time(task) + + if self.task_ext_conf is not None: + task = update_config(task, self.task_ext_conf) + self.logger.info(task) + return task + + def get_task_list(self) -> List[dict]: + """return a batch of tasks for rolling.""" + task = self.basic_task() + task_l = task_generator( + task, RollingGen(step=self.step, trunc_days=self.horizon + 1) + ) # the last two days should be truncated to avoid information leakage + for t in task_l: + # when we rolling tasks. No further analyis is needed. + # analyis are postponed to the final ensemble. + t["record"] = ["qlib.workflow.record_temp.SignalRecord"] + return task_l + + def _train_rolling_tasks(self): + task_l = self.get_task_list() + self.logger.info("Deleting previous Rolling results") + try: + # TODO: mlflow does not support permanently delete experiment + # it will be moved to .trash and prevents creating the experiments with the same name + R.delete_exp(experiment_name=self.rolling_exp) # We should remove the rolling experiments. 
+ except ValueError: + self.logger.info("No previous rolling results") + trainer = TrainerR(experiment_name=self.rolling_exp) + trainer(task_l) + + def _ens_rolling(self): + rc = RecorderCollector( + experiment=self.rolling_exp, + artifacts_key=["pred", "label"], + process_list=[RollingEnsemble()], + # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]), + artifacts_path={"pred": "pred.pkl", "label": "label.pkl"}, + ) + res = rc() + with R.start(experiment_name=self.exp_name): + R.log_params(exp_name=self.rolling_exp) + R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]}) + self._rid = R.get_recorder().id + + def _update_rolling_rec(self): + """ + Evaluate the combined rolling results + """ + rec = R.get_recorder(experiment_name=self.exp_name, recorder_id=self._rid) + # Follow the original analyser + records = self._raw_conf()["task"].get("record", []) + if isinstance(records, dict): # prevent only one dict + records = [records] + for record in records: + if issubclass(get_cls_kwargs(record)[0], SignalRecord): + # skip the signal record. + continue + r = init_instance_by_config( + record, + recorder=rec, + default_module="qlib.workflow.record_temp", + ) + r.generate() + print(f"Your evaluation results can be found in the experiment named `{self.exp_name}`.") + + def run(self): + # the results will be save in mlruns. + # 1) each rolling task is saved in rolling_models + self._train_rolling_tasks() + # 2) combined rolling tasks and evaluation results are saved in rolling + self._ens_rolling() + self._update_rolling_rec() + + +if __name__ == "__main__": + auto_init() + fire.Fire(Rolling) diff --git a/qlib/contrib/rolling/ddgda.py b/qlib/contrib/rolling/ddgda.py new file mode 100644 index 000000000..25fb4c36e --- /dev/null +++ b/qlib/contrib/rolling/ddgda.py @@ -0,0 +1,343 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+from pathlib import Path +import pickle +from typing import Optional, Union + +import pandas as pd +import yaml + +from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS +from qlib.contrib.meta.data_selection.model import MetaModelDS +from qlib.data.dataset.handler import DataHandlerLP +from qlib.model.meta.task import MetaTask +from qlib.model.trainer import TrainerR +from qlib.typehint import Literal +from qlib.utils import init_instance_by_config +from qlib.workflow import R +from qlib.workflow.task.utils import replace_task_handler_with_cache + +from .base import Rolling + +# LGBM is designed for feature importance & similarity +LGBM_MODEL = """ +class: LGBModel +module_path: qlib.contrib.model.gbdt +kwargs: + loss: mse + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 +""" +# covnert the yaml to dict +LGBM_MODEL = yaml.load(LGBM_MODEL, Loader=yaml.FullLoader) + +LINEAR_MODEL = """ +class: LinearModel +module_path: qlib.contrib.model.linear +kwargs: + estimator: ridge + alpha: 0.05 +""" +LINEAR_MODEL = yaml.load(LINEAR_MODEL, Loader=yaml.FullLoader) + +PROC_ARGS = """ +infer_processors: + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature +learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label +""" +PROC_ARGS = yaml.load(PROC_ARGS, Loader=yaml.FullLoader) + +UTIL_MODEL_TYPE = Literal["linear", "gbdt"] + + +class DDGDA(Rolling): + """ + It is a rolling based on DDG-DA + + **NOTE** + before running the example, please clean your previous results with following command + - `rm -r mlruns` + """ + + def __init__( + self, + sim_task_model: UTIL_MODEL_TYPE = "gbdt", + meta_1st_train_end: Optional[str] = None, + alpha: float = 0.01, + working_dir: Optional[Union[str, Path]] = None, + **kwargs, + ): + """ + + 
Parameters + ---------- + sim_task_model: Literal["linear", "gbdt"] = "gbdt", + The model for calculating similarity between data. + meta_1st_train_end: Optional[str] + the datetime of training end of the first meta_task + alpha: float + Setting the L2 regularization for ridge + The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..) + """ + # NOTE: + # the horizon must match the meaning in the base task template + self.meta_exp_name = "DDG-DA" + self.sim_task_model: UTIL_MODEL_TYPE = sim_task_model # The model to capture the distribution of data. + self.alpha = alpha + self.meta_1st_train_end = meta_1st_train_end + super().__init__(**kwargs) + self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir) + self.proxy_hd = self.working_dir / "handler_proxy.pkl" + + def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE): + """ + some task are use for special purpose. + For example: + - GBDT for calculating feature importance + - Linear or GBDT for calculating similarity + - Datset (well processed) that aligned to Linear that for meta learning + """ + # NOTE: here is just for aligning with previous implementation + # It is not necessary for the current implementation + handler = task["dataset"].setdefault("kwargs", {}).setdefault("handler", {}) + if astype == "gbdt": + task["model"] = LGBM_MODEL + if isinstance(handler, dict): + for k in ["infer_processors", "learn_processors"]: + if k in handler.setdefault("kwargs", {}): + handler["kwargs"].pop(k) + elif astype == "linear": + task["model"] = LINEAR_MODEL + handler["kwargs"].update(PROC_ARGS) + else: + raise ValueError(f"astype not supported: {astype}") + return task + + def _get_feature_importance(self): + # this must be lightGBM, because it needs to get the feature importance + task = self.basic_task(enable_handler_cache=False) + task = self._adjust_task(task, astype="gbdt") + task = replace_task_handler_with_cache(task, self.working_dir) + + with 
R.start(experiment_name="feature_importance"): + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + fi = model.get_feature_importance() + # Because the model use numpy instead of dataframe for training lightgbm + # So the we must use following extra steps to get the right feature importance + df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R) + cols = df.columns + fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()} + + return pd.Series(fi_named) + + def _dump_data_for_proxy_model(self): + """ + Dump data for training meta model. + The meta model will be trained upon the proxy forecasting model. + This dataset is for the proxy forecasting model. + """ + topk = 30 + fi = self._get_feature_importance() + col_selected = fi.nlargest(topk) + # NOTE: adjusting to `self.sim_task_model` just for aligning with previous implementation. + task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model) + task = replace_task_handler_with_cache(task, self.working_dir) + + dataset = init_instance_by_config(task["dataset"]) + prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + + feature_df = prep_ds["feature"] + label_df = prep_ds["label"] + + feature_selected = feature_df.loc[:, col_selected.index] + + feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( + lambda df: (df - df.mean()).div(df.std()) + ) + feature_selected = feature_selected.fillna(0.0) + + df_all = { + "label": label_df.reindex(feature_selected.index), + "feature": feature_selected, + } + df_all = pd.concat(df_all, axis=1) + df_all.to_pickle(self.working_dir / "fea_label_df.pkl") + + # dump data in handler format for aligning the interface + handler = DataHandlerLP( + data_loader={ + "class": "qlib.data.dataset.loader.StaticDataLoader", + "kwargs": {"config": self.working_dir 
/ "fea_label_df.pkl"}, + } + ) + handler.to_pickle(self.working_dir / self.proxy_hd, dump_all=True) + + @property + def _internal_data_path(self): + return self.working_dir / f"internal_data_s{self.step}.pkl" + + def _dump_meta_ipt(self): + """ + Dump data for training meta model. + This function will dump the input data for meta model + """ + # According to the experiments, the choice of the model type is very important for achieving good results + sim_task = self._adjust_task(self.basic_task(enable_handler_cache=False), astype=self.sim_task_model) + sim_task = replace_task_handler_with_cache(sim_task, self.working_dir) + + if self.sim_task_model == "gbdt": + sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150}) + + exp_name_sim = f"data_sim_s{self.step}" + + internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim) + internal_data.setup(trainer=TrainerR) + + with self._internal_data_path.open("wb") as f: + pickle.dump(internal_data, f) + + def _train_meta_model(self, fill_method="max"): + """ + training a meta model based on a simplified linear proxy model; + """ + + # 1) leverage the simplified proxy forecasting model to train meta model. 
+ # - Only the dataset part is important, in current version of meta model will integrate the + + # the train_start for training meta model does not necessarily align with final rolling + train_start = "2008-01-01" if self.train_start is None else self.train_start + train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end + test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d") + proxy_forecast_model_task = { + # "model": "qlib.contrib.model.linear.LinearModel", + "dataset": { + "class": "qlib.data.dataset.DatasetH", + "kwargs": { + "handler": f"file://{(self.working_dir / self.proxy_hd).absolute()}", + "segments": { + "train": (train_start, train_end), + "test": (test_start, self.basic_task()["dataset"]["kwargs"]["segments"]["test"][1]), + }, + }, + }, + # "record": ["qlib.workflow.record_temp.SignalRecord"] + } + # the proxy_forecast_model_task will be used to create meta tasks. + # The test date of first task will be 2011-01-01. Each test segment will be about 20days + # The tasks include all training tasks and test tasks. + + # 2) preparing meta dataset + kwargs = dict( + task_tpl=proxy_forecast_model_task, + step=self.step, + segments=0.62, # keep test period consistent with the dataset yaml + trunc_days=1 + self.horizon, + hist_step_n=30, + fill_method=fill_method, + rolling_ext_days=0, + ) + # NOTE: + # the input of meta model (internal data) are shared between proxy model and final forecasting model + # but their task test segment are not aligned! It worked in my previous experiment. + # So the misalignment will not affect the effectiveness of the method. 
+ with self._internal_data_path.open("rb") as f: + internal_data = pickle.load(f) + + md = MetaDatasetDS(exp_name=internal_data, **kwargs) + + # 3) train and logging meta model + with R.start(experiment_name=self.meta_exp_name): + R.log_params(**kwargs) + mm = MetaModelDS( + step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha + ) + mm.fit(md) + R.save_objects(model=mm) + + @property + def _task_path(self): + return self.working_dir / f"tasks_s{self.step}.pkl" + + def get_task_list(self): + """ + Leverage meta-model for inference: + - Given + - baseline tasks + - input for meta model(internal data) + - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model) + """ + # 1) get meta model + exp = R.get_exp(experiment_name=self.meta_exp_name) + rec = exp.list_recorders(rtype=exp.RT_L)[0] + meta_model: MetaModelDS = rec.load_object("model") + + # 2) + # we are transfer to knowledge of meta model to final forecasting tasks. 
+ # Create MetaTaskDataset for the final forecasting tasks + # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary + + # 2.1) get previous config + param = rec.list_params() + trunc_days = int(param["trunc_days"]) + step = int(param["step"]) + hist_step_n = int(param["hist_step_n"]) + fill_method = param.get("fill_method", "max") + + task_l = super().get_task_list() + + # 2.2) create meta dataset for final dataset + kwargs = dict( + task_tpl=task_l, + step=step, + segments=0.0, # all the tasks are for testing + trunc_days=trunc_days, + hist_step_n=hist_step_n, + fill_method=fill_method, + task_mode=MetaTask.PROC_MODE_TRANSFER, + ) + + with self._internal_data_path.open("rb") as f: + internal_data = pickle.load(f) + mds = MetaDatasetDS(exp_name=internal_data, **kwargs) + + # 3) meta model make inference and get new qlib task + new_tasks = meta_model.inference(mds) + with self._task_path.open("wb") as f: + pickle.dump(new_tasks, f) + return new_tasks + + def run(self): + # prepare the meta model for rolling --------- + # 1) file: handler_proxy.pkl (self.proxy_hd) + self._dump_data_for_proxy_model() + # 2) + # file: internal_data_s20.pkl + # mlflow: data_sim_s20, models for calculating meta_ipt + self._dump_meta_ipt() + # 3) meta model will be stored in `DDG-DA` + self._train_meta_model() + + # Run rolling -------------------------------- + # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added) + # - the meta inference are done when calling `get_task_list` + # 5) load the saved tasks and train model + super().run() diff --git a/qlib/contrib/strategy/optimizer/optimizer.py b/qlib/contrib/strategy/optimizer/optimizer.py index a70929e27..a5fb76312 100644 --- a/qlib/contrib/strategy/optimizer/optimizer.py +++ b/qlib/contrib/strategy/optimizer/optimizer.py @@ -112,7 +112,6 @@ class PortfolioOptimizer(BaseOptimizer): return w def _optimize(self, S: np.ndarray, r: Optional[np.ndarray] = None, w0: Optional[np.ndarray] = None) -> 
np.ndarray: - # inverse volatility if self.method == self.OPT_INV: if r is not None: diff --git a/qlib/contrib/strategy/rule_strategy.py b/qlib/contrib/strategy/rule_strategy.py index 30facf3a3..f2b919739 100644 --- a/qlib/contrib/strategy/rule_strategy.py +++ b/qlib/contrib/strategy/rule_strategy.py @@ -522,7 +522,6 @@ class ACStrategy(BaseStrategy): _order_amount = min(_order_amount, self.trade_amount[order.stock_id]) if _order_amount > 1e-5: - _order = Order( stock_id=order.stock_id, amount=_order_amount, diff --git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py index cb94017cd..9ba960eeb 100644 --- a/qlib/contrib/strategy/signal_strategy.py +++ b/qlib/contrib/strategy/signal_strategy.py @@ -435,7 +435,6 @@ class EnhancedIndexingStrategy(WeightStrategyBase): self._riskdata_cache = {} def get_risk_data(self, date): - if date in self._riskdata_cache: return self._riskdata_cache[date] @@ -462,7 +461,6 @@ class EnhancedIndexingStrategy(WeightStrategyBase): return self._riskdata_cache[date] def generate_target_weight_position(self, score, current, trade_start_time, trade_end_time): - trade_date = trade_start_time pre_date = get_pre_trading_date(trade_date, future=True) # previous trade date diff --git a/qlib/contrib/tuner/config.py b/qlib/contrib/tuner/config.py index 6e37f0097..7a8534a20 100644 --- a/qlib/contrib/tuner/config.py +++ b/qlib/contrib/tuner/config.py @@ -11,7 +11,6 @@ import os class TunerConfigManager: def __init__(self, config_path): - if not config_path: raise ValueError("Config path is invalid.") self.config_path = config_path @@ -58,7 +57,6 @@ class PipelineExperimentConfig: class OptimizationConfig: def __init__(self, config, TUNER_CONFIG_MANAGER): - self.report_type = config.get("report_type", "pred_long") if self.report_type not in [ "pred_long", diff --git a/qlib/contrib/tuner/pipeline.py b/qlib/contrib/tuner/pipeline.py index db48c46cf..34977fa55 100644 --- a/qlib/contrib/tuner/pipeline.py +++ 
b/qlib/contrib/tuner/pipeline.py @@ -15,11 +15,9 @@ from ...utils import get_module_by_module_path class Pipeline: - GLOBAL_BEST_PARAMS_NAME = "global_best_params.json" def __init__(self, tuner_config_manager): - self.logger = get_module_logger("Pipeline", sh_level=logging.INFO) self.tuner_config_manager = tuner_config_manager @@ -37,7 +35,6 @@ class Pipeline: self.best_tuner_index = None def run(self): - TimeInspector.set_time_mark() for tuner_index, tuner_config in enumerate(self.pipeline_config): tuner = self.init_tuner(tuner_index, tuner_config) @@ -77,7 +74,6 @@ class Pipeline: return tuner_class(tuner_config, self.optim_config) def save_tuner_exp_info(self): - TimeInspector.set_time_mark() save_path = os.path.join(self.pipeline_ex_config.tuner_ex_dir, Pipeline.GLOBAL_BEST_PARAMS_NAME) with open(save_path, "w") as fp: diff --git a/qlib/contrib/tuner/tuner.py b/qlib/contrib/tuner/tuner.py index c183b28ae..7705ce8b7 100644 --- a/qlib/contrib/tuner/tuner.py +++ b/qlib/contrib/tuner/tuner.py @@ -24,7 +24,6 @@ from hyperopt import STATUS_OK, STATUS_FAIL class Tuner: def __init__(self, tuner_config, optim_config): - self.logger = get_module_logger("Tuner", sh_level=logging.INFO) self.tuner_config = tuner_config @@ -42,7 +41,6 @@ class Tuner: self.space = self.setup_space() def tune(self): - TimeInspector.set_time_mark() fmin( fn=self.objective, @@ -84,7 +82,6 @@ class Tuner: class QLibTuner(Tuner): - ESTIMATOR_CONFIG_NAME = "estimator_config.yaml" EXP_INFO_NAME = "exp_info.json" EXP_RESULT_DIR = "sacred/{}" @@ -92,7 +89,6 @@ class QLibTuner(Tuner): LOCAL_BEST_PARAMS_NAME = "local_best_params.json" def objective(self, params): - # 1. Setup an config for a specific estimator process estimator_path = self.setup_estimator_config(params) self.logger.info("Searching params: {} ".format(params)) @@ -120,7 +116,6 @@ class QLibTuner(Tuner): return {"loss": res, "status": status} def fetch_result(self): - # 1. 
Get experiment information exp_info_path = os.path.join(self.ex_dir, QLibTuner.EXP_INFO_NAME) with open(exp_info_path) as fp: @@ -155,7 +150,6 @@ class QLibTuner(Tuner): return np.abs(res.values[0] - 1) def setup_estimator_config(self, params): - estimator_config = copy.deepcopy(self.tuner_config) estimator_config["model"].update({"args": params["model_space"]}) estimator_config["strategy"].update({"args": params["strategy_space"]}) @@ -212,7 +206,6 @@ class QLibTuner(Tuner): return space def save_local_best_params(self): - TimeInspector.set_time_mark() local_best_params_path = os.path.join(self.ex_dir, QLibTuner.LOCAL_BEST_PARAMS_NAME) with open(local_best_params_path, "w") as fp: diff --git a/qlib/data/cache.py b/qlib/data/cache.py index addd28871..3264dcd02 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -583,7 +583,6 @@ class DiskExpressionCache(ExpressionCache): r.tofile(str(cache_path)) def update(self, sid, cache_uri, freq: str = "day"): - cp_cache_uri = self.get_cache_dir(freq).joinpath(sid).joinpath(cache_uri) meta_path = cp_cache_uri.with_suffix(".meta") if not self.check_cache_exists(cp_cache_uri, suffix_list=[".meta"]): @@ -696,7 +695,6 @@ class DiskDatasetCache(DatasetCache): def _dataset( self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[] ): - if disk_cache == 0: # In this case, data_set cache is configured but will not be used. 
return self.provider.dataset( @@ -801,7 +799,6 @@ class DiskDatasetCache(DatasetCache): KEY = "df" def __init__(self, cache_path: Union[str, Path]): - self.index_path = cache_path.with_suffix(".index") self._data = None self.logger = get_module_logger(self.__class__.__name__) @@ -1126,7 +1123,6 @@ class DatasetURICache(DatasetCache): def dataset( self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[] ): - if "local" in C.dataset_provider.lower(): # use LocalDatasetProvider return self.provider.dataset( @@ -1189,7 +1185,6 @@ class MemoryCalendarCache(CalendarCache): uri = self._uri(start_time, end_time, freq, future) result, expire = MemCacheExpire.get_cache(H["c"], uri) if result is None or expire: - result = self.provider.calendar(start_time, end_time, freq, future) MemCacheExpire.set_cache(H["c"], uri, result) diff --git a/qlib/data/data.py b/qlib/data/data.py index 809b8d1c3..116827f23 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -1096,7 +1096,6 @@ class ClientDatasetProvider(DatasetProvider): else: return data else: - """ Call the server to generate the data-set cache, get the uri of the cache file. Then load the data from the file on NFS directly. 
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index f7204cf78..63acd937e 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -132,7 +132,6 @@ class FilterCol(Processor): self.col_list = col_list def __call__(self, df): - cols = get_group_columns(df, self.fields_group) all_cols = df.columns diff_cols = np.setdiff1d(all_cols.get_level_values(-1), cols.get_level_values(-1)) diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index 4761fb383..f19dfe08f 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -71,15 +71,11 @@ def fetch_df_by_index( if fetch_orig: for slc in idx_slc: if slc != slice(None, None): - return df.loc[ - pd.IndexSlice[idx_slc], - ] + return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231 else: # pylint: disable=W0120 return df else: - return df.loc[ - pd.IndexSlice[idx_slc], - ] + return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231 def fetch_df_by_col(df: pd.DataFrame, col_set: Union[str, List[str]]) -> pd.DataFrame: diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 093b98cab..33d5e0c5c 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -22,7 +22,6 @@ from .data import Cal class P(ElemOperator): def _load_internal(self, instrument, start_index, end_index, freq): - _calendar = Cal.calendar(freq=freq) resample_data = np.empty(end_index - start_index + 1, dtype="float32") diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 288500c55..8a100a2d1 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -190,7 +190,6 @@ class FileCalendarStorage(FileStorageMixin, CalendarStorage): class FileInstrumentStorage(FileStorageMixin, InstrumentStorage): - INSTRUMENT_SEP = "\t" INSTRUMENT_START_FIELD = "start_datetime" INSTRUMENT_END_FIELD = "end_datetime" @@ -260,7 +259,6 @@ class FileInstrumentStorage(FileStorageMixin, InstrumentStorage): return self._read_instrument()[k] def 
update(self, *args, **kwargs) -> None: - if len(args) > 1: raise TypeError(f"update expected at most 1 arguments, got {len(args)}") inst = self._read_instrument() @@ -358,7 +356,6 @@ class FileFeatureStorage(FileStorageMixin, FeatureStorage): storage_end_index = self.end_index with self.uri.open("rb") as fp: if isinstance(i, int): - if storage_start_index > i: raise IndexError(f"{i}: start index is {storage_start_index}") fp.seek(4 * (i - storage_start_index) + 4) diff --git a/qlib/log.py b/qlib/log.py index 115abc137..f7683d511 100644 --- a/qlib/log.py +++ b/qlib/log.py @@ -84,7 +84,6 @@ get_module_logger = _QLibLoggerManager() class TimeInspector: - timer_logger = get_module_logger("timer") time_marks = [] diff --git a/qlib/model/riskmodel/poet.py b/qlib/model/riskmodel/poet.py index 8946b2ac5..42388d84c 100644 --- a/qlib/model/riskmodel/poet.py +++ b/qlib/model/riskmodel/poet.py @@ -43,7 +43,6 @@ class POETCovEstimator(RiskModel): self.thresh_method = thresh_method def _predict(self, X: np.ndarray) -> np.ndarray: - Y = X.T # NOTE: to match POET's implementation p, n = Y.shape diff --git a/qlib/tests/__init__.py b/qlib/tests/__init__.py index 52c924918..97ff00c57 100644 --- a/qlib/tests/__init__.py +++ b/qlib/tests/__init__.py @@ -14,7 +14,6 @@ from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage class TestAutoData(unittest.TestCase): - _setup_kwargs = {} provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir provider_uri_1day = "~/.qlib/qlib_data/cn_data" # target_dir @@ -286,6 +285,5 @@ class TestMockData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - provider_uri = "Not necessary." 
init(region=REG_TW, provider_uri=provider_uri, expression_cache=None, dataset_cache=None, **cls._setup_kwargs) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 910a4c08b..9e63c104a 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -7,12 +7,9 @@ from __future__ import division from __future__ import print_function import os -import pickle import re -import sys import copy import json -from qlib.typehint import InstConf import yaml import redis import bisect @@ -22,15 +19,11 @@ import inspect import hashlib import datetime import requests -import importlib -import contextlib import collections import numpy as np import pandas as pd from pathlib import Path -from typing import List, Dict, Union, Tuple, Any, Optional, Callable -from types import ModuleType -from urllib.parse import urlparse +from typing import List, Union, Optional, Callable from packaging import version from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer from ..config import C @@ -288,182 +281,6 @@ def parse_field(field): return field -def get_module_by_module_path(module_path: Union[str, ModuleType]): - """Load module path - - :param module_path: - :return: - :raises: ModuleNotFoundError - """ - if module_path is None: - raise ModuleNotFoundError("None is passed in as parameters as module_path") - - if isinstance(module_path, ModuleType): - module = module_path - else: - if module_path.endswith(".py"): - module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_"))) - module_spec = importlib.util.spec_from_file_location(module_name, module_path) - module = importlib.util.module_from_spec(module_spec) - sys.modules[module_name] = module - module_spec.loader.exec_module(module) - else: - module = importlib.import_module(module_path) - return module - - -def split_module_path(module_path: str) -> Tuple[str, str]: - """ - - Parameters - ---------- - module_path : 
str - e.g. "a.b.c.ClassName" - - Returns - ------- - Tuple[str, str] - e.g. ("a.b.c", "ClassName") - """ - *m_path, cls = module_path.split(".") - m_path = ".".join(m_path) - return m_path, cls - - -def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict): - """ - extract class/func and kwargs from config info - - Parameters - ---------- - config : [dict, str] - similar to config - please refer to the doc of init_instance_by_config - - default_module : Python module or str - It should be a python module to load the class type - This function will load class from the config['module_path'] first. - If config['module_path'] doesn't exists, it will load the class from default_module. - - Returns - ------- - (type, dict): - the class/func object and it's arguments. - - Raises - ------ - ModuleNotFoundError - """ - if isinstance(config, dict): - key = "class" if "class" in config else "func" - if isinstance(config[key], str): - # 1) get module and class - # - case 1): "a.b.c.ClassName" - # - case 2): {"class": "ClassName", "module_path": "a.b.c"} - m_path, cls = split_module_path(config[key]) - if m_path == "": - m_path = config.get("module_path", default_module) - module = get_module_by_module_path(m_path) - - # 2) get callable - _callable = getattr(module, cls) # may raise AttributeError - else: - _callable = config[key] # the class type itself is passed in - kwargs = config.get("kwargs", {}) - elif isinstance(config, str): - # a.b.c.ClassName - m_path, cls = split_module_path(config) - module = get_module_by_module_path(default_module if m_path == "" else m_path) - - _callable = getattr(module, cls) - kwargs = {} - else: - raise NotImplementedError(f"This type of input is not supported") - return _callable, kwargs - - -get_cls_kwargs = get_callable_kwargs # NOTE: this is for compatibility for the previous version - - -def init_instance_by_config( - config: InstConf, - default_module=None, - accept_types: Union[type, 
Tuple[type]] = (), - try_kwargs: Dict = {}, - **kwargs, -) -> Any: - """ - get initialized instance with config - - Parameters - ---------- - config : InstConf - - default_module : Python module - Optional. It should be a python module. - NOTE: the "module_path" will be override by `module` arguments - - This function will load class from the config['module_path'] first. - If config['module_path'] doesn't exists, it will load the class from default_module. - - accept_types: Union[type, Tuple[type]] - Optional. If the config is a instance of specific type, return the config directly. - This will be passed into the second parameter of isinstance. - - try_kwargs: Dict - Try to pass in kwargs in `try_kwargs` when initialized the instance - If error occurred, it will fail back to initialization without try_kwargs. - - Returns - ------- - object: - An initialized object based on the config info - """ - if isinstance(config, accept_types): - return config - - if isinstance(config, (str, Path)): - if isinstance(config, str): - # path like 'file:////obj.pkl' - pr = urlparse(config) - if pr.scheme == "file": - pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc - with open(os.path.normpath(pr_path), "rb") as f: - return pickle.load(f) - else: - with config.open("rb") as f: - return pickle.load(f) - - klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module) - - try: - return klass(**cls_kwargs, **try_kwargs, **kwargs) - except (TypeError,): - # TypeError for handling errors like - # 1: `XXX() got multiple values for keyword argument 'YYY'` - # 2: `XXX() got an unexpected keyword argument 'YYY' - return klass(**cls_kwargs, **kwargs) - - -@contextlib.contextmanager -def class_casting(obj: object, cls: type): - """ - Python doesn't provide the downcasting mechanism. 
- We use the trick here to downcast the class - - Parameters - ---------- - obj : object - the object to be cast - cls : type - the target class type - """ - orig_cls = obj.__class__ - obj.__class__ = cls - yield - obj.__class__ = orig_cls - - def compare_dict_value(src_data: dict, dst_data: dict): """Compare dict value @@ -744,7 +561,6 @@ def exists_qlib_data(qlib_dir): return False # check calendar bin for _calendar in calendars_dir.iterdir(): - if ("_future" not in _calendar.name) and ( not list(features_dir.rglob(f"*.{_calendar.name.split('.')[0]}.bin")) ): @@ -872,9 +688,9 @@ def get_item_from_obj(config: dict, name_path: str) -> object: cur_cfg = config for k in name_path.split("."): if isinstance(cur_cfg, dict): - cur_cfg = cur_cfg[k] + cur_cfg = cur_cfg[k] # may raise KeyError elif k.isdigit(): - cur_cfg = cur_cfg[int(k)] + cur_cfg = cur_cfg[int(k)] # may raise IndexError else: raise ValueError(f"Error when getting {k} from cur_cfg") return cur_cfg @@ -910,6 +726,21 @@ def fill_placeholder(config: dict, config_extend: dict): top = 0 tail = 1 item_queue = [config] + + def try_replace_placeholder(value): + if value in config_extend.keys(): + value = config_extend[value] + else: + m = re.match(r"<(?P[^<>]+)>", value) + if m is not None: + try: + value = get_item_from_obj(config, m.groupdict()["name_path"]) + except (KeyError, ValueError, IndexError): + get_module_logger("fill_placeholder").info( + f"{value} lookes like a placeholder, but it can't match to any given values" + ) + return value + while top < tail: now_item = item_queue[top] top += 1 @@ -917,17 +748,13 @@ def fill_placeholder(config: dict, config_extend: dict): item_keys = range(len(now_item)) elif isinstance(now_item, dict): item_keys = now_item.keys() - for key in item_keys: + for key in item_keys: # noqa if isinstance(now_item[key], (list, dict)): item_queue.append(now_item[key]) tail += 1 elif isinstance(now_item[key], str): - if now_item[key] in config_extend.keys(): - now_item[key] = 
config_extend[now_item[key]] - else: - m = re.match(r"<(?P[^<>]+)>", now_item[key]) - if m is not None: - now_item[key] = get_item_from_obj(config, m.groupdict()["name_path"]) + # If it is a string, try to replace it with placeholder + now_item[key] = try_replace_placeholder(now_item[key]) return config @@ -1049,6 +876,15 @@ def fname_to_code(fname: str): return fname +from .mod import ( + get_module_by_module_path, + split_module_path, + get_callable_kwargs, + get_cls_kwargs, + init_instance_by_config, + class_casting, +) + __all__ = [ "get_or_create_path", "save_multiple_parts_file", @@ -1056,4 +892,10 @@ __all__ = [ "get_tmp_file_with_buffer", "set_log_with_config", "init_instance_by_config", + "get_module_by_module_path", + "split_module_path", + "get_callable_kwargs", + "get_cls_kwargs", + "init_instance_by_config", + "class_casting", ] diff --git a/qlib/utils/index_data.py b/qlib/utils/index_data.py index b62bc02ce..113f9802d 100644 --- a/qlib/utils/index_data.py +++ b/qlib/utils/index_data.py @@ -351,7 +351,6 @@ class IndexData(metaclass=index_data_ops_creator): loc_idx_cls = LocIndexer def __init__(self, data: np.ndarray, *indices: Union[List, pd.Index, Index]): - self.data = data self.indices = indices diff --git a/qlib/utils/mod.py b/qlib/utils/mod.py new file mode 100644 index 000000000..e53957260 --- /dev/null +++ b/qlib/utils/mod.py @@ -0,0 +1,235 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" +All module related class, e.g. : +- importing a module, class +- walkiing a module +- operations on class or module... 
+""" + +import contextlib +import importlib +import os +from pathlib import Path +import pickle +import pkgutil +import re +import sys +from types import ModuleType +from typing import Any, Dict, List, Tuple, Union +from urllib.parse import urlparse + +from qlib.typehint import InstConf + + +def get_module_by_module_path(module_path: Union[str, ModuleType]): + """Load module path + + :param module_path: + :return: + :raises: ModuleNotFoundError + """ + if module_path is None: + raise ModuleNotFoundError("None is passed in as parameters as module_path") + + if isinstance(module_path, ModuleType): + module = module_path + else: + if module_path.endswith(".py"): + module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_"))) + module_spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(module_spec) + sys.modules[module_name] = module + module_spec.loader.exec_module(module) + else: + module = importlib.import_module(module_path) + return module + + +def split_module_path(module_path: str) -> Tuple[str, str]: + """ + + Parameters + ---------- + module_path : str + e.g. "a.b.c.ClassName" + + Returns + ------- + Tuple[str, str] + e.g. ("a.b.c", "ClassName") + """ + *m_path, cls = module_path.split(".") + m_path = ".".join(m_path) + return m_path, cls + + +def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict): + """ + extract class/func and kwargs from config info + + Parameters + ---------- + config : [dict, str] + similar to config + please refer to the doc of init_instance_by_config + + default_module : Python module or str + It should be a python module to load the class type + This function will load class from the config['module_path'] first. + If config['module_path'] doesn't exists, it will load the class from default_module. + + Returns + ------- + (type, dict): + the class/func object and it's arguments. 
+ + Raises + ------ + ModuleNotFoundError + """ + if isinstance(config, dict): + key = "class" if "class" in config else "func" + if isinstance(config[key], str): + # 1) get module and class + # - case 1): "a.b.c.ClassName" + # - case 2): {"class": "ClassName", "module_path": "a.b.c"} + m_path, cls = split_module_path(config[key]) + if m_path == "": + m_path = config.get("module_path", default_module) + module = get_module_by_module_path(m_path) + + # 2) get callable + _callable = getattr(module, cls) # may raise AttributeError + else: + _callable = config[key] # the class type itself is passed in + kwargs = config.get("kwargs", {}) + elif isinstance(config, str): + # a.b.c.ClassName + m_path, cls = split_module_path(config) + module = get_module_by_module_path(default_module if m_path == "" else m_path) + + _callable = getattr(module, cls) + kwargs = {} + else: + raise NotImplementedError(f"This type of input is not supported") + return _callable, kwargs + + +get_cls_kwargs = get_callable_kwargs # NOTE: this is for compatibility for the previous version + + +def init_instance_by_config( + config: InstConf, + default_module=None, + accept_types: Union[type, Tuple[type]] = (), + try_kwargs: Dict = {}, + **kwargs, +) -> Any: + """ + get initialized instance with config + + Parameters + ---------- + config : InstConf + + default_module : Python module + Optional. It should be a python module. + NOTE: the "module_path" will be override by `module` arguments + + This function will load class from the config['module_path'] first. + If config['module_path'] doesn't exists, it will load the class from default_module. + + accept_types: Union[type, Tuple[type]] + Optional. If the config is a instance of specific type, return the config directly. + This will be passed into the second parameter of isinstance. + + try_kwargs: Dict + Try to pass in kwargs in `try_kwargs` when initialized the instance + If error occurred, it will fail back to initialization without try_kwargs. 
+ + Returns + ------- + object: + An initialized object based on the config info + """ + if isinstance(config, accept_types): + return config + + if isinstance(config, (str, Path)): + if isinstance(config, str): + # path like 'file:////obj.pkl' + pr = urlparse(config) + if pr.scheme == "file": + pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc + with open(os.path.normpath(pr_path), "rb") as f: + return pickle.load(f) + else: + with config.open("rb") as f: + return pickle.load(f) + + klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module) + + try: + return klass(**cls_kwargs, **try_kwargs, **kwargs) + except (TypeError,): + # TypeError for handling errors like + # 1: `XXX() got multiple values for keyword argument 'YYY'` + # 2: `XXX() got an unexpected keyword argument 'YYY' + return klass(**cls_kwargs, **kwargs) + + +@contextlib.contextmanager +def class_casting(obj: object, cls: type): + """ + Python doesn't provide the downcasting mechanism. + We use the trick here to downcast the class + + Parameters + ---------- + obj : object + the object to be cast + cls : type + the target class type + """ + orig_cls = obj.__class__ + obj.__class__ = cls + yield + obj.__class__ = orig_cls + + +def find_all_classes(module_path: Union[str, ModuleType], cls: type) -> List[type]: + """ + Find all the classes recursively that inherit from `cls` in a given module. 
+ - `cls` itself is also included + + >>> from qlib.data.dataset.handler import DataHandler + >>> find_all_classes("qlib.contrib.data.handler", DataHandler) + [, , , , ] + + TODO: + - skip import error + + """ + if isinstance(module_path, ModuleType): + mod = module_path + else: + mod = importlib.import_module(module_path) + + cls_list = [] + + def _append_cls(obj): + # Leverage the closure trick to reuse code + if isinstance(obj, type) and issubclass(obj, cls) and cls not in cls_list: + cls_list.append(obj) + + for attr in dir(mod): + _append_cls(getattr(mod, attr)) + + if hasattr(mod, "__path__"): + # if the model is a package + for _, modname, _ in pkgutil.iter_modules(mod.__path__): + sub_mod = importlib.import_module(f"{mod.__package__}.{modname}") + for m_cls in find_all_classes(sub_mod, cls): + _append_cls(m_cls) + return cls_list diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index fdb3f6c92..d8b0a79a3 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -136,7 +136,6 @@ class RecordTemp: whether the records are stored properly. """ if include_self: - # Some mlflow backend will not list the directly recursively. # So we force to the directly artifacts = {} diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index 77bd2cbc1..bd98e501d 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -339,7 +339,6 @@ class MultiHorizonGenBase(TaskGen): def generate(self, task: dict): res = [] for hr in self.horizon: - # Add horizon t = copy.deepcopy(task) self.set_horizon(t, hr) diff --git a/qlib/workflow/task/utils.py b/qlib/workflow/task/utils.py index a914ea54f..19837b3c7 100644 --- a/qlib/workflow/task/utils.py +++ b/qlib/workflow/task/utils.py @@ -1,23 +1,25 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - """ Some tools for task management. 
""" import bisect +from copy import deepcopy import pandas as pd from qlib.data import D +from qlib.utils import hash_args +from qlib.utils.mod import init_instance_by_config from qlib.workflow import R from qlib.config import C from qlib.log import get_module_logger from pymongo import MongoClient from pymongo.database import Database from typing import Union +from pathlib import Path def get_mongodb() -> Database: - """ Get database in MongoDB, which means you need to declare the address and the name of a database at first. @@ -276,3 +278,31 @@ class TimeAdjuster: return self.get(start_idx), self.get(end_idx) else: raise NotImplementedError(f"This type of input is not supported") + + +def replace_task_handler_with_cache(task: dict, cache_dir: Union[str, Path] = ".") -> dict: + """ + Replace the handler in task with a cache handler. + It will automatically cache the file and save it in cache_dir. + + >>> import qlib + >>> qlib.auto_init() + >>> import datetime + >>> # it is simplified task + >>> task = {"dataset": {"kwargs":{'handler': {'class': 'Alpha158', 'module_path': 'qlib.contrib.data.handler', 'kwargs': {'start_time': datetime.date(2008, 1, 1), 'end_time': datetime.date(2020, 8, 1), 'fit_start_time': datetime.date(2008, 1, 1), 'fit_end_time': datetime.date(2014, 12, 31), 'instruments': 'CSI300'}}}}} + >>> new_task = replace_task_handler_with_cache(task) + >>> print(new_task) + {'dataset': {'kwargs': {'handler': 'file...Alpha158.3584f5f8b4.pkl'}}} + + """ + cache_dir = Path(cache_dir) + task = deepcopy(task) + handler = task["dataset"]["kwargs"]["handler"] + if isinstance(handler, dict): + hash = hash_args(handler) + h_path = cache_dir / f"{handler['class']}.{hash[:10]}.pkl" + if not h_path.exists(): + h = init_instance_by_config(handler) + h.to_pickle(h_path, dump_all=True) + task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" + return task diff --git a/scripts/check_dump_bin.py b/scripts/check_dump_bin.py index ef8023219..7ae8a26ab 100644 --- 
a/scripts/check_dump_bin.py +++ b/scripts/check_dump_bin.py @@ -15,7 +15,6 @@ from loguru import logger class CheckBin: - NOT_IN_FEATURES = "not in features" COMPARE_FALSE = "compare False" COMPARE_TRUE = "compare True" diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index e3cf1fcac..386bb1b2c 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -18,7 +18,6 @@ from qlib.utils import code_to_fname class BaseCollector(abc.ABC): - CACHE_FLAG = "CACHED" NORMAL_FLAG = "NORMAL" @@ -185,7 +184,6 @@ class BaseCollector(abc.ABC): return self.NORMAL_FLAG def _collector(self, instrument_list): - error_symbol = [] res = Parallel(n_jobs=self.max_workers)( delayed(self._simple_collector)(_inst) for _inst in tqdm(instrument_list) diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py index 0dc12eff6..7d32170f0 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -21,7 +21,6 @@ quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"} class IBOVIndex(IndexBase): - ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" years_4_month_periods = [] diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 97cbce825..cb0c3fc95 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -143,7 +143,6 @@ class WIKIIndex(IndexBase): class NASDAQ100Index(WIKIIndex): - HISTORY_COMPANIES_URL = ( "https://indexes.nasdaqomx.com/Index/WeightingData?id=NDX&tradeDate={trade_date}T00%3A00%3A00.000&timeOfDay=SOD" ) diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index cda872c09..c328eb67a 100644 --- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -237,7 +237,6 @@ class DumpPitData: pass with open(data_file, "rb+") as fd, open(index_file, "rb+") as fi: - # 
update index if needed for i, row in df_sub.iterrows(): # get index diff --git a/tests/backtest/test_high_freq_trading.py b/tests/backtest/test_high_freq_trading.py index fd934914d..a538464db 100644 --- a/tests/backtest/test_high_freq_trading.py +++ b/tests/backtest/test_high_freq_trading.py @@ -27,7 +27,6 @@ class TestHFBacktest(TestAutoData): return pd.DataFrame(orders, columns=headers) def test_trading(self): - # date = "2020-02-03" # inst = "SH600068" # pos = 2.0167 diff --git a/tests/data_mid_layer_tests/test_handler_storage.py b/tests/data_mid_layer_tests/test_handler_storage.py index 0d8ad4d57..a8bb730f7 100644 --- a/tests/data_mid_layer_tests/test_handler_storage.py +++ b/tests/data_mid_layer_tests/test_handler_storage.py @@ -21,7 +21,6 @@ class TestHandler(DataHandlerLP): fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -51,7 +50,6 @@ class TestHandler(DataHandlerLP): class TestHandlerStorage(TestAutoData): - market = "all" start_time = "2010-01-01" @@ -82,7 +80,6 @@ class TestHandlerStorage(TestAutoData): ) with TimeInspector.logt("random fetch with DataFrame Storage"): - # single stock for i in range(100): random_index = np.random.randint(len(instruments), size=1)[0] @@ -96,7 +93,6 @@ class TestHandlerStorage(TestAutoData): data_handler.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None) with TimeInspector.logt("random fetch with HashingStock Storage"): - # single stock for i in range(100): random_index = np.random.randint(len(instruments), size=1)[0] diff --git a/tests/misc/test_sepdf.py b/tests/misc/test_sepdf.py index 9fdc0bb2d..76bd0e6bd 100644 --- a/tests/misc/test_sepdf.py +++ b/tests/misc/test_sepdf.py @@ -11,7 +11,6 @@ class SepDF(unittest.TestCase): return "".join(str(obj).split()) def test_index_data(self): - np.random.seed(42) index = [ diff 
--git a/tests/rolling_tests/test_update_pred.py b/tests/rolling_tests/test_update_pred.py index 324611948..b3ca2e036 100644 --- a/tests/rolling_tests/test_update_pred.py +++ b/tests/rolling_tests/test_update_pred.py @@ -77,7 +77,6 @@ class TestRolling(TestAutoData): @pytest.mark.slow def test_update_label(self): - task = copy.deepcopy(CSI300_GBDT_TASK) task["record"] = { diff --git a/tests/storage_tests/test_storage.py b/tests/storage_tests/test_storage.py index 50b16a041..92fed34ec 100644 --- a/tests/storage_tests/test_storage.py +++ b/tests/storage_tests/test_storage.py @@ -22,7 +22,6 @@ QLIB_DIR.mkdir(exist_ok=True, parents=True) class TestStorage(TestAutoData): def test_calendar_storage(self): - calendar = CalendarStorage(freq="day", future=False, provider_uri=self.provider_uri) assert isinstance(calendar[:], Iterable), f"{calendar.__class__.__name__}.__getitem__(s: slice) is not Iterable" assert isinstance(calendar.data, Iterable), f"{calendar.__class__.__name__}.data is not Iterable" diff --git a/tests/test_get_data.py b/tests/test_get_data.py index 94e685e1f..125b9203e 100644 --- a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -33,7 +33,6 @@ class TestGetData(unittest.TestCase): shutil.rmtree(str(DATA_DIR.resolve())) def test_0_qlib_data(self): - GetData().qlib_data( name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False, exists_skip=True )