diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index ec8ea5d69..488419d52 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -14,6 +14,9 @@ categories: label: - 'doc' - 'documentation' + - title: '🧹 Maintenance' + label: + - 'maintenance' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. version-resolver: @@ -30,4 +33,4 @@ version-resolver: template: | ## Changes - $CHANGES \ No newline at end of file + $CHANGES diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index db14fbf3b..e95a9e88c 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -38,7 +38,7 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload dist/* - + deploy_with_manylinux: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index b07bdf1e7..6ce457dfd 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,8 @@ jobs: stale-issue-label: 'stale' stale-pr-label: 'stale' days-before-stale: 90 + days-before-pr-stale: 365 days-before-close: 5 operations-per-run: 100 exempt-issue-labels: 'bug,enhancement' - remove-stale-when-updated: true \ No newline at end of file + remove-stale-when-updated: true diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index e6202e57e..f5db06ccb 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -8,6 +8,7 @@ on: jobs: build: + if: ${{ false }} # FIXME: temporarily disable... 
Due to we are rushing a feature timeout-minutes: 120 runs-on: ${{ matrix.os }} @@ -19,10 +20,20 @@ jobs: steps: - name: Test qlib from pip - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -50,7 +61,9 @@ jobs: - name: Downloads dependencies data run: | - python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + cd .. + python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn + cd qlib - name: Test workflow by config run: | diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 68dfe5b3f..7271287dc 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. 
+ # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Update pip to the latest version # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 - name: Installing pytorch for macos if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} @@ -54,7 +64,10 @@ jobs: python -m pip install -e .[dev] - name: Lint with Black + # Python 3.7 will use a black with low level. So we use python with higher version for black check + if: (matrix.python-version != '3.7') run: | + pip install -U black # follow the latest version of black, previous Qlib dependency will downgrade black black . 
-l 120 --check --diff - name: Make html with sphinx @@ -129,8 +142,7 @@ jobs: - name: Test data downloads run: | python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive - mv /tmp/qlibpublic/data tests/.data + python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl - name: Install Lightgbm for MacOS if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index f8e43fa17..1dfcc0179 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source slow - uses: actions/checkout@v2 + uses: actions/checkout@v3 + + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Set up Python tools # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. 
- # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 pip install --upgrade cython numpy pip install -e .[dev] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea57aeb0e..15f00414c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.7.0 hooks: - id: black args: ["qlib", "-l 120"] @@ -9,4 +9,4 @@ repos: rev: 4.0.1 hooks: - id: flake8 - args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] \ No newline at end of file + args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] diff --git a/README.md b/README.md index cedfdc348..539700a91 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | +| KRNN and Sandwich models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1414/) on May 26, 2023 | | Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 | | RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. [#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)| | HIST and IGMTF models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1040) on Apr 10, 2022 | @@ -90,6 +91,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
diff --git a/examples/benchmarks/TRA/src/model.py b/examples/benchmarks/TRA/src/model.py
index cff94388e..affb115a1 100644
--- a/examples/benchmarks/TRA/src/model.py
+++ b/examples/benchmarks/TRA/src/model.py
@@ -45,7 +45,6 @@ class TRAModel(Model):
avg_params=True,
**kwargs,
):
-
np.random.seed(seed)
torch.manual_seed(seed)
@@ -93,7 +92,6 @@ class TRAModel(Model):
self.global_step = -1
def train_epoch(self, data_set):
-
self.model.train()
self.tra.train()
@@ -146,7 +144,6 @@ class TRAModel(Model):
return total_loss
def test_epoch(self, data_set, return_pred=False):
-
self.model.eval()
self.tra.eval()
data_set.eval()
@@ -204,7 +201,6 @@ class TRAModel(Model):
return metrics, preds
def fit(self, dataset, evals_result=dict()):
-
train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"])
best_score = -1
@@ -380,7 +376,6 @@ class LSTM(nn.Module):
self.output_size = hidden_size
def forward(self, x):
-
x = self.input_drop(x)
if self.training and self.noise_level > 0:
@@ -464,7 +459,6 @@ class Transformer(nn.Module):
self.output_size = hidden_size
def forward(self, x):
-
x = self.input_drop(x)
if self.training and self.noise_level > 0:
@@ -514,7 +508,6 @@ class TRA(nn.Module):
self.predictors = nn.Linear(input_size, num_states)
def forward(self, hidden, hist_loss):
-
preds = self.predictors(hidden)
if self.num_states == 1:
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
index c86f87fc6..02c4ecac3 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml
@@ -57,9 +57,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
index 75f18f3ee..9ccf56e86 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml
@@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
index 9ab5b904b..29686d7da 100644
--- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
+++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml
@@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml
index d9b94e86c..7549688b9 100644
--- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml
+++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml
@@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml
index 830943d6b..7155d25b1 100644
--- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml
+++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml
@@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml
index e36d44c43..ce5105108 100644
--- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml
+++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml
@@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml
index cab46a4d4..35342de94 100644
--- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml
+++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml
@@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml
index 5ee38cf70..0c7f55d02 100644
--- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml
+++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml
@@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml
index 7c98bd40c..8e7b54372 100644
--- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml
+++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml
@@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
- signal:
- -
- -
+ signal:
topk: 50
n_drop: 5
backtest:
diff --git a/examples/benchmarks_dynamic/DDG-DA/Makefile b/examples/benchmarks_dynamic/DDG-DA/Makefile
new file mode 100644
index 000000000..c6cf5206e
--- /dev/null
+++ b/examples/benchmarks_dynamic/DDG-DA/Makefile
@@ -0,0 +1,4 @@
+.PHONY: clean
+
+clean:
+ -rm -r *.pkl mlruns || true
diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md
index 4d49315bd..ac4349d91 100644
--- a/examples/benchmarks_dynamic/DDG-DA/README.md
+++ b/examples/benchmarks_dynamic/DDG-DA/README.md
@@ -16,12 +16,12 @@ Though the dataset is different, the conclusion remains the same. By applying `D
# Run the Code
Users can try `DDG-DA` by running the following command:
```bash
- python workflow.py run_all
+ python workflow.py run
```
The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `forecast_model` parameter when `DDG-DA` initializes. For example, users can try `LightGBM` forecasting models by running the following command:
```bash
- python workflow.py --forecast_model="gbdt" run_all
+ python workflow.py --conf_path=../workflow_config_lightgbm_Alpha158.yaml run
```
# Results
diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py
index b69107549..7593fe374 100644
--- a/examples/benchmarks_dynamic/DDG-DA/workflow.py
+++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py
@@ -1,302 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from pathlib import Path
-from qlib.model.meta.task import MetaTask
-from qlib.contrib.meta.data_selection.model import MetaModelDS
-from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS
-from qlib.data.dataset.handler import DataHandlerLP
+from typing import Union
-import pandas as pd
import fire
-import sys
-import pickle
-from typing import Optional
+
from qlib import auto_init
-from qlib.model.trainer import TrainerR
-from qlib.typehint import Literal
-from qlib.utils import init_instance_by_config
-from qlib.workflow import R
+from qlib.contrib.rolling.ddgda import DDGDA
from qlib.tests.data import GetData
DIRNAME = Path(__file__).absolute().resolve().parent
-sys.path.append(str(DIRNAME.parent / "baseline"))
-from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark
+BENCH_DIR = DIRNAME.parent / "baseline"
-class DDGDA:
- """
- please run `python workflow.py run_all` to run the full workflow of the experiment
+class DDGDABench(DDGDA):
+ # The configs referenced in the README.md
+ CONF_LIST = [
+ BENCH_DIR / "workflow_config_linear_Alpha158.yaml",
+ BENCH_DIR / "workflow_config_lightgbm_Alpha158.yaml",
+ ]
- **NOTE**
- before running the example, please clean your previous results with following command
- - `rm -r mlruns`
- """
+ DEFAULT_CONF = CONF_LIST[0] # Linear by default due to efficiency
- def __init__(
- self,
- sim_task_model: Literal["linear", "gbdt"] = "linear",
- forecast_model: Literal["linear", "gbdt"] = "linear",
- h_path: Optional[str] = None,
- test_end: Optional[str] = None,
- train_start: Optional[str] = None,
- meta_1st_train_end: Optional[str] = None,
- task_ext_conf: Optional[dict] = None,
- alpha: float = 0.0,
- proxy_hd: str = "handler_proxy.pkl",
- ):
- """
+ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None:
+ # Normalize conf_path to a Path; kept for compatibility with the previous code
+ conf_path = Path(conf_path)
+ super().__init__(conf_path=conf_path, horizon=horizon, working_dir=DIRNAME, **kwargs)
- Parameters
- ----------
-
- train_start: Optional[str]
- the start datetime for data. It is used in training start time (for both tasks & meta learing)
- test_end: Optional[str]
- the end datetime for data. It is used in test end time
- meta_1st_train_end: Optional[str]
- the datetime of training end of the first meta_task
- alpha: float
- Setting the L2 regularization for ridge
- The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..)
- """
- self.step = 20
- # NOTE:
- # the horizon must match the meaning in the base task template
- self.horizon = 20
- self.meta_exp_name = "DDG-DA"
- self.sim_task_model = sim_task_model # The model to capture the distribution of data.
- self.forecast_model = forecast_model # downstream forecasting models' type
- self.rb_kwargs = {
- "h_path": h_path,
- "test_end": test_end,
- "train_start": train_start,
- "task_ext_conf": task_ext_conf,
- }
- self.alpha = alpha
- self.meta_1st_train_end = meta_1st_train_end
- self.proxy_hd = proxy_hd
-
- def get_feature_importance(self):
- # this must be lightGBM, because it needs to get the feature importance
- rb = RollingBenchmark(model_type="gbdt", **self.rb_kwargs)
- task = rb.basic_task()
-
- with R.start(experiment_name="feature_importance"):
- model = init_instance_by_config(task["model"])
- dataset = init_instance_by_config(task["dataset"])
- model.fit(dataset)
-
- fi = model.get_feature_importance()
-
- # Because the model use numpy instead of dataframe for training lightgbm
- # So the we must use following extra steps to get the right feature importance
- df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R)
- cols = df.columns
- fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()}
-
- return pd.Series(fi_named)
-
- def dump_data_for_proxy_model(self):
- """
- Dump data for training meta model.
- The meta model will be trained upon the proxy forecasting model.
- This dataset is for the proxy forecasting model.
- """
- topk = 30
- fi = self.get_feature_importance()
- col_selected = fi.nlargest(topk)
-
- rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs)
- task = rb.basic_task()
- dataset = init_instance_by_config(task["dataset"])
- prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
-
- feature_df = prep_ds["feature"]
- label_df = prep_ds["label"]
-
- feature_selected = feature_df.loc[:, col_selected.index]
-
- feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std()))
- feature_selected = feature_selected.fillna(0.0)
-
- df_all = {
- "label": label_df.reindex(feature_selected.index),
- "feature": feature_selected,
- }
- df_all = pd.concat(df_all, axis=1)
- df_all.to_pickle(DIRNAME / "fea_label_df.pkl")
-
- # dump data in handler format for aligning the interface
- handler = DataHandlerLP(
- data_loader={
- "class": "qlib.data.dataset.loader.StaticDataLoader",
- "kwargs": {"config": DIRNAME / "fea_label_df.pkl"},
- }
- )
- handler.to_pickle(DIRNAME / self.proxy_hd, dump_all=True)
-
- @property
- def _internal_data_path(self):
- return DIRNAME / f"internal_data_s{self.step}.pkl"
-
- def dump_meta_ipt(self):
- """
- Dump data for training meta model.
- This function will dump the input data for meta model
- """
- # According to the experiments, the choice of the model type is very important for achieving good results
- rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs)
- sim_task = rb.basic_task()
-
- if self.sim_task_model == "gbdt":
- sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150})
-
- exp_name_sim = f"data_sim_s{self.step}"
-
- internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim)
- internal_data.setup(trainer=TrainerR)
-
- with self._internal_data_path.open("wb") as f:
- pickle.dump(internal_data, f)
-
- def train_meta_model(self, fill_method="max"):
- """
- training a meta model based on a simplified linear proxy model;
- """
-
- # 1) leverage the simplified proxy forecasting model to train meta model.
- # - Only the dataset part is important, in current version of meta model will integrate the
- rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs)
- sim_task = rb.basic_task()
- train_start = self.rb_kwargs.get("train_start", "2008-01-01")
- train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end
- test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
- proxy_forecast_model_task = {
- # "model": "qlib.contrib.model.linear.LinearModel",
- "dataset": {
- "class": "qlib.data.dataset.DatasetH",
- "kwargs": {
- "handler": f"file://{(DIRNAME / self.proxy_hd).absolute()}",
- "segments": {
- "train": (train_start, train_end),
- "test": (test_start, sim_task["dataset"]["kwargs"]["segments"]["test"][1]),
- },
- },
- },
- # "record": ["qlib.workflow.record_temp.SignalRecord"]
- }
- # the proxy_forecast_model_task will be used to create meta tasks.
- # The test date of first task will be 2011-01-01. Each test segment will be about 20days
- # The tasks include all training tasks and test tasks.
-
- # 2) preparing meta dataset
- kwargs = dict(
- task_tpl=proxy_forecast_model_task,
- step=self.step,
- segments=0.62, # keep test period consistent with the dataset yaml
- trunc_days=1 + self.horizon,
- hist_step_n=30,
- fill_method=fill_method,
- rolling_ext_days=0,
- )
- # NOTE:
- # the input of meta model (internal data) are shared between proxy model and final forecasting model
- # but their task test segment are not aligned! It worked in my previous experiment.
- # So the misalignment will not affect the effectiveness of the method.
- with self._internal_data_path.open("rb") as f:
- internal_data = pickle.load(f)
-
- md = MetaDatasetDS(exp_name=internal_data, **kwargs)
-
- # 3) train and logging meta model
- with R.start(experiment_name=self.meta_exp_name):
- R.log_params(**kwargs)
- mm = MetaModelDS(
- step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=100, seed=43, alpha=self.alpha
- )
- mm.fit(md)
- R.save_objects(model=mm)
-
- @property
- def _task_path(self):
- return DIRNAME / f"tasks_s{self.step}.pkl"
-
- def meta_inference(self):
- """
- Leverage meta-model for inference:
- - Given
- - baseline tasks
- - input for meta model(internal data)
- - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model)
- """
- # 1) get meta model
- exp = R.get_exp(experiment_name=self.meta_exp_name)
- rec = exp.list_recorders(rtype=exp.RT_L)[0]
- meta_model: MetaModelDS = rec.load_object("model")
-
- # 2)
- # we are transfer to knowledge of meta model to final forecasting tasks.
- # Create MetaTaskDataset for the final forecasting tasks
- # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary
-
- # 2.1) get previous config
- param = rec.list_params()
- trunc_days = int(param["trunc_days"])
- step = int(param["step"])
- hist_step_n = int(param["hist_step_n"])
- fill_method = param.get("fill_method", "max")
-
- rb = RollingBenchmark(model_type=self.forecast_model, **self.rb_kwargs)
- task_l = rb.create_rolling_tasks()
-
- # 2.2) create meta dataset for final dataset
- kwargs = dict(
- task_tpl=task_l,
- step=step,
- segments=0.0, # all the tasks are for testing
- trunc_days=trunc_days,
- hist_step_n=hist_step_n,
- fill_method=fill_method,
- task_mode=MetaTask.PROC_MODE_TRANSFER,
- )
-
- with self._internal_data_path.open("rb") as f:
- internal_data = pickle.load(f)
- mds = MetaDatasetDS(exp_name=internal_data, **kwargs)
-
- # 3) meta model make inference and get new qlib task
- new_tasks = meta_model.inference(mds)
- with self._task_path.open("wb") as f:
- pickle.dump(new_tasks, f)
-
- def train_and_eval_tasks(self):
- """
- Training the tasks generated by meta model
- Then evaluate it
- """
- with self._task_path.open("rb") as f:
- tasks = pickle.load(f)
- rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.forecast_model, **self.rb_kwargs)
- rb.train_rolling_tasks(tasks)
- rb.ens_rolling()
- rb.update_rolling_rec()
-
- def run_all(self):
- # 1) file: handler_proxy.pkl (self.proxy_hd)
- self.dump_data_for_proxy_model()
- # 2)
- # file: internal_data_s20.pkl
- # mlflow: data_sim_s20, models for calculating meta_ipt
- self.dump_meta_ipt()
- # 3) meta model will be stored in `DDG-DA`
- self.train_meta_model()
- # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added)
- self.meta_inference()
- # 5) load the saved tasks and train model
- self.train_and_eval_tasks()
+ for f in self.CONF_LIST:
+ if conf_path.samefile(f):
+ break
+ else:
+ self.logger.warning("Model type is not in the benchmark!")
if __name__ == "__main__":
GetData().qlib_data(exists_skip=True)
auto_init()
- fire.Fire(DDGDA)
+ fire.Fire(DDGDABench)
diff --git a/examples/benchmarks_dynamic/README.md b/examples/benchmarks_dynamic/README.md
index 261fcc035..6f78fa71a 100644
--- a/examples/benchmarks_dynamic/README.md
+++ b/examples/benchmarks_dynamic/README.md
@@ -8,15 +8,17 @@ The table below shows the performances of different solutions on different forec
Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
+mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
+rm -f qlib_bin.tar.gz
```
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
-|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
-| RR[Linear] |Alpha158 |0.089|0.577|0.102 |0.627 |0.093 |1.458 |-0.073 |
-| DDG-DA[Linear] |Alpha158 |0.096|0.636|0.107 |0.677 |0.067 |0.996 |-0.091 |
-| RR[LightGBM] |Alpha158 |0.082|0.589|0.091 |0.626 |0.077 |1.320 |-0.091 |
-| DDG-DA[LightGBM] |Alpha158 |0.085|0.658|0.094 |0.686 |0.115 |1.792 |-0.068 |
+|------------------|---------|------|------|---------|-----------|-------------------|-------------------|--------------|
+| RR[Linear] |Alpha158 |0.0945|0.5989|0.1069 |0.6495 |0.0857 |1.3682 |-0.0986 |
+| DDG-DA[Linear] |Alpha158 |0.0983|0.6157|0.1108 |0.6646 |0.0764 |1.1904 |-0.0769 |
+| RR[LightGBM] |Alpha158 |0.0816|0.5887|0.0912 |0.6263 |0.0771 |1.3196 |-0.0909 |
+| DDG-DA[LightGBM] |Alpha158 |0.0878|0.6185|0.0975 |0.6524 |0.1261 |2.0096 |-0.0744 |
- The label horizon of the `Alpha158` dataset is set to 20.
- The rolling time intervals are set to 20 trading days.
diff --git a/examples/benchmarks_dynamic/baseline/README.md b/examples/benchmarks_dynamic/baseline/README.md
index 17e10482d..f17651412 100644
--- a/examples/benchmarks_dynamic/baseline/README.md
+++ b/examples/benchmarks_dynamic/baseline/README.md
@@ -5,11 +5,12 @@ This is the framework of periodically Rolling Retrain (RR) forecasting models. R
## Run the Code
Users can try RR by running the following command:
```bash
- python rolling_benchmark.py run_all
+ python rolling_benchmark.py run
```
The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `model_type` parameter.
For example, users can try `LightGBM` forecasting models by running the following command:
```bash
- python rolling_benchmark.py --model_type="gbdt" run_all
-```
\ No newline at end of file
+ python rolling_benchmark.py --conf_path=workflow_config_lightgbm_Alpha158.yaml run
+
+```
diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
index d452957d4..1ce30ef8a 100644
--- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
+++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
@@ -1,160 +1,33 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
-from typing import Optional
-from qlib.model.ens.ensemble import RollingEnsemble
-from qlib.utils import init_instance_by_config
-import fire
-import yaml
-import pandas as pd
-from qlib import auto_init
from pathlib import Path
-from tqdm.auto import tqdm
-from qlib.model.trainer import TrainerR
-from qlib.log import get_module_logger
-from qlib.utils.data import update_config
-from qlib.workflow import R
+from typing import Union
+
+import fire
+
+from qlib import auto_init
+from qlib.contrib.rolling.base import Rolling
from qlib.tests.data import GetData
DIRNAME = Path(__file__).absolute().resolve().parent
-from qlib.workflow.task.gen import task_generator, RollingGen
-from qlib.workflow.task.collect import RecorderCollector
-from qlib.workflow.record_temp import PortAnaRecord, SigAnaRecord
-class RollingBenchmark:
- """
- **NOTE**
- before running the example, please clean your previous results with following command
- - `rm -r mlruns`
+class RollingBenchmark(Rolling):
+ # The config in the README.md
+ CONF_LIST = [DIRNAME / "workflow_config_linear_Alpha158.yaml", DIRNAME / "workflow_config_lightgbm_Alpha158.yaml"]
- """
+ DEFAULT_CONF = CONF_LIST[0]
- def __init__(
- self,
- rolling_exp: str = "rolling_models",
- model_type: str = "linear",
- h_path: Optional[str] = None,
- train_start: Optional[str] = None,
- test_end: Optional[str] = None,
- task_ext_conf: Optional[dict] = None,
- ) -> None:
- """
- Parameters
- ----------
- rolling_exp : str
- The name for the experiments for rolling
- model_type : str
- The model to be boosted.
- h_path : Optional[str]
- the dumped data handler;
- test_end : Optional[str]
- the test end for the data. It is typically used together with the handler
- train_start : Optional[str]
- the train start for the data. It is typically used together with the handler.
- task_ext_conf : Optional[dict]
- some option to update the
- """
- self.step = 20
- self.horizon = 20
- self.rolling_exp = rolling_exp
- self.model_type = model_type
- self.h_path = h_path
- self.train_start = train_start
- self.test_end = test_end
- self.logger = get_module_logger("RollingBenchmark")
- self.task_ext_conf = task_ext_conf
+ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None:
+ # This code is kept for compatibility with the previous (old) code
+ conf_path = Path(conf_path)
+ super().__init__(conf_path=conf_path, horizon=horizon, **kwargs)
- def basic_task(self):
- """For fast training rolling"""
- if self.model_type == "gbdt":
- conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
- # dump the processed data on to disk for later loading to speed up the processing
- h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon)
- elif self.model_type == "linear":
- conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
- h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon)
+ for f in self.CONF_LIST:
+ if conf_path.samefile(f):
+ break
else:
- raise AssertionError("Model type is not supported!")
-
- if self.h_path is not None:
- h_path = Path(self.h_path)
-
- with conf_path.open("r") as f:
- conf = yaml.safe_load(f)
-
- # modify dataset horizon
- conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
- "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
- ]
-
- task = conf["task"]
-
- if self.task_ext_conf is not None:
- task = update_config(task, self.task_ext_conf)
-
- if not h_path.exists():
- h_conf = task["dataset"]["kwargs"]["handler"]
- h = init_instance_by_config(h_conf)
- h.to_pickle(h_path, dump_all=True)
-
- task["dataset"]["kwargs"]["handler"] = f"file://{h_path}"
- task["record"] = ["qlib.workflow.record_temp.SignalRecord"]
-
- if self.train_start is not None:
- seg = task["dataset"]["kwargs"]["segments"]["train"]
- task["dataset"]["kwargs"]["segments"]["train"] = pd.Timestamp(self.train_start), seg[1]
-
- if self.test_end is not None:
- seg = task["dataset"]["kwargs"]["segments"]["test"]
- task["dataset"]["kwargs"]["segments"]["test"] = seg[0], pd.Timestamp(self.test_end)
- self.logger.info(task)
- return task
-
- def create_rolling_tasks(self):
- task = self.basic_task()
- task_l = task_generator(
- task, RollingGen(step=self.step, trunc_days=self.horizon + 1)
- ) # the last two days should be truncated to avoid information leakage
- return task_l
-
- def train_rolling_tasks(self, task_l=None):
- if task_l is None:
- task_l = self.create_rolling_tasks()
- trainer = TrainerR(experiment_name=self.rolling_exp)
- trainer(task_l)
-
- COMB_EXP = "rolling"
-
- def ens_rolling(self):
- rc = RecorderCollector(
- experiment=self.rolling_exp,
- artifacts_key=["pred", "label"],
- process_list=[RollingEnsemble()],
- # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]),
- artifacts_path={"pred": "pred.pkl", "label": "label.pkl"},
- )
- res = rc()
- with R.start(experiment_name=self.COMB_EXP):
- R.log_params(exp_name=self.rolling_exp)
- R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]})
-
- def update_rolling_rec(self):
- """
- Evaluate the combined rolling results
- """
- for _, rec in R.list_recorders(experiment_name=self.COMB_EXP).items():
- for rt_cls in SigAnaRecord, PortAnaRecord:
- rt = rt_cls(recorder=rec, skip_existing=True)
- rt.generate()
- print(f"Your evaluation results can be found in the experiment named `{self.COMB_EXP}`.")
-
- def run_all(self):
- # the results will be save in mlruns.
- # 1) each rolling task is saved in rolling_models
- self.train_rolling_tasks()
- # 2) combined rolling tasks and evaluation results are saved in rolling
- self.ens_rolling()
- self.update_rolling_rec()
+ self.logger.warning("Model type is not in the benchmark!")
if __name__ == "__main__":
diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml
new file mode 100644
index 000000000..5ae316801
--- /dev/null
+++ b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml
@@ -0,0 +1,71 @@
+qlib_init:
+ provider_uri: "~/.qlib/qlib_data/cn_data"
+ region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+ start_time: 2008-01-01
+ end_time: 2020-08-01
+ fit_start_time: 2008-01-01
+ fit_end_time: 2014-12-31
+ instruments: *market
+port_analysis_config: &port_analysis_config
+ strategy:
+ class: TopkDropoutStrategy
+ module_path: qlib.contrib.strategy
+ kwargs:
+ signal:
+ topk: 50
+ n_drop: 5
+ backtest:
+ start_time: 2017-01-01
+ end_time: 2020-08-01
+ account: 100000000
+ benchmark: *benchmark
+ exchange_kwargs:
+ limit_threshold: 0.095
+ deal_price: close
+ open_cost: 0.0005
+ close_cost: 0.0015
+ min_cost: 5
+task:
+ model:
+ class: LGBModel
+ module_path: qlib.contrib.model.gbdt
+ kwargs:
+ loss: mse
+ colsample_bytree: 0.8879
+ learning_rate: 0.2
+ subsample: 0.8789
+ lambda_l1: 205.6999
+ lambda_l2: 580.9768
+ max_depth: 8
+ num_leaves: 210
+ num_threads: 20
+ dataset:
+ class: DatasetH
+ module_path: qlib.data.dataset
+ kwargs:
+ handler:
+ class: Alpha158
+ module_path: qlib.contrib.data.handler
+ kwargs: *data_handler_config
+ segments:
+ train: [2008-01-01, 2014-12-31]
+ valid: [2015-01-01, 2016-12-31]
+ test: [2017-01-01, 2020-08-01]
+ record:
+ - class: SignalRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ model:
+ dataset:
+ - class: SigAnaRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ ana_long_short: False
+ ann_scaler: 252
+ - class: PortAnaRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ config: *port_analysis_config
diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml
new file mode 100644
index 000000000..a5c272f28
--- /dev/null
+++ b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml
@@ -0,0 +1,77 @@
+qlib_init:
+ provider_uri: "~/.qlib/qlib_data/cn_data"
+ region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+ start_time: 2008-01-01
+ end_time: 2020-08-01
+ fit_start_time: 2008-01-01
+ fit_end_time: 2014-12-31
+ instruments: *market
+ infer_processors:
+ - class: RobustZScoreNorm
+ kwargs:
+ fields_group: feature
+ clip_outlier: true
+ - class: Fillna
+ kwargs:
+ fields_group: feature
+ learn_processors:
+ - class: DropnaLabel
+ - class: CSRankNorm
+ kwargs:
+ fields_group: label
+port_analysis_config: &port_analysis_config
+ strategy:
+ class: TopkDropoutStrategy
+ module_path: qlib.contrib.strategy
+ kwargs:
+ signal:
+ topk: 50
+ n_drop: 5
+ backtest:
+ start_time: 2017-01-01
+ end_time: 2020-08-01
+ account: 100000000
+ benchmark: *benchmark
+ exchange_kwargs:
+ limit_threshold: 0.095
+ deal_price: close
+ open_cost: 0.0005
+ close_cost: 0.0015
+ min_cost: 5
+task:
+ model:
+ class: LinearModel
+ module_path: qlib.contrib.model.linear
+ kwargs:
+ estimator: ridge
+ alpha: 0.05
+ dataset:
+ class: DatasetH
+ module_path: qlib.data.dataset
+ kwargs:
+ handler:
+ class: Alpha158
+ module_path: qlib.contrib.data.handler
+ kwargs: *data_handler_config
+ segments:
+ train: [2008-01-01, 2014-12-31]
+ valid: [2015-01-01, 2016-12-31]
+ test: [2017-01-01, 2020-08-01]
+ record:
+ - class: SignalRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ model:
+ dataset:
+ - class: SigAnaRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ ana_long_short: True
+ ann_scaler: 252
+ - class: PortAnaRecord
+ module_path: qlib.workflow.record_temp
+ kwargs:
+ config: *port_analysis_config
diff --git a/examples/highfreq/highfreq_handler.py b/examples/highfreq/highfreq_handler.py
index c15c3ec41..7df564b7b 100644
--- a/examples/highfreq/highfreq_handler.py
+++ b/examples/highfreq/highfreq_handler.py
@@ -14,7 +14,6 @@ class HighFreqHandler(DataHandlerLP):
fit_end_time=None,
drop_raw=True,
):
-
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py
index c631d72e7..02948c5a1 100644
--- a/examples/highfreq/workflow.py
+++ b/examples/highfreq/workflow.py
@@ -18,7 +18,6 @@ from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Se
class HighfreqWorkflow:
-
SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull, Cut], "expression_cache": None}
MARKET = "all"
diff --git a/examples/hyperparameter/LightGBM/hyperparameter_158.py b/examples/hyperparameter/LightGBM/hyperparameter_158.py
index 8c3e9f3e8..7520390a6 100644
--- a/examples/hyperparameter/LightGBM/hyperparameter_158.py
+++ b/examples/hyperparameter/LightGBM/hyperparameter_158.py
@@ -35,7 +35,6 @@ def objective(trial):
if __name__ == "__main__":
-
provider_uri = "~/.qlib/qlib_data/cn_data"
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region="cn")
diff --git a/examples/hyperparameter/LightGBM/hyperparameter_360.py b/examples/hyperparameter/LightGBM/hyperparameter_360.py
index 322c0fa42..7ba28c78f 100644
--- a/examples/hyperparameter/LightGBM/hyperparameter_360.py
+++ b/examples/hyperparameter/LightGBM/hyperparameter_360.py
@@ -38,7 +38,6 @@ def objective(trial):
if __name__ == "__main__":
-
provider_uri = "~/.qlib/qlib_data/cn_data"
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN)
diff --git a/examples/model_interpreter/feature.py b/examples/model_interpreter/feature.py
index bfc58fc84..8ad673d0e 100644
--- a/examples/model_interpreter/feature.py
+++ b/examples/model_interpreter/feature.py
@@ -11,7 +11,6 @@ from qlib.tests.config import CSI300_GBDT_TASK
if __name__ == "__main__":
-
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
diff --git a/examples/portfolio/prepare_riskdata.py b/examples/portfolio/prepare_riskdata.py
index 3168e2f37..e502a1ff7 100644
--- a/examples/portfolio/prepare_riskdata.py
+++ b/examples/portfolio/prepare_riskdata.py
@@ -9,7 +9,6 @@ from qlib.model.riskmodel import StructuredCovEstimator
def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):
-
universe = D.features(D.instruments("csi300"), ["$close"], start_time=start_time).swaplevel().sort_index()
price_all = (
@@ -20,7 +19,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):
riskmodel = StructuredCovEstimator()
for i in range(T - 1, len(price_all)):
-
date = price_all.index[i]
ref_date = price_all.index[i - T + 1]
@@ -47,7 +45,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"):
if __name__ == "__main__":
-
import qlib
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")
diff --git a/examples/rolling_process_data/workflow.py b/examples/rolling_process_data/workflow.py
index 434d365e5..d1c03866a 100644
--- a/examples/rolling_process_data/workflow.py
+++ b/examples/rolling_process_data/workflow.py
@@ -13,7 +13,6 @@ from qlib.tests.data import GetData
class RollingDataWorkflow:
-
MARKET = "csi300"
start_time = "2010-01-01"
end_time = "2019-12-31"
@@ -93,7 +92,6 @@ class RollingDataWorkflow:
dataset = init_instance_by_config(dataset_config)
for rolling_offset in range(self.rolling_cnt):
-
print(f"===========rolling{rolling_offset} start===========")
if rolling_offset:
dataset.config(
diff --git a/examples/workflow_by_code.py b/examples/workflow_by_code.py
index 0c4d73a51..94de5c082 100644
--- a/examples/workflow_by_code.py
+++ b/examples/workflow_by_code.py
@@ -17,7 +17,6 @@ from qlib.tests.config import CSI300_BENCH, CSI300_GBDT_TASK
if __name__ == "__main__":
-
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
diff --git a/qlib/__init__.py b/qlib/__init__.py
index 11d22cc23..3355ac04f 100644
--- a/qlib/__init__.py
+++ b/qlib/__init__.py
@@ -2,7 +2,7 @@
# Licensed under the MIT License.
from pathlib import Path
-__version__ = "0.9.1.99"
+__version__ = "0.9.2.99"
__version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version
import os
from typing import Union
@@ -77,7 +77,6 @@ def init(default_conf="client", **kwargs):
def _mount_nfs_uri(provider_uri, mount_path, auto_mount: bool = False):
-
LOG = get_module_logger("mount nfs", level=logging.INFO)
if mount_path is None:
raise ValueError(f"Invalid mount path: {mount_path}!")
diff --git a/qlib/backtest/__init__.py b/qlib/backtest/__init__.py
index bb8ca731b..d784aed57 100644
--- a/qlib/backtest/__init__.py
+++ b/qlib/backtest/__init__.py
@@ -182,7 +182,6 @@ def get_strategy_executor(
exchange_kwargs: dict = {},
pos_type: str = "Position",
) -> Tuple[BaseStrategy, BaseExecutor]:
-
# NOTE:
# - for avoiding recursive import
# - typing annotations is not reliable
diff --git a/qlib/backtest/exchange.py b/qlib/backtest/exchange.py
index a752a9f8c..1ab0d07a7 100644
--- a/qlib/backtest/exchange.py
+++ b/qlib/backtest/exchange.py
@@ -638,7 +638,6 @@ class Exchange:
random.seed(0)
random.shuffle(sorted_ids)
for stock_id in sorted_ids:
-
# Do not generate order for the non-tradable stocks
if not self.is_stock_tradable(stock_id=stock_id, start_time=start_time, end_time=end_time):
continue
diff --git a/qlib/config.py b/qlib/config.py
index 7b726c658..7910dab73 100644
--- a/qlib/config.py
+++ b/qlib/config.py
@@ -293,7 +293,6 @@ class QlibConfig(Config):
"""
def __init__(self, provider_uri: Union[str, Path, dict], mount_path: Union[str, Path, dict]):
-
"""
The relation of `provider_uri` and `mount_path`
- `mount_path` is used only if provider_uri is an NFS path
diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py
index 9ce522cc0..8b40dba1f 100644
--- a/qlib/contrib/data/dataset.py
+++ b/qlib/contrib/data/dataset.py
@@ -130,7 +130,6 @@ class MTSDatasetH(DatasetH):
input_size=None,
**kwargs,
):
-
assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage"
assert memory_mode in ["sample", "daily"], "unsupported memory mode"
assert memory_mode == "sample" or batch_size < 0, "daily memory requires daily sampling (`batch_size < 0`)"
@@ -153,7 +152,6 @@ class MTSDatasetH(DatasetH):
super().__init__(handler, segments, **kwargs)
def setup_data(self, handler_kwargs: dict = None, **kwargs):
-
super().setup_data(**kwargs)
if handler_kwargs is not None:
@@ -288,7 +286,6 @@ class MTSDatasetH(DatasetH):
daily_count = [] # store number of samples for each day
for j in indices[i : i + batch_size]:
-
# normal sampling: self.batch_size > 0 => slices is a list => slices_subset is a slice
# daily sampling: self.batch_size < 0 => slices is a nested list => slices_subset is a list
slices_subset = slices[j]
@@ -297,7 +294,6 @@ class MTSDatasetH(DatasetH):
# each slices_subset contains a list of slices for multiple stocks
# NOTE: daily sampling is used in 1) eval mode, 2) train mode with self.batch_size < 0
if self.batch_size < 0:
-
# store daily index
idx = self._daily_index.index[j] # daily_index.index is the index of the original data
daily_index.append(idx)
@@ -320,7 +316,6 @@ class MTSDatasetH(DatasetH):
slices_subset = [slices_subset]
for slc in slices_subset:
-
# legacy support for Alpha360 data by `input_size`
if self.input_size:
data.append(self._data[slc.stop - 1].reshape(self.input_size, -1).T)
diff --git a/qlib/contrib/data/highfreq_handler.py b/qlib/contrib/data/highfreq_handler.py
index 638fbf0e8..8eed4814f 100644
--- a/qlib/contrib/data/highfreq_handler.py
+++ b/qlib/contrib/data/highfreq_handler.py
@@ -17,7 +17,6 @@ class HighFreqHandler(DataHandlerLP):
fit_end_time=None,
drop_raw=True,
):
-
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
@@ -318,7 +317,6 @@ class HighFreqOrderHandler(DataHandlerLP):
inst_processors=None,
drop_raw=True,
):
-
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
diff --git a/qlib/contrib/data/highfreq_processor.py b/qlib/contrib/data/highfreq_processor.py
index f7041e9f4..db2a6e39b 100644
--- a/qlib/contrib/data/highfreq_processor.py
+++ b/qlib/contrib/data/highfreq_processor.py
@@ -29,7 +29,6 @@ class HighFreqNorm(Processor):
feature_save_dir: str,
norm_groups: Dict[str, int],
):
-
self.fit_start_time = fit_start_time
self.fit_end_time = fit_end_time
self.feature_save_dir = feature_save_dir
diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py
index e3689d964..9349a12fe 100644
--- a/qlib/contrib/meta/data_selection/dataset.py
+++ b/qlib/contrib/meta/data_selection/dataset.py
@@ -49,6 +49,8 @@ class InternalData:
# 1) prepare the prediction of proxy models
perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects
+ # The only thing we want to save is the prediction
+ perf_task_tpl["record"] = ["qlib.workflow.record_temp.SignalRecord"]
trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs)
# NOTE:
diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py
index 4b0db7f4b..ca5e8ba86 100644
--- a/qlib/contrib/model/pytorch_adarnn.py
+++ b/qlib/contrib/model/pytorch_adarnn.py
@@ -246,7 +246,6 @@ class ADARNN(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid = dataset.prepare(
["train", "valid"],
col_set=["feature", "label"],
@@ -318,7 +317,6 @@ class ADARNN(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py
index b0770e2bd..2fe7cce3b 100644
--- a/qlib/contrib/model/pytorch_alstm.py
+++ b/qlib/contrib/model/pytorch_alstm.py
@@ -146,7 +146,6 @@ class ALSTM(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -155,7 +154,6 @@ class ALSTM(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -165,7 +163,6 @@ class ALSTM(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -181,7 +178,6 @@ class ALSTM(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -194,7 +190,6 @@ class ALSTM(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -217,7 +212,6 @@ class ALSTM(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
@@ -282,7 +276,6 @@ class ALSTM(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py
index 3ab8ed8ab..008d78940 100644
--- a/qlib/contrib/model/pytorch_alstm_ts.py
+++ b/qlib/contrib/model/pytorch_alstm_ts.py
@@ -156,7 +156,6 @@ class ALSTM(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -165,10 +164,9 @@ class ALSTM(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.ALSTM_model.train()
- for (data, weight) in data_loader:
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -181,14 +179,12 @@ class ALSTM(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.ALSTM_model.eval()
scores = []
losses = []
- for (data, weight) in data_loader:
-
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
@@ -295,7 +291,6 @@ class ALSTM(Model):
preds = []
for data in test_loader:
-
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py
index 127408877..63ebd480a 100644
--- a/qlib/contrib/model/pytorch_gats.py
+++ b/qlib/contrib/model/pytorch_gats.py
@@ -154,7 +154,6 @@ class GATs(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -175,7 +174,6 @@ class GATs(Model):
return daily_index, daily_count
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
self.GAT_model.train()
@@ -197,7 +195,6 @@ class GATs(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -230,7 +227,6 @@ class GATs(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py
index 1b75efe89..b1239f78e 100644
--- a/qlib/contrib/model/pytorch_gats_ts.py
+++ b/qlib/contrib/model/pytorch_gats_ts.py
@@ -32,7 +32,6 @@ class DailyBatchSampler(Sampler):
self.daily_index[0] = 0
def __iter__(self):
-
for idx, count in zip(self.daily_index, self.daily_count):
yield np.arange(idx, idx + count)
@@ -173,7 +172,6 @@ class GATs(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -194,11 +192,9 @@ class GATs(Model):
return daily_index, daily_count
def train_epoch(self, data_loader):
-
self.GAT_model.train()
for data in data_loader:
-
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -212,14 +208,12 @@ class GATs(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.GAT_model.eval()
scores = []
losses = []
for data in data_loader:
-
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
@@ -240,7 +234,6 @@ class GATs(Model):
evals_result=dict(),
save_path=None,
):
-
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
if dl_train.empty or dl_valid.empty:
@@ -329,7 +322,6 @@ class GATs(Model):
preds = []
for data in test_loader:
-
data = data.squeeze()
feature = data[:, :, 0:-1].to(self.device)
diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py
index 10998236b..2a476a657 100755
--- a/qlib/contrib/model/pytorch_gru.py
+++ b/qlib/contrib/model/pytorch_gru.py
@@ -146,7 +146,6 @@ class GRU(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -155,7 +154,6 @@ class GRU(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -165,7 +163,6 @@ class GRU(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -181,7 +178,6 @@ class GRU(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -194,7 +190,6 @@ class GRU(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -217,7 +212,6 @@ class GRU(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
@@ -282,7 +276,6 @@ class GRU(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py
index b588392a2..2e5076ea6 100755
--- a/qlib/contrib/model/pytorch_gru_ts.py
+++ b/qlib/contrib/model/pytorch_gru_ts.py
@@ -154,7 +154,6 @@ class GRU(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -163,10 +162,9 @@ class GRU(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.GRU_model.train()
- for (data, weight) in data_loader:
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -179,14 +177,12 @@ class GRU(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.GRU_model.eval()
scores = []
losses = []
- for (data, weight) in data_loader:
-
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
@@ -293,7 +289,6 @@ class GRU(Model):
preds = []
for data in test_loader:
-
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
diff --git a/qlib/contrib/model/pytorch_hist.py b/qlib/contrib/model/pytorch_hist.py
index f7b565dc5..5c3cd66a3 100644
--- a/qlib/contrib/model/pytorch_hist.py
+++ b/qlib/contrib/model/pytorch_hist.py
@@ -160,7 +160,6 @@ class HIST(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric == "ic":
@@ -189,7 +188,6 @@ class HIST(Model):
return daily_index, daily_count
def train_epoch(self, x_train, y_train, stock_index):
-
stock2concept_matrix = np.load(self.stock2concept)
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -214,7 +212,6 @@ class HIST(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y, stock_index):
-
# prepare training data
stock2concept_matrix = np.load(self.stock2concept)
x_values = data_x.values
diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py
index d38ef9ad4..46a25c00f 100644
--- a/qlib/contrib/model/pytorch_igmtf.py
+++ b/qlib/contrib/model/pytorch_igmtf.py
@@ -153,7 +153,6 @@ class IGMTF(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric == "ic":
@@ -201,7 +200,6 @@ class IGMTF(Model):
return train_hidden, train_hidden_day
def train_epoch(self, x_train, y_train, train_hidden, train_hidden_day):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -222,7 +220,6 @@ class IGMTF(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y, train_hidden, train_hidden_day):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -254,7 +251,6 @@ class IGMTF(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid = dataset.prepare(
["train", "valid"],
col_set=["feature", "label"],
diff --git a/qlib/contrib/model/pytorch_krnn.py b/qlib/contrib/model/pytorch_krnn.py
new file mode 100644
index 000000000..7c252672d
--- /dev/null
+++ b/qlib/contrib/model/pytorch_krnn.py
@@ -0,0 +1,511 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+from typing import Text, Union
+import copy
+from ...utils import get_or_create_path
+from ...log import get_module_logger
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+
+########################################################################
+########################################################################
+########################################################################
+
+
+class CNNEncoderBase(nn.Module):
+    def __init__(self, input_dim, output_dim, kernel_size, device):
+        """Build a basic CNN encoder
+
+        Parameters
+        ----------
+        input_dim : int
+            The input dimension (number of features per time step)
+        output_dim : int
+            The output dimension (number of convolution channels)
+        kernel_size : int
+            The size of convolutional kernels
+        device : torch.device or str
+            The device the input is moved to inside ``forward``
+        """
+        super().__init__()
+
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.kernel_size = kernel_size
+        self.device = device
+
+        # "same" padding: the sequence length is preserved only when
+        # kernel_size is odd (dilation and stride are left at their default of 1)
+        same_padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv1d(input_dim, output_dim, kernel_size, padding=same_padding)
+
+    def forward(self, x):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            input data, viewed as [batch_size, seq_len, input_dim]
+
+        Returns
+        -------
+        torch.Tensor
+            Updated representations, [batch_size, conved_seq_len, output_dim]
+        """
+        batch_size = x.shape[0]
+
+        # [batch_size, seq_len*input_dim] -> [batch_size, seq_len, input_dim]
+        # NOTE(review): this treats the flat feature vector as time-major; confirm against the data handler
+        windows = x.view(batch_size, -1, self.input_dim)
+
+        # Conv1d convolves over the last axis, so put the channels in the middle
+        channels_first = windows.permute(0, 2, 1).to(self.device)  # [batch_size, input_dim, seq_len]
+        conved = self.conv(channels_first)  # [batch_size, output_dim, conved_seq_len]
+
+        return conved.permute(0, 2, 1)  # [batch_size, conved_seq_len, output_dim]
+
+
+class KRNNEncoderBase(nn.Module):
+    def __init__(self, input_dim, output_dim, dup_num, rnn_layers, dropout, device):
+        """Build K parallel RNNs
+
+        Parameters
+        ----------
+        input_dim : int
+            The input dimension
+        output_dim : int
+            The output dimension (GRU hidden size)
+        dup_num : int
+            The number of parallel RNNs
+        rnn_layers: int
+            The number of RNN layers
+        dropout : float
+            The dropout value handed to every GRU
+        device : torch.device or str
+            The device the input is moved to inside ``forward``
+        """
+        super().__init__()
+
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.dup_num = dup_num
+        self.rnn_layers = rnn_layers
+        self.dropout = dropout
+        self.device = device
+
+        # dup_num independent GRUs that all read the same input
+        self.rnn_modules = nn.ModuleList(
+            [nn.GRU(input_dim, output_dim, num_layers=self.rnn_layers, dropout=dropout) for _ in range(dup_num)]
+        )
+
+    def forward(self, x):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input data, [batch_size, seq_len, input_dim]
+
+        Returns
+        -------
+        torch.Tensor
+            Updated representations, [batch_size, seq_len, output_dim]
+        """
+        batch_size, seq_len, _ = x.shape
+
+        # the GRUs are not batch_first: [seq_len, batch_size, input_dim]
+        seq_first = x.permute(1, 0, 2).to(self.device)
+
+        # every element is [seq_len, batch_size, output_dim]
+        per_rnn = [rnn(seq_first)[0] for rnn in self.rnn_modules]
+
+        # [seq_len, batch_size, output_dim, num_dups]
+        stacked = torch.stack(per_rnn, dim=-1)
+        stacked = stacked.view(seq_len, batch_size, self.output_dim, self.dup_num)
+
+        # average the duplicates, then go back to batch-first layout
+        averaged = stacked.mean(dim=3)
+        return averaged.permute(1, 0, 2)
+
+
+class CNNKRNNEncoder(nn.Module):
+    def __init__(
+        self, cnn_input_dim, cnn_output_dim, cnn_kernel_size, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device
+    ):
+        """Build an encoder composed of CNN and KRNN
+
+        Parameters
+        ----------
+        cnn_input_dim : int
+            The input dimension of CNN
+        cnn_output_dim : int
+            The output dimension of CNN (also the input dimension of KRNN)
+        cnn_kernel_size : int
+            The size of convolutional kernels
+        rnn_output_dim : int
+            The output dimension of KRNN
+        rnn_dup_num : int
+            The number of parallel duplicates for KRNN
+        rnn_layers : int
+            The number of RNN layers
+        dropout : float
+            The dropout value forwarded to KRNN
+        device : torch.device or str
+            The device forwarded to both sub-encoders
+        """
+        super().__init__()
+
+        # stage 1: convolution over the time axis
+        self.cnn_encoder = CNNEncoderBase(cnn_input_dim, cnn_output_dim, cnn_kernel_size, device)
+        # stage 2: parallel GRUs fed with the CNN output
+        self.krnn_encoder = KRNNEncoderBase(cnn_output_dim, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device)
+
+    def forward(self, x):
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input data
+
+        Returns
+        -------
+        torch.Tensor
+            Updated representations
+        """
+        return self.krnn_encoder(self.cnn_encoder(x))
+
+
+class KRNNModel(nn.Module):
+    def __init__(self, fea_dim, cnn_dim, cnn_kernel_size, rnn_dim, rnn_dups, rnn_layers, dropout, device, **params):
+        """Build a KRNN model
+
+        Parameters
+        ----------
+        fea_dim : int
+            The feature dimension
+        cnn_dim : int
+            The hidden dimension of CNN
+        cnn_kernel_size : int
+            The size of convolutional kernels
+        rnn_dim : int
+            The hidden dimension of KRNN
+        rnn_dups : int
+            The number of parallel duplicates
+        rnn_layers: int
+            The number of RNN layers
+        dropout : float
+            The dropout value forwarded to the encoder
+        device : torch.device or str
+            The device forwarded to the encoder; the output is moved to it as well
+        **params
+            Extra keyword arguments are accepted and ignored
+        """
+        super().__init__()
+
+        self.encoder = CNNKRNNEncoder(
+            cnn_input_dim=fea_dim,
+            cnn_output_dim=cnn_dim,
+            cnn_kernel_size=cnn_kernel_size,
+            rnn_output_dim=rnn_dim,
+            rnn_dup_num=rnn_dups,
+            rnn_layers=rnn_layers,
+            dropout=dropout,
+            device=device,
+        )
+
+        self.out_fc = nn.Linear(rnn_dim, 1)
+        self.device = device
+
+    def forward(self, x):
+        """Predict one score per sample.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input data, [batch_size, seq_len * fea_dim] (reshaped by the encoder)
+
+        Returns
+        -------
+        torch.Tensor
+            Predictions, [batch_size]
+        """
+        encode = self.encoder(x)  # [batch_size, seq_len, rnn_dim]
+
+        # Only the representation of the last time step is used for the prediction.
+        # squeeze(-1) removes just the trailing unit dimension of the linear layer; a bare
+        # squeeze() would also drop the batch dimension for a one-sample batch and return a
+        # 0-d tensor, which breaks boolean masking in the loss and np.concatenate in predict.
+        out = self.out_fc(encode[:, -1, :]).squeeze(-1).to(self.device)
+
+        return out
+
+
+class KRNN(Model):
+    """KRNN Model
+
+    Parameters
+    ----------
+    fea_dim : int
+        input dimension for each time step
+    cnn_dim : int
+        hidden dimension of the CNN
+    cnn_kernel_size : int
+        size of the convolutional kernels
+    rnn_dim : int
+        hidden dimension of the KRNN
+    rnn_dups : int
+        number of parallel RNN duplicates
+    rnn_layers : int
+        number of RNN layers
+    dropout : float
+        dropout value forwarded to the network
+    n_epochs : int
+        maximum number of training epochs
+    lr : float
+        learning rate
+    metric: str
+        the evaluation metric used in early stop ("" or "loss")
+    batch_size : int
+        number of samples per batch
+    early_stop : int
+        number of consecutive epochs without improvement after which training stops
+    loss : str
+        loss name, only "mse" is supported
+    optimizer : str
+        optimizer name ("adam" or "gd")
+    GPU : int
+        the GPU ID used for training; a negative value selects the CPU
+    seed : int, optional
+        seed for numpy and torch; nothing is seeded when it is None
+    """
+
+    def __init__(
+        self,
+        fea_dim=6,
+        cnn_dim=64,
+        cnn_kernel_size=3,
+        rnn_dim=64,
+        rnn_dups=3,
+        rnn_layers=2,
+        dropout=0,
+        n_epochs=200,
+        lr=0.001,
+        metric="",
+        batch_size=2000,
+        early_stop=20,
+        loss="mse",
+        optimizer="adam",
+        GPU=0,
+        seed=None,
+        **kwargs
+    ):
+        # Set logger.
+        self.logger = get_module_logger("KRNN")
+        self.logger.info("KRNN pytorch version...")
+
+        # set hyper-parameters.
+        self.fea_dim = fea_dim
+        self.cnn_dim = cnn_dim
+        self.cnn_kernel_size = cnn_kernel_size
+        self.rnn_dim = rnn_dim
+        self.rnn_dups = rnn_dups
+        self.rnn_layers = rnn_layers
+        self.dropout = dropout
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.metric = metric
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.optimizer = optimizer.lower()
+        self.loss = loss
+        self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
+        self.seed = seed
+
+        self.logger.info(
+            "KRNN parameters setting:"
+            "\nfea_dim : {}"
+            "\ncnn_dim : {}"
+            "\ncnn_kernel_size : {}"
+            "\nrnn_dim : {}"
+            "\nrnn_dups : {}"
+            "\nrnn_layers : {}"
+            "\ndropout : {}"
+            "\nn_epochs : {}"
+            "\nlr : {}"
+            "\nmetric : {}"
+            "\nbatch_size: {}"
+            "\nearly_stop : {}"
+            "\noptimizer : {}"
+            "\nloss_type : {}"
+            "\nvisible_GPU : {}"
+            "\nuse_GPU : {}"
+            "\nseed : {}".format(
+                fea_dim,
+                cnn_dim,
+                cnn_kernel_size,
+                rnn_dim,
+                rnn_dups,
+                rnn_layers,
+                dropout,
+                n_epochs,
+                lr,
+                metric,
+                batch_size,
+                early_stop,
+                optimizer.lower(),
+                loss,
+                GPU,
+                self.use_gpu,
+                seed,
+            )
+        )
+
+        # seed before the network is built so that the initial weights are reproducible
+        if self.seed is not None:
+            np.random.seed(self.seed)
+            torch.manual_seed(self.seed)
+
+        self.krnn_model = KRNNModel(
+            fea_dim=self.fea_dim,
+            cnn_dim=self.cnn_dim,
+            cnn_kernel_size=self.cnn_kernel_size,
+            rnn_dim=self.rnn_dim,
+            rnn_dups=self.rnn_dups,
+            rnn_layers=self.rnn_layers,
+            dropout=self.dropout,
+            device=self.device,
+        )
+        if optimizer.lower() == "adam":
+            self.train_optimizer = optim.Adam(self.krnn_model.parameters(), lr=self.lr)
+        elif optimizer.lower() == "gd":
+            self.train_optimizer = optim.SGD(self.krnn_model.parameters(), lr=self.lr)
+        else:
+            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+        self.fitted = False
+        self.krnn_model.to(self.device)
+
+    @property
+    def use_gpu(self):
+        """Whether the selected device is anything other than the CPU."""
+        return self.device != torch.device("cpu")
+
+    def mse(self, pred, label):
+        """Mean squared error between ``pred`` and ``label``."""
+        loss = (pred - label) ** 2
+        return torch.mean(loss)
+
+    def loss_fn(self, pred, label):
+        """Compute the configured loss on the samples whose label is not NaN.
+
+        Raises
+        ------
+        ValueError
+            If ``self.loss`` is not "mse".
+        """
+        mask = ~torch.isnan(label)
+
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+
+    def metric_fn(self, pred, label):
+        """Compute the early-stop score (higher is better) on samples with a finite label.
+
+        Raises
+        ------
+        ValueError
+            If ``self.metric`` is neither "" nor "loss".
+        """
+        mask = torch.isfinite(label)
+
+        if self.metric in ("", "loss"):
+            # the score is the negated loss so that a larger value means a better model
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def get_daily_inter(self, df, shuffle=False):
+        """Return the start offset and the size of every group of index level 0 in ``df``.
+
+        When ``shuffle`` is True the (offset, size) pairs are shuffled together.
+        """
+        # organize the train data into daily batches
+        daily_count = df.groupby(level=0).size().values
+        daily_index = np.roll(np.cumsum(daily_count), 1)
+        daily_index[0] = 0
+        if shuffle:
+            # shuffle data
+            daily_shuffle = list(zip(daily_index, daily_count))
+            np.random.shuffle(daily_shuffle)
+            daily_index, daily_count = zip(*daily_shuffle)
+        return daily_index, daily_count
+
+    def train_epoch(self, x_train, y_train):
+        """Run one optimisation pass over the shuffled training samples."""
+        x_train_values = x_train.values
+        y_train_values = np.squeeze(y_train.values)
+        self.krnn_model.train()
+
+        indices = np.arange(len(x_train_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+            # a trailing batch with fewer than batch_size samples is skipped
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            pred = self.krnn_model(feature)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.krnn_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_x, data_y):
+        """Evaluate the model and return ``(mean loss, mean score)`` over the full batches.
+
+        A trailing batch with fewer than batch_size samples is skipped, so both means
+        are NaN when the data holds fewer than batch_size samples.
+        """
+        # prepare evaluation data
+        x_values = data_x.values
+        y_values = np.squeeze(data_y.values)
+
+        self.krnn_model.eval()
+
+        scores = []
+        losses = []
+
+        indices = np.arange(len(x_values))
+
+        for i in range(len(indices))[:: self.batch_size]:
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            # evaluation only: no autograd graph is needed
+            with torch.no_grad():
+                pred = self.krnn_model(feature)
+                loss = self.loss_fn(pred, label)
+                score = self.metric_fn(pred, label)
+
+            losses.append(loss.item())
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        evals_result=None,
+        save_path=None,
+    ):
+        """Train the model with early stopping on the validation score.
+
+        Parameters
+        ----------
+        dataset : DatasetH
+            dataset providing the "train" and "valid" segments
+        evals_result : dict, optional
+            filled with the per-epoch scores under the keys "train" and "valid";
+            a new dict is used when it is None
+        save_path : str, optional
+            passed to ``get_or_create_path``; the best parameters are saved to the result
+
+        Raises
+        ------
+        ValueError
+            If the train or the valid segment is empty.
+        """
+        # a fresh dict per call instead of one shared mutable default
+        if evals_result is None:
+            evals_result = {}
+
+        # only the segments that are used for fitting are prepared
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
+        )
+        if df_train.empty or df_valid.empty:
+            raise ValueError("Empty data from dataset, please check your dataset config.")
+
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        save_path = get_or_create_path(save_path)
+        stop_steps = 0
+        best_score = -np.inf
+        best_epoch = 0
+        # start from the current weights so that best_param is defined even if no epoch
+        # ever improves on best_score (n_epochs == 0 or a NaN validation score)
+        best_param = copy.deepcopy(self.krnn_model.state_dict())
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # train
+        self.logger.info("training...")
+        self.fitted = True
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(x_train, y_train)
+            self.logger.info("evaluating...")
+            _, train_score = self.test_epoch(x_train, y_train)
+            _, val_score = self.test_epoch(x_valid, y_valid)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.krnn_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.krnn_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
+        """Predict the given segment and return a ``pd.Series`` indexed like the prepared features.
+
+        Raises
+        ------
+        ValueError
+            If the model has not been fitted.
+        """
+        if not self.fitted:
+            raise ValueError("model is not fitted yet!")
+
+        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
+        index = x_test.index
+        self.krnn_model.eval()
+        x_values = x_test.values
+        sample_num = x_values.shape[0]
+        preds = []
+
+        for begin in range(sample_num)[:: self.batch_size]:
+            # unlike training, the trailing partial batch is predicted as well
+            if sample_num - begin < self.batch_size:
+                end = sample_num
+            else:
+                end = begin + self.batch_size
+            x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
+            with torch.no_grad():
+                pred = self.krnn_model(x_batch).detach().cpu().numpy()
+            preds.append(pred)
+
+        return pd.Series(np.concatenate(preds), index=index)
diff --git a/qlib/contrib/model/pytorch_localformer.py b/qlib/contrib/model/pytorch_localformer.py
index 6e7d91180..830bc59f0 100644
--- a/qlib/contrib/model/pytorch_localformer.py
+++ b/qlib/contrib/model/pytorch_localformer.py
@@ -46,7 +46,6 @@ class LocalformerModel(Model):
seed=None,
**kwargs
):
-
# set hyper-parameters.
self.d_model = d_model
self.dropout = dropout
@@ -96,7 +95,6 @@ class LocalformerModel(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -105,7 +103,6 @@ class LocalformerModel(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -115,7 +112,6 @@ class LocalformerModel(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -131,7 +127,6 @@ class LocalformerModel(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -144,7 +139,6 @@ class LocalformerModel(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -167,7 +161,6 @@ class LocalformerModel(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
@@ -232,7 +225,6 @@ class LocalformerModel(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_localformer_ts.py b/qlib/contrib/model/pytorch_localformer_ts.py
index 18ef7f112..b05c2d311 100644
--- a/qlib/contrib/model/pytorch_localformer_ts.py
+++ b/qlib/contrib/model/pytorch_localformer_ts.py
@@ -44,7 +44,6 @@ class LocalformerModel(Model):
seed=None,
**kwargs
):
-
# set hyper-parameters.
self.d_model = d_model
self.dropout = dropout
@@ -96,7 +95,6 @@ class LocalformerModel(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -105,7 +103,6 @@ class LocalformerModel(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.model.train()
for data in data_loader:
@@ -121,14 +118,12 @@ class LocalformerModel(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.model.eval()
scores = []
losses = []
for data in data_loader:
-
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -148,7 +143,6 @@ class LocalformerModel(Model):
evals_result=dict(),
save_path=None,
):
-
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
if dl_train.empty or dl_valid.empty:
diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py
index a68cf5eac..168be6ca5 100755
--- a/qlib/contrib/model/pytorch_lstm.py
+++ b/qlib/contrib/model/pytorch_lstm.py
@@ -142,7 +142,6 @@ class LSTM(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -151,7 +150,6 @@ class LSTM(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -161,7 +159,6 @@ class LSTM(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -177,7 +174,6 @@ class LSTM(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -190,7 +186,6 @@ class LSTM(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -212,7 +207,6 @@ class LSTM(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py
index f1a3c55e8..8ecafc2d5 100755
--- a/qlib/contrib/model/pytorch_lstm_ts.py
+++ b/qlib/contrib/model/pytorch_lstm_ts.py
@@ -150,7 +150,6 @@ class LSTM(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -159,10 +158,9 @@ class LSTM(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.LSTM_model.train()
- for (data, weight) in data_loader:
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -175,14 +173,12 @@ class LSTM(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.LSTM_model.eval()
scores = []
losses = []
- for (data, weight) in data_loader:
-
+ for data, weight in data_loader:
feature = data[:, :, 0:-1].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
@@ -288,7 +284,6 @@ class LSTM(Model):
preds = []
for data in test_loader:
-
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
diff --git a/qlib/contrib/model/pytorch_sandwich.py b/qlib/contrib/model/pytorch_sandwich.py
new file mode 100644
index 000000000..020c736fd
--- /dev/null
+++ b/qlib/contrib/model/pytorch_sandwich.py
@@ -0,0 +1,381 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+from typing import Text, Union
+import copy
+from ...utils import get_or_create_path
+from ...log import get_module_logger
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from ...model.base import Model
+from ...data.dataset import DatasetH
+from ...data.dataset.handler import DataHandlerLP
+from .pytorch_krnn import CNNKRNNEncoder
+
+
+class SandwichModel(nn.Module):
+    def __init__(
+        self,
+        fea_dim,
+        cnn_dim_1,
+        cnn_dim_2,
+        cnn_kernel_size,
+        rnn_dim_1,
+        rnn_dim_2,
+        rnn_dups,
+        rnn_layers,
+        dropout,
+        device,
+        **params
+    ):
+        """Build a Sandwich model
+
+        Parameters
+        ----------
+        fea_dim : int
+            The feature dimension
+        cnn_dim_1 : int
+            The hidden dimension of the first CNN
+        cnn_dim_2 : int
+            The hidden dimension of the second CNN
+        cnn_kernel_size : int
+            The size of convolutional kernels
+        rnn_dim_1 : int
+            The hidden dimension of the first KRNN
+        rnn_dim_2 : int
+            The hidden dimension of the second KRNN
+        rnn_dups : int
+            The number of parallel duplicates
+        rnn_layers: int
+            The number of RNN layers
+        dropout : float
+            The dropout value forwarded to both encoders
+        device : torch.device or str
+            The device forwarded to both encoders; the output is moved to it as well
+        **params
+            Extra keyword arguments are accepted and ignored
+        """
+        super().__init__()
+
+        self.first_encoder = CNNKRNNEncoder(
+            cnn_input_dim=fea_dim,
+            cnn_output_dim=cnn_dim_1,
+            cnn_kernel_size=cnn_kernel_size,
+            rnn_output_dim=rnn_dim_1,
+            rnn_dup_num=rnn_dups,
+            rnn_layers=rnn_layers,
+            dropout=dropout,
+            device=device,
+        )
+
+        # the second encoder consumes the output of the first one,
+        # hence its input dimension is rnn_dim_1
+        self.second_encoder = CNNKRNNEncoder(
+            cnn_input_dim=rnn_dim_1,
+            cnn_output_dim=cnn_dim_2,
+            cnn_kernel_size=cnn_kernel_size,
+            rnn_output_dim=rnn_dim_2,
+            rnn_dup_num=rnn_dups,
+            rnn_layers=rnn_layers,
+            dropout=dropout,
+            device=device,
+        )
+
+        self.out_fc = nn.Linear(rnn_dim_2, 1)
+        self.device = device
+
+    def forward(self, x):
+        """Predict one score per sample.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input data; the first encoder views it as [batch_size, seq_len, fea_dim]
+
+        Returns
+        -------
+        torch.Tensor
+            Predictions, [batch_size]
+        """
+        encode = self.first_encoder(x)
+        encode = self.second_encoder(encode)
+
+        # Only the representation of the last time step is used for the prediction.
+        # squeeze(-1) removes just the trailing unit dimension of the linear layer; a bare
+        # squeeze() would also drop the batch dimension for a one-sample batch and return a
+        # 0-d tensor, which breaks boolean masking in the loss and np.concatenate in predict.
+        out = self.out_fc(encode[:, -1, :]).squeeze(-1).to(self.device)
+
+        return out
+
+
+class Sandwich(Model):
+    """Sandwich Model
+
+    Parameters
+    ----------
+    fea_dim : int
+        input dimension for each time step
+    cnn_dim_1 : int
+        hidden dimension of the first CNN
+    cnn_dim_2 : int
+        hidden dimension of the second CNN
+    cnn_kernel_size : int
+        size of the convolutional kernels
+    rnn_dim_1 : int
+        hidden dimension of the first KRNN
+    rnn_dim_2 : int
+        hidden dimension of the second KRNN
+    rnn_dups : int
+        number of parallel RNN duplicates
+    rnn_layers : int
+        number of RNN layers
+    dropout : float
+        dropout value forwarded to the network
+    n_epochs : int
+        maximum number of training epochs
+    lr : float
+        learning rate
+    metric: str
+        the evaluation metric used in early stop ("" or "loss")
+    batch_size : int
+        number of samples per batch
+    early_stop : int
+        number of consecutive epochs without improvement after which training stops
+    loss : str
+        loss name, only "mse" is supported
+    optimizer : str
+        optimizer name ("adam" or "gd")
+    GPU : int
+        the GPU ID used for training; a negative value selects the CPU
+    seed : int, optional
+        seed for numpy and torch; nothing is seeded when it is None
+    """
+
+    def __init__(
+        self,
+        fea_dim=6,
+        cnn_dim_1=64,
+        cnn_dim_2=32,
+        cnn_kernel_size=3,
+        rnn_dim_1=16,
+        rnn_dim_2=8,
+        rnn_dups=3,
+        rnn_layers=2,
+        dropout=0,
+        n_epochs=200,
+        lr=0.001,
+        metric="",
+        batch_size=2000,
+        early_stop=20,
+        loss="mse",
+        optimizer="adam",
+        GPU=0,
+        seed=None,
+        **kwargs
+    ):
+        # Set logger.
+        self.logger = get_module_logger("Sandwich")
+        self.logger.info("Sandwich pytorch version...")
+
+        # set hyper-parameters.
+        self.fea_dim = fea_dim
+        self.cnn_dim_1 = cnn_dim_1
+        self.cnn_dim_2 = cnn_dim_2
+        self.cnn_kernel_size = cnn_kernel_size
+        self.rnn_dim_1 = rnn_dim_1
+        self.rnn_dim_2 = rnn_dim_2
+        self.rnn_dups = rnn_dups
+        self.rnn_layers = rnn_layers
+        self.dropout = dropout
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.metric = metric
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.optimizer = optimizer.lower()
+        self.loss = loss
+        self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
+        self.seed = seed
+
+        self.logger.info(
+            "Sandwich parameters setting:"
+            "\nfea_dim : {}"
+            "\ncnn_dim_1 : {}"
+            "\ncnn_dim_2 : {}"
+            "\ncnn_kernel_size : {}"
+            "\nrnn_dim_1 : {}"
+            "\nrnn_dim_2 : {}"
+            "\nrnn_dups : {}"
+            "\nrnn_layers : {}"
+            "\ndropout : {}"
+            "\nn_epochs : {}"
+            "\nlr : {}"
+            "\nmetric : {}"
+            "\nbatch_size: {}"
+            "\nearly_stop : {}"
+            "\noptimizer : {}"
+            "\nloss_type : {}"
+            "\nvisible_GPU : {}"
+            "\nuse_GPU : {}"
+            "\nseed : {}".format(
+                fea_dim,
+                cnn_dim_1,
+                cnn_dim_2,
+                cnn_kernel_size,
+                rnn_dim_1,
+                rnn_dim_2,
+                rnn_dups,
+                rnn_layers,
+                dropout,
+                n_epochs,
+                lr,
+                metric,
+                batch_size,
+                early_stop,
+                optimizer.lower(),
+                loss,
+                GPU,
+                self.use_gpu,
+                seed,
+            )
+        )
+
+        # seed before the network is built so that the initial weights are reproducible
+        if self.seed is not None:
+            np.random.seed(self.seed)
+            torch.manual_seed(self.seed)
+
+        self.sandwich_model = SandwichModel(
+            fea_dim=self.fea_dim,
+            cnn_dim_1=self.cnn_dim_1,
+            cnn_dim_2=self.cnn_dim_2,
+            cnn_kernel_size=self.cnn_kernel_size,
+            rnn_dim_1=self.rnn_dim_1,
+            rnn_dim_2=self.rnn_dim_2,
+            rnn_dups=self.rnn_dups,
+            rnn_layers=self.rnn_layers,
+            dropout=self.dropout,
+            device=self.device,
+        )
+        if optimizer.lower() == "adam":
+            self.train_optimizer = optim.Adam(self.sandwich_model.parameters(), lr=self.lr)
+        elif optimizer.lower() == "gd":
+            self.train_optimizer = optim.SGD(self.sandwich_model.parameters(), lr=self.lr)
+        else:
+            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+        self.fitted = False
+        self.sandwich_model.to(self.device)
+
+    @property
+    def use_gpu(self):
+        """Whether the selected device is anything other than the CPU."""
+        return self.device != torch.device("cpu")
+
+    def mse(self, pred, label):
+        """Mean squared error between ``pred`` and ``label``."""
+        loss = (pred - label) ** 2
+        return torch.mean(loss)
+
+    def loss_fn(self, pred, label):
+        """Compute the configured loss on the samples whose label is not NaN.
+
+        Raises
+        ------
+        ValueError
+            If ``self.loss`` is not "mse".
+        """
+        mask = ~torch.isnan(label)
+
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+
+    def metric_fn(self, pred, label):
+        """Compute the early-stop score (higher is better) on samples with a finite label.
+
+        Raises
+        ------
+        ValueError
+            If ``self.metric`` is neither "" nor "loss".
+        """
+        mask = torch.isfinite(label)
+
+        if self.metric in ("", "loss"):
+            # the score is the negated loss so that a larger value means a better model
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def train_epoch(self, x_train, y_train):
+        """Run one optimisation pass over the shuffled training samples."""
+        x_train_values = x_train.values
+        y_train_values = np.squeeze(y_train.values)
+        self.sandwich_model.train()
+
+        indices = np.arange(len(x_train_values))
+        np.random.shuffle(indices)
+
+        for i in range(len(indices))[:: self.batch_size]:
+            # a trailing batch with fewer than batch_size samples is skipped
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            pred = self.sandwich_model(feature)
+            loss = self.loss_fn(pred, label)
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.sandwich_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_x, data_y):
+        """Evaluate the model and return ``(mean loss, mean score)`` over the full batches.
+
+        A trailing batch with fewer than batch_size samples is skipped, so both means
+        are NaN when the data holds fewer than batch_size samples.
+        """
+        # prepare evaluation data
+        x_values = data_x.values
+        y_values = np.squeeze(data_y.values)
+
+        self.sandwich_model.eval()
+
+        scores = []
+        losses = []
+
+        indices = np.arange(len(x_values))
+
+        for i in range(len(indices))[:: self.batch_size]:
+            if len(indices) - i < self.batch_size:
+                break
+
+            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
+            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
+
+            # evaluation only: no autograd graph is needed
+            with torch.no_grad():
+                pred = self.sandwich_model(feature)
+                loss = self.loss_fn(pred, label)
+                score = self.metric_fn(pred, label)
+
+            losses.append(loss.item())
+            scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        evals_result=None,
+        save_path=None,
+    ):
+        """Train the model with early stopping on the validation score.
+
+        Parameters
+        ----------
+        dataset : DatasetH
+            dataset providing the "train" and "valid" segments
+        evals_result : dict, optional
+            filled with the per-epoch scores under the keys "train" and "valid";
+            a new dict is used when it is None
+        save_path : str, optional
+            passed to ``get_or_create_path``; the best parameters are saved to the result
+
+        Raises
+        ------
+        ValueError
+            If the train or the valid segment is empty.
+        """
+        # a fresh dict per call instead of one shared mutable default
+        if evals_result is None:
+            evals_result = {}
+
+        # only the segments that are used for fitting are prepared
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"],
+            col_set=["feature", "label"],
+            data_key=DataHandlerLP.DK_L,
+        )
+        if df_train.empty or df_valid.empty:
+            raise ValueError("Empty data from dataset, please check your dataset config.")
+
+        x_train, y_train = df_train["feature"], df_train["label"]
+        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        save_path = get_or_create_path(save_path)
+        stop_steps = 0
+        best_score = -np.inf
+        best_epoch = 0
+        # start from the current weights so that best_param is defined even if no epoch
+        # ever improves on best_score (n_epochs == 0 or a NaN validation score)
+        best_param = copy.deepcopy(self.sandwich_model.state_dict())
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # train
+        self.logger.info("training...")
+        self.fitted = True
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(x_train, y_train)
+            self.logger.info("evaluating...")
+            _, train_score = self.test_epoch(x_train, y_train)
+            _, val_score = self.test_epoch(x_valid, y_valid)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.sandwich_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.sandwich_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
+        """Predict the given segment and return a ``pd.Series`` indexed like the prepared features.
+
+        Raises
+        ------
+        ValueError
+            If the model has not been fitted.
+        """
+        if not self.fitted:
+            raise ValueError("model is not fitted yet!")
+
+        x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
+        index = x_test.index
+        self.sandwich_model.eval()
+        x_values = x_test.values
+        sample_num = x_values.shape[0]
+        preds = []
+
+        for begin in range(sample_num)[:: self.batch_size]:
+            # unlike training, the trailing partial batch is predicted as well
+            if sample_num - begin < self.batch_size:
+                end = sample_num
+            else:
+                end = begin + self.batch_size
+            x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
+            with torch.no_grad():
+                pred = self.sandwich_model(x_batch).detach().cpu().numpy()
+            preds.append(pred)
+
+        return pd.Series(np.concatenate(preds), index=index)
diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py
index 29bae94a3..e79f475d6 100644
--- a/qlib/contrib/model/pytorch_sfm.py
+++ b/qlib/contrib/model/pytorch_sfm.py
@@ -306,7 +306,6 @@ class SFM(Model):
return self.device != torch.device("cpu")
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -319,7 +318,6 @@ class SFM(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -336,7 +334,6 @@ class SFM(Model):
return np.mean(losses), np.mean(scores)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -346,7 +343,6 @@ class SFM(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -367,7 +363,6 @@ class SFM(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid = dataset.prepare(
["train", "valid"],
col_set=["feature", "label"],
@@ -431,7 +426,6 @@ class SFM(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py
index adc7354fe..3c698edad 100644
--- a/qlib/contrib/model/pytorch_tabnet.py
+++ b/qlib/contrib/model/pytorch_tabnet.py
@@ -256,7 +256,6 @@ class TabnetModel(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
feature = x_values[indices[i : i + self.batch_size]].float().to(self.device)
@@ -283,7 +282,6 @@ class TabnetModel(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -308,7 +306,6 @@ class TabnetModel(Model):
self.tabnet_decoder.train()
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -339,7 +336,6 @@ class TabnetModel(Model):
losses = []
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py
index 2af7a04ea..38e289342 100755
--- a/qlib/contrib/model/pytorch_tcn.py
+++ b/qlib/contrib/model/pytorch_tcn.py
@@ -154,7 +154,6 @@ class TCN(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -163,7 +162,6 @@ class TCN(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -173,7 +171,6 @@ class TCN(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -200,7 +197,6 @@ class TCN(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -223,7 +219,6 @@ class TCN(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
@@ -286,7 +281,6 @@ class TCN(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py
index 4972a3065..605da62c4 100755
--- a/qlib/contrib/model/pytorch_tcn_ts.py
+++ b/qlib/contrib/model/pytorch_tcn_ts.py
@@ -155,7 +155,6 @@ class TCN(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -164,11 +163,11 @@ class TCN(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.TCN_model.train()
for data in data_loader:
- feature = data[:, :, 0:-1].to(self.device)
+ data = torch.transpose(data, 1, 2)
+ feature = data[:, 0:-1, :].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.TCN_model(feature.float())
@@ -180,15 +179,14 @@ class TCN(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.TCN_model.eval()
scores = []
losses = []
for data in data_loader:
-
- feature = data[:, :, 0:-1].to(self.device)
+ data = torch.transpose(data, 1, 2)
+ feature = data[:, 0:-1, :].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)
@@ -276,7 +274,6 @@ class TCN(Model):
preds = []
for data in test_loader:
-
feature = data[:, :, 0:-1].to(self.device)
with torch.no_grad():
diff --git a/qlib/contrib/model/pytorch_tcts.py b/qlib/contrib/model/pytorch_tcts.py
index b46835cb6..651bd03d2 100644
--- a/qlib/contrib/model/pytorch_tcts.py
+++ b/qlib/contrib/model/pytorch_tcts.py
@@ -119,7 +119,6 @@ class TCTS(Model):
)
def loss_fn(self, pred, label, weight):
-
if self.mode == "hard":
loc = torch.argmax(weight, 1)
loss = (pred - label[np.arange(weight.shape[0]), loc]) ** 2
@@ -157,7 +156,6 @@ class TCTS(Model):
for i in range(self.steps):
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -191,7 +189,6 @@ class TCTS(Model):
# fix forecasting model and valid weight model
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -212,7 +209,6 @@ class TCTS(Model):
self.weight_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -224,7 +220,6 @@ class TCTS(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -282,7 +277,6 @@ class TCTS(Model):
verbose=True,
save_path=None,
):
-
self.fore_model = GRUModel(
d_feat=self.d_feat,
hidden_size=self.hidden_size,
@@ -366,7 +360,6 @@ class TCTS(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_tra.py b/qlib/contrib/model/pytorch_tra.py
index 46d362c68..964febf11 100644
--- a/qlib/contrib/model/pytorch_tra.py
+++ b/qlib/contrib/model/pytorch_tra.py
@@ -84,7 +84,6 @@ class TRAModel(Model):
transport_method="none",
memory_mode="sample",
):
-
self.logger = get_module_logger("TRA")
assert memory_mode in ["sample", "daily"], "invalid memory mode"
@@ -136,7 +135,6 @@ class TRAModel(Model):
self._init_model()
def _init_model(self):
-
self.logger.info("init TRAModel...")
self.model = eval(self.model_type)(**self.model_config).to(device)
@@ -176,7 +174,6 @@ class TRAModel(Model):
self.global_step = -1
def train_epoch(self, epoch, data_set, is_pretrain=False):
-
self.model.train()
self.tra.train()
data_set.train()
@@ -274,7 +271,6 @@ class TRAModel(Model):
return total_loss
def test_epoch(self, epoch, data_set, return_pred=False, prefix="test", is_pretrain=False):
-
self.model.eval()
self.tra.eval()
data_set.eval()
@@ -360,7 +356,6 @@ class TRAModel(Model):
return metrics, preds, probs, P_all
def _fit(self, train_set, valid_set, test_set, evals_result, is_pretrain=True):
-
best_score = -1
best_epoch = 0
stop_rounds = 0
@@ -419,7 +414,6 @@ class TRAModel(Model):
return best_score
def fit(self, dataset, evals_result=dict()):
-
assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"
train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"])
@@ -503,7 +497,6 @@ class TRAModel(Model):
json.dump(info, f)
def predict(self, dataset, segment="test"):
-
assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"
if not self.fitted:
@@ -571,7 +564,6 @@ class RNN(nn.Module):
self.output_size = hidden_size
def forward(self, x):
-
if self.input_proj is not None:
x = self.input_proj(x)
@@ -647,7 +639,6 @@ class Transformer(nn.Module):
self.output_size = hidden_size
def forward(self, x):
-
x = x.permute(1, 0, 2).contiguous() # the first dim need to be time
x = self.pe(x)
@@ -713,7 +704,6 @@ class TRA(nn.Module):
child.reset_parameters()
def forward(self, hidden, hist_loss):
-
preds = self.predictors(hidden)
if self.num_states == 1: # no need for router when having only one prediction
diff --git a/qlib/contrib/model/pytorch_transformer.py b/qlib/contrib/model/pytorch_transformer.py
index 66e5b2c4e..f4b7a06eb 100644
--- a/qlib/contrib/model/pytorch_transformer.py
+++ b/qlib/contrib/model/pytorch_transformer.py
@@ -45,7 +45,6 @@ class TransformerModel(Model):
seed=None,
**kwargs
):
-
# set hyper-parameters.
self.d_model = d_model
self.dropout = dropout
@@ -95,7 +94,6 @@ class TransformerModel(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -104,7 +102,6 @@ class TransformerModel(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
-
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
@@ -114,7 +111,6 @@ class TransformerModel(Model):
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -130,7 +126,6 @@ class TransformerModel(Model):
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
-
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
@@ -143,7 +138,6 @@ class TransformerModel(Model):
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
-
if len(indices) - i < self.batch_size:
break
@@ -166,7 +160,6 @@ class TransformerModel(Model):
evals_result=dict(),
save_path=None,
):
-
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
@@ -231,7 +224,6 @@ class TransformerModel(Model):
preds = []
for begin in range(sample_num)[:: self.batch_size]:
-
if sample_num - begin < self.batch_size:
end = sample_num
else:
diff --git a/qlib/contrib/model/pytorch_transformer_ts.py b/qlib/contrib/model/pytorch_transformer_ts.py
index 6cffded9c..84b093805 100644
--- a/qlib/contrib/model/pytorch_transformer_ts.py
+++ b/qlib/contrib/model/pytorch_transformer_ts.py
@@ -43,7 +43,6 @@ class TransformerModel(Model):
seed=None,
**kwargs
):
-
# set hyper-parameters.
self.d_model = d_model
self.dropout = dropout
@@ -93,7 +92,6 @@ class TransformerModel(Model):
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
-
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
@@ -102,7 +100,6 @@ class TransformerModel(Model):
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, data_loader):
-
self.model.train()
for data in data_loader:
@@ -118,14 +115,12 @@ class TransformerModel(Model):
self.train_optimizer.step()
def test_epoch(self, data_loader):
-
self.model.eval()
scores = []
losses = []
for data in data_loader:
-
feature = data[:, :, 0:-1].to(self.device)
label = data[:, -1, -1].to(self.device)
@@ -145,7 +140,6 @@ class TransformerModel(Model):
evals_result=dict(),
save_path=None,
):
-
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index d38655ebd..67bedafa8 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -30,7 +30,6 @@ class XGBModel(Model, FeatureInt):
reweighter=None,
**kwargs
):
-
df_train, df_valid = dataset.prepare(
["train", "valid"],
col_set=["feature", "label"],
diff --git a/qlib/contrib/report/data/ana.py b/qlib/contrib/report/data/ana.py
index 782a92d5a..567ef311d 100644
--- a/qlib/contrib/report/data/ana.py
+++ b/qlib/contrib/report/data/ana.py
@@ -30,7 +30,6 @@ class CombFeaAna(FeaAnalyser):
"""The statistics of features are finished in the underlying analysers"""
def plot_all(self, *args, **kwargs):
-
ax_gen = iter(sub_fig_generator(row_n=len(self._fea_ana_l), *args, **kwargs))
for col in self._dataset:
diff --git a/qlib/contrib/report/data/base.py b/qlib/contrib/report/data/base.py
index 1e7e092af..a91eda48e 100644
--- a/qlib/contrib/report/data/base.py
+++ b/qlib/contrib/report/data/base.py
@@ -28,7 +28,6 @@ class FeaAnalyser:
return False
def plot_all(self, *args, **kwargs):
-
ax_gen = iter(sub_fig_generator(*args, **kwargs))
for col in self._dataset:
if not self.skip(col):
diff --git a/qlib/contrib/report/graph.py b/qlib/contrib/report/graph.py
index c5f932978..f9cf517ea 100644
--- a/qlib/contrib/report/graph.py
+++ b/qlib/contrib/report/graph.py
@@ -15,7 +15,6 @@ from plotly.figure_factory import create_distplot
class BaseGraph:
-
_name = None
def __init__(
diff --git a/qlib/contrib/rolling/__init__.py b/qlib/contrib/rolling/__init__.py
new file mode 100644
index 000000000..b940486fd
--- /dev/null
+++ b/qlib/contrib/rolling/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+The difference between this module and the scripts in examples/benchmarks/benchmarks_dynamic:
+- This module only focuses on providing a general rolling implementation.
+ Anything specific to a benchmark is placed in examples/benchmarks/benchmarks_dynamic.
+"""
diff --git a/qlib/contrib/rolling/__main__.py b/qlib/contrib/rolling/__main__.py
new file mode 100644
index 000000000..461c0e777
--- /dev/null
+++ b/qlib/contrib/rolling/__main__.py
@@ -0,0 +1,16 @@
+import fire
+from qlib import auto_init
+from qlib.contrib.rolling.base import Rolling
+from qlib.utils.mod import find_all_classes
+
+if __name__ == "__main__":
+    # Map the last component of each class's module name to the class itself, for every
+    # class returned by `find_all_classes("qlib.contrib.rolling", Rolling)`.
+    # The mapping looks like {'base': <class 'qlib.contrib.rolling.base.Rolling'>, ...},
+    # so each module name becomes a fire sub-command, e.g.
+    # - `python -m qlib.contrib.rolling base --conf_path <path to the yaml config> run`
+    # - `base` can be replaced with other module names
+    sub_commands = {
+        cls.__module__.split(".")[-1]: cls for cls in find_all_classes("qlib.contrib.rolling", Rolling)
+    }
+    auto_init()
+    fire.Fire(sub_commands)
diff --git a/qlib/contrib/rolling/base.py b/qlib/contrib/rolling/base.py
new file mode 100644
index 000000000..d179efb38
--- /dev/null
+++ b/qlib/contrib/rolling/base.py
@@ -0,0 +1,246 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from copy import deepcopy
+from pathlib import Path
+from typing import List, Optional, Union
+
+import fire
+import pandas as pd
+import yaml
+
+from qlib import auto_init
+from qlib.log import get_module_logger
+from qlib.model.ens.ensemble import RollingEnsemble
+from qlib.model.trainer import TrainerR
+from qlib.utils import get_cls_kwargs, init_instance_by_config
+from qlib.utils.data import update_config
+from qlib.workflow import R
+from qlib.workflow.record_temp import SignalRecord
+from qlib.workflow.task.collect import RecorderCollector
+from qlib.workflow.task.gen import RollingGen, task_generator
+from qlib.workflow.task.utils import replace_task_handler_with_cache
+
+
+class Rolling:
+ """
+ The motivation of Rolling Module
+ - It only focuses on turning a specific task into rolling tasks **offline**.
+ - To make the implementation easier, the following factors are ignored.
+ - The tasks are dependent (e.g. time series).
+
+ Related modules and difference from me:
+ - MetaController: It is learning how to handle a task (e.g. learning to learn).
+ - But rolling is about how to split a single task into tasks in time series and run them.
+ - OnlineStrategy: It is focusing on serving a model, the model can be updated time dependently in time.
+ - Rolling is much simpler and is only for testing rolling models offline. It does not want to share the interface with OnlineStrategy.
+
+ The code about rolling is shared in `task_generator` & `RollingGen` level between me and the above modules
+ But it is for different purpose, so other parts are not shared.
+
+
+ .. code-block:: shell
+
+ # here is a typical use case of the module.
+ python -m qlib.contrib.rolling.base --conf_path <path to the yaml config> run
+
+ **NOTE**
+ Before running the example, please clean your previous results with the following command:
+ - `rm -r mlruns`
+ - This is needed because it is very hard to permanently delete an experiment (it is moved into .trash, and creating an experiment with the same name then raises an error).
+
+ """
+
+    def __init__(
+        self,
+        conf_path: Union[str, Path],
+        exp_name: Optional[str] = None,
+        horizon: Optional[int] = 20,
+        step: int = 20,
+        h_path: Optional[str] = None,
+        train_start: Optional[str] = None,
+        test_end: Optional[str] = None,
+        task_ext_conf: Optional[dict] = None,
+        rolling_exp: Optional[str] = None,
+    ) -> None:
+        """
+        Record the rolling settings. The constructor only stores attributes (and may log a warning).
+
+        Parameters
+        ----------
+        conf_path : Union[str, Path]
+            Path to the config for rolling.
+        exp_name : Optional[str]
+            The experiment name of the output. The output is a record which contains the
+            concatenated predictions of the rolling records.
+        horizon : Optional[int]
+            The horizon of the prediction target; it overrides the prediction horizon of the file.
+            `None` is rejected, because extracting the horizon from the dataset is not supported.
+        step : int
+            Passed as `step` to `RollingGen` when the rolling tasks are generated.
+        h_path : Optional[str]
+            Path of a dumped data handler; it may come from another data source.
+            When given, it overrides the data handler in the config.
+        train_start : Optional[str]
+            The train start for the data. It is typically used together with the handler.
+            The same can be done with `task_ext_conf` in a more complicated way.
+        test_end : Optional[str]
+            The test end for the data. It is typically used together with the handler.
+            The same can be done with `task_ext_conf` in a more complicated way.
+        task_ext_conf : Optional[dict]
+            Extra options used to update the task config.
+        rolling_exp : Optional[str]
+            The name of the experiment that holds the rolling records; each record corresponds
+            to one specific rolling task. It is different from the final experiment (`exp_name`).
+            When omitted, a name with a timestamp suffix is generated.
+        """
+        self.logger = get_module_logger("Rolling")
+        self.conf_path = Path(conf_path)
+        assert horizon is not None, "Current version does not support extracting horizon from the underlying dataset"
+
+        self.exp_name = exp_name
+        self._rid = None  # the final combined recorder id in `exp_name`
+        self.step, self.horizon = step, horizon
+        self.h_path = h_path
+        self.train_start, self.test_end = train_start, test_end
+        self.task_ext_conf = task_ext_conf
+
+        if rolling_exp is not None:
+            # A caller-chosen name can collide with an experiment left over from a previous run.
+            self.rolling_exp = rolling_exp
+            self.logger.warning(
+                "Using user specifiied name for rolling models. So the experiment names duplicateds. "
+                "Please manually remove your experiment for rolling model with command like `rm -r mlruns`."
+                " Otherwise it will prevents the creating of experimen with same name"
+            )
+        else:
+            # Generate a name with a second-resolution timestamp suffix.
+            self.rolling_exp = "rolling_models_" + pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
+
+        # FIXME:
+        # - the qlib_init section of the config is ignored by this class.
+        # - So we have to design a priority mechanism to solve this issue.
+
+    def _raw_conf(self) -> dict:
+        """Parse the YAML file at `self.conf_path` and return the result; the file is re-read on every call."""
+        with self.conf_path.open("r") as conf_file:
+            conf = yaml.safe_load(conf_file)
+        return conf
+
+    def _replace_hanler_with_cache(self, task: dict):
+        """
+        Replace the handler entry of the task, because the data processing in rolling is slow.
+
+        - If `self.h_path` is given, the handler entry becomes the `file://` URI of that path
+          (the task is modified in place).
+        - Otherwise the task is handed to `replace_task_handler_with_cache`, with the directory
+          of the config file as its second argument.
+
+        Returns the task to use afterwards.
+        """
+        if self.h_path is None:
+            return replace_task_handler_with_cache(task, self.conf_path.parent)
+        task["dataset"]["kwargs"]["handler"] = f"file://{Path(self.h_path)}"
+        return task
+
+    def _update_start_end_time(self, task: dict):
+        """
+        Apply `self.train_start` / `self.test_end` to the task's segments, in place.
+
+        Only the start of the `train` segment and the end of the `test` segment are replaced
+        (by `pd.Timestamp` values); a bound whose setting is `None` is left untouched, and the
+        segments are not even looked up in that case. Returns the same task object.
+        """
+        if self.train_start is not None:
+            segments = task["dataset"]["kwargs"]["segments"]
+            old_train = segments["train"]
+            segments["train"] = (pd.Timestamp(self.train_start), old_train[1])
+
+        if self.test_end is not None:
+            segments = task["dataset"]["kwargs"]["segments"]
+            old_test = segments["test"]
+            segments["test"] = (old_test[0], pd.Timestamp(self.test_end))
+        return task
+
+    def basic_task(self, enable_handler_cache: Optional[bool] = True):
+        """
+        Build the task template that the rolling tasks are generated from.
+
+        The result may differ from the `task` section of the config at `conf_path`, because
+        - the label, the handler, the segment boundaries and `task_ext_conf` given to
+          `__init__` override the configured values;
+        - a subclass may override this method to change the template.
+
+        Parameters
+        ----------
+        enable_handler_cache : Optional[bool]
+            When truthy, the handler entry is rewritten by `_replace_hanler_with_cache`.
+
+        Returns
+        -------
+        dict
+            The task after all overrides; it is also written to the log.
+        """
+        # Work on a deep copy, so the parsed config itself is never modified.
+        task: dict = deepcopy(self._raw_conf()["task"])
+
+        # Modify the dataset horizon.
+        # NOTE: this assumes the label can be set through the handler's kwargs. That is not
+        # always valid; per the original author it only holds for the predefined datasets
+        # `Alpha158` & `Alpha360`.
+        if self.horizon is None:
+            # TODO: get the horizon automatically from the label expression.
+            raise NotImplementedError("This type of input is not supported")
+        self.logger.info("The prediction horizon is overrided")
+        task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
+            f"Ref($close, -{self.horizon + 1}) / Ref($close, -1) - 1"
+        ]
+
+        if enable_handler_cache:
+            task = self._replace_hanler_with_cache(task)
+        task = self._update_start_end_time(task)
+
+        if self.task_ext_conf is not None:
+            task = update_config(task, self.task_ext_conf)
+        self.logger.info(task)
+        return task
+
+    def get_task_list(self) -> List[dict]:
+        """
+        Return a batch of tasks for rolling.
+
+        The basic task is expanded by `task_generator` with a `RollingGen` that uses
+        `step=self.step` and `trunc_days=self.horizon + 1`; per the original author the
+        truncation is there to avoid information leakage.
+        """
+        rolling_gen = RollingGen(step=self.step, trunc_days=self.horizon + 1)
+        task_l = task_generator(self.basic_task(), rolling_gen)
+        for rolling_task in task_l:
+            # While rolling, only the signal is recorded.
+            # Any further analysis is postponed to the final ensemble.
+            rolling_task["record"] = ["qlib.workflow.record_temp.SignalRecord"]
+        return task_l
+
+    def _train_rolling_tasks(self):
+        """
+        Train all rolling tasks inside the experiment named `self.rolling_exp`.
+
+        The task list is built first. Then the existing experiment with that name is deleted;
+        a `ValueError` from the deletion is taken to mean there were no previous results.
+        """
+        rolling_tasks = self.get_task_list()
+
+        self.logger.info("Deleting previous Rolling results")
+        try:
+            # TODO: mlflow does not support permanently deleting an experiment;
+            # it is moved to .trash and prevents creating an experiment with the same name.
+            R.delete_exp(experiment_name=self.rolling_exp)  # the rolling experiment should be removed
+        except ValueError:
+            self.logger.info("No previous rolling results")
+
+        TrainerR(experiment_name=self.rolling_exp)(rolling_tasks)
+
+    def _ens_rolling(self):
+        """
+        Combine the artifacts of the rolling records and store them in one new record.
+
+        `pred.pkl` and `label.pkl` are collected from the experiment `self.rolling_exp` and
+        processed with `RollingEnsemble`. The results are saved under the same two file names
+        in a record of the experiment `self.exp_name`, whose id is kept in `self._rid`.
+        """
+        artifact_keys = ("pred", "label")
+        collector = RecorderCollector(
+            experiment=self.rolling_exp,
+            artifacts_key=list(artifact_keys),
+            process_list=[RollingEnsemble()],
+            # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]),
+            artifacts_path={key: f"{key}.pkl" for key in artifact_keys},
+        )
+        combined = collector()
+
+        with R.start(experiment_name=self.exp_name):
+            # Log which rolling experiment the combined record was built from.
+            R.log_params(exp_name=self.rolling_exp)
+            R.save_objects(**{"pred.pkl": combined["pred"], "label.pkl": combined["label"]})
+            self._rid = R.get_recorder().id
+
+    def _update_rolling_rec(self):
+        """
+        Evaluate the combined rolling results.
+
+        Every entry in the `record` section of the configured task is instantiated against the
+        combined recorder (`self._rid` in `self.exp_name`) and generated, except entries whose
+        class is a subclass of `SignalRecord`.
+        """
+        rec = R.get_recorder(experiment_name=self.exp_name, recorder_id=self._rid)
+
+        # Follow the analysers listed in the original config.
+        records = self._raw_conf()["task"].get("record", [])
+        if isinstance(records, dict):
+            # A single record may be written as one dict instead of a list.
+            records = [records]
+
+        for record_conf in records:
+            record_cls = get_cls_kwargs(record_conf)[0]
+            if not issubclass(record_cls, SignalRecord):
+                # The signal record is skipped; everything else is generated.
+                init_instance_by_config(
+                    record_conf,
+                    recorder=rec,
+                    default_module="qlib.workflow.record_temp",
+                ).generate()
+        print(f"Your evaluation results can be found in the experiment named `{self.exp_name}`.")
+
+    def run(self):
+        """
+        Run the whole rolling workflow; the results are saved in mlruns.
+
+        Stages, in order:
+        1) train every rolling task (saved in the rolling-models experiment);
+        2) combine the rolling results into one record;
+        3) evaluate the combined record.
+        """
+        for stage in (self._train_rolling_tasks, self._ens_rolling, self._update_rolling_rec):
+            stage()
+
+
+if __name__ == "__main__":
+ # When executed as a script: call qlib's `auto_init()`, then expose `Rolling` as a fire CLI.
+ auto_init()
+ fire.Fire(Rolling)
diff --git a/qlib/contrib/rolling/ddgda.py b/qlib/contrib/rolling/ddgda.py
new file mode 100644
index 000000000..25fb4c36e
--- /dev/null
+++ b/qlib/contrib/rolling/ddgda.py
@@ -0,0 +1,343 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from pathlib import Path
+import pickle
+from typing import Optional, Union
+
+import pandas as pd
+import yaml
+
+from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS
+from qlib.contrib.meta.data_selection.model import MetaModelDS
+from qlib.data.dataset.handler import DataHandlerLP
+from qlib.model.meta.task import MetaTask
+from qlib.model.trainer import TrainerR
+from qlib.typehint import Literal
+from qlib.utils import init_instance_by_config
+from qlib.workflow import R
+from qlib.workflow.task.utils import replace_task_handler_with_cache
+
+from .base import Rolling
+
+# Model/processor templates used by `DDGDA._adjust_task`.
+# They are written as YAML and parsed once at import time. The literals only hold plain
+# scalars, mappings and lists, so `yaml.safe_load` is sufficient.
+
+# LGBM is designed for feature importance & similarity
+LGBM_MODEL = """
+class: LGBModel
+module_path: qlib.contrib.model.gbdt
+kwargs:
+    loss: mse
+    colsample_bytree: 0.8879
+    learning_rate: 0.2
+    subsample: 0.8789
+    lambda_l1: 205.6999
+    lambda_l2: 580.9768
+    max_depth: 8
+    num_leaves: 210
+    num_threads: 20
+"""
+# convert the yaml to dict
+LGBM_MODEL = yaml.safe_load(LGBM_MODEL)
+
+LINEAR_MODEL = """
+class: LinearModel
+module_path: qlib.contrib.model.linear
+kwargs:
+    estimator: ridge
+    alpha: 0.05
+"""
+LINEAR_MODEL = yaml.safe_load(LINEAR_MODEL)
+
+# Handler kwargs (`infer_processors` / `learn_processors`) merged into the handler for the
+# linear model. Each list item is one processor; its `kwargs` must be nested under the item.
+PROC_ARGS = """
+infer_processors:
+    - class: RobustZScoreNorm
+      kwargs:
+          fields_group: feature
+          clip_outlier: true
+    - class: Fillna
+      kwargs:
+          fields_group: feature
+learn_processors:
+    - class: DropnaLabel
+    - class: CSRankNorm
+      kwargs:
+          fields_group: label
+"""
+PROC_ARGS = yaml.safe_load(PROC_ARGS)
+
+# The model types accepted by `DDGDA` for its utility (similarity) model.
+UTIL_MODEL_TYPE = Literal["linear", "gbdt"]
+
+
+class DDGDA(Rolling):
+ """
+ It is a rolling based on DDG-DA
+
+ **NOTE**
+ before running the example, please clean your previous results with following command
+ - `rm -r mlruns`
+ """
+
+    def __init__(
+        self,
+        sim_task_model: UTIL_MODEL_TYPE = "gbdt",
+        meta_1st_train_end: Optional[str] = None,
+        alpha: float = 0.01,
+        working_dir: Optional[Union[str, Path]] = None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        sim_task_model: Literal["linear", "gbdt"] = "gbdt",
+            The model for calculating similarity between data.
+        meta_1st_train_end: Optional[str]
+            the datetime of training end of the first meta_task
+        alpha: float
+            Setting the L2 regularization for ridge
+            The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..)
+        working_dir: Optional[Union[str, Path]]
+            The directory where the intermediate files of this class are written.
+            It defaults to the directory that contains the config file (`conf_path`).
+        **kwargs
+            Forwarded unchanged to `Rolling.__init__`.
+        """
+        # NOTE:
+        # the horizon must match the meaning in the base task template
+        self.meta_exp_name = "DDG-DA"
+        self.sim_task_model: UTIL_MODEL_TYPE = sim_task_model  # The model to capture the distribution of data.
+        self.alpha = alpha
+        self.meta_1st_train_end = meta_1st_train_end
+        super().__init__(**kwargs)
+        self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir)
+        # Keep the proxy handler path absolute. Other methods build the location as
+        # `self.working_dir / self.proxy_hd`; with a relative `working_dir` that join would
+        # repeat the directory (`wd/wd/handler_proxy.pkl`). Joining onto an absolute path
+        # returns the absolute path itself, so those joins stay correct.
+        self.proxy_hd = (self.working_dir / "handler_proxy.pkl").absolute()
+
+    def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE):
+        """
+        Adapt a task (in place) to one of the utility model types and return it.
+
+        Some tasks are used for a special purpose. For example:
+        - GBDT for calculating feature importance
+        - Linear or GBDT for calculating similarity
+        - A (well processed) dataset that is aligned to Linear, for meta learning
+
+        Parameters
+        ----------
+        task : dict
+            The task config to adapt; it must have a `dataset` entry.
+        astype : Literal["linear", "gbdt"]
+            - "gbdt": use the LightGBM template and drop `infer_processors` / `learn_processors`
+              from the handler kwargs (only when the handler entry is a dict).
+            - "linear": use the linear template and merge `PROC_ARGS` into the handler kwargs.
+
+        Raises
+        ------
+        ValueError
+            If `astype` is neither "gbdt" nor "linear".
+        """
+        # Local import: `copy` is not imported at the top of this module.
+        from copy import deepcopy
+
+        # NOTE: here is just for aligning with previous implementation
+        # It is not necessary for the current implementation
+        handler = task["dataset"].setdefault("kwargs", {}).setdefault("handler", {})
+        if astype == "gbdt":
+            # Install a copy, so that later in-place edits of the task (e.g. changing the model
+            # kwargs) never modify the module-level template.
+            task["model"] = deepcopy(LGBM_MODEL)
+            if isinstance(handler, dict):
+                handler_kwargs = handler.setdefault("kwargs", {})
+                for k in ["infer_processors", "learn_processors"]:
+                    handler_kwargs.pop(k, None)
+        elif astype == "linear":
+            task["model"] = deepcopy(LINEAR_MODEL)
+            # The processor lists are copied for the same reason as the model templates.
+            handler["kwargs"].update(deepcopy(PROC_ARGS))
+        else:
+            raise ValueError(f"astype not supported: {astype}")
+        return task
+
+    def _get_feature_importance(self):
+        """
+        Fit the GBDT utility model on the basic task and return its feature importance.
+
+        Returns
+        -------
+        pd.Series
+            The importance values, indexed by the feature columns of the dataset.
+        """
+        # This must be LightGBM, because the feature importance is read from the model.
+        gbdt_task = self._adjust_task(self.basic_task(enable_handler_cache=False), astype="gbdt")
+        gbdt_task = replace_task_handler_with_cache(gbdt_task, self.working_dir)
+
+        with R.start(experiment_name="feature_importance"):
+            model = init_instance_by_config(gbdt_task["model"])
+            dataset = init_instance_by_config(gbdt_task["dataset"])
+            model.fit(dataset)
+
+        importance = model.get_feature_importance()
+        # The model trains lightgbm on numpy instead of a dataframe, so the keys of the
+        # importance are positional (the integer after the first "_"). The extra steps below
+        # map them back to the real column names.
+        feature_cols = dataset.prepare(
+            segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R
+        ).columns
+        named_importance = {}
+        for key, value in importance.to_dict().items():
+            named_importance[feature_cols[int(key.split("_")[1])]] = value
+
+        return pd.Series(named_importance)
+
+    def _dump_data_for_proxy_model(self):
+        """
+        Dump the dataset of the proxy forecasting model.
+
+        The meta model is trained upon the proxy forecasting model, and this dataset is the
+        input of that proxy model. Two files are written:
+        - `fea_label_df.pkl` in `working_dir`: label plus the selected, normalised features;
+        - the handler pickle at `self.working_dir / self.proxy_hd`, wrapping that dataframe.
+        """
+        topk = 30  # the number of most important features that are kept
+        col_selected = self._get_feature_importance().nlargest(topk)
+
+        # NOTE: adjusting to `self.sim_task_model` is just for aligning with the previous implementation.
+        task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model)
+        task = replace_task_handler_with_cache(task, self.working_dir)
+
+        dataset = init_instance_by_config(task["dataset"])
+        prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        feature_df, label_df = prep_ds["feature"], prep_ds["label"]
+
+        def _zscore(day_df):
+            # Standardise every column within one `datetime` group.
+            return (day_df - day_df.mean()).div(day_df.std())
+
+        feature_selected = (
+            feature_df.loc[:, col_selected.index].groupby("datetime", group_keys=False).apply(_zscore).fillna(0.0)
+        )
+
+        df_all = pd.concat(
+            {
+                "label": label_df.reindex(feature_selected.index),
+                "feature": feature_selected,
+            },
+            axis=1,
+        )
+        fea_label_path = self.working_dir / "fea_label_df.pkl"
+        df_all.to_pickle(fea_label_path)
+
+        # Dump the data in handler format as well, for aligning the interface.
+        handler = DataHandlerLP(
+            data_loader={
+                "class": "qlib.data.dataset.loader.StaticDataLoader",
+                "kwargs": {"config": fea_label_path},
+            }
+        )
+        handler.to_pickle(self.working_dir / self.proxy_hd, dump_all=True)
+
+    @property
+    def _internal_data_path(self):
+        """Path, inside `working_dir`, of the pickled internal data; the file name embeds `self.step`."""
+        file_name = f"internal_data_s{self.step}.pkl"
+        return self.working_dir / file_name
+
+    def _dump_meta_ipt(self):
+        """
+        Dump the input data of the meta model.
+
+        An `InternalData` object is built from the similarity task, set up with `TrainerR`,
+        and pickled to `self._internal_data_path`.
+        """
+        # According to the experiments, the choice of the model type is very important for good results.
+        sim_task = self._adjust_task(self.basic_task(enable_handler_cache=False), astype=self.sim_task_model)
+        sim_task = replace_task_handler_with_cache(sim_task, self.working_dir)
+
+        if self.sim_task_model == "gbdt":
+            # Fixed number of boosting rounds, with early stopping switched off.
+            model_kwargs = sim_task["model"].setdefault("kwargs", {})
+            model_kwargs["early_stopping_rounds"] = None
+            model_kwargs["num_boost_round"] = 150
+
+        internal_data = InternalData(sim_task, self.step, exp_name=f"data_sim_s{self.step}")
+        internal_data.setup(trainer=TrainerR)
+
+        with self._internal_data_path.open("wb") as out_file:
+            pickle.dump(internal_data, out_file)
+
+    def _train_meta_model(self, fill_method="max"):
+        """
+        Train a meta model based on a simplified proxy forecasting task, and log it.
+
+        Parameters
+        ----------
+        fill_method : str
+            Forwarded to `MetaDatasetDS` and logged with the other dataset settings.
+
+        The trained `MetaModelDS` is saved as the object `model` in the experiment
+        `self.meta_exp_name`.
+        """
+        # 1) Leverage the simplified proxy forecasting model to train the meta model.
+        #    Only the dataset part of the task is filled in here.
+        #    The train_start of the meta model does not necessarily align with the final rolling.
+        train_start = self.train_start if self.train_start is not None else "2008-01-01"
+        train_end = self.meta_1st_train_end if self.meta_1st_train_end is not None else "2010-12-31"
+        test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
+        proxy_handler_uri = f"file://{(self.working_dir / self.proxy_hd).absolute()}"
+        test_end = self.basic_task()["dataset"]["kwargs"]["segments"]["test"][1]
+        proxy_forecast_model_task = {
+            # "model": "qlib.contrib.model.linear.LinearModel",
+            "dataset": {
+                "class": "qlib.data.dataset.DatasetH",
+                "kwargs": {
+                    "handler": proxy_handler_uri,
+                    "segments": {
+                        "train": (train_start, train_end),
+                        "test": (test_start, test_end),
+                    },
+                },
+            },
+            # "record": ["qlib.workflow.record_temp.SignalRecord"]
+        }
+        # The proxy_forecast_model_task is used to create the meta tasks.
+        # With the default `train_end`, the test date of the first task is 2011-01-01.
+        # The tasks include all training tasks and test tasks.
+
+        # 2) Prepare the meta dataset.
+        hist_step_n = 30
+        kwargs = {
+            "task_tpl": proxy_forecast_model_task,
+            "step": self.step,
+            "segments": 0.62,  # keep the test period consistent with the dataset yaml
+            "trunc_days": 1 + self.horizon,
+            "hist_step_n": hist_step_n,
+            "fill_method": fill_method,
+            "rolling_ext_days": 0,
+        }
+        # NOTE:
+        # The input of the meta model (internal data) is shared between the proxy model and the
+        # final forecasting model, but their task test segments are not aligned. The original
+        # author reports that this worked in previous experiments, so the misalignment is not
+        # expected to affect the effectiveness of the method.
+        with self._internal_data_path.open("rb") as in_file:
+            internal_data = pickle.load(in_file)
+
+        meta_dataset = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) Train and log the meta model.
+        with R.start(experiment_name=self.meta_exp_name):
+            R.log_params(**kwargs)
+            meta_model = MetaModelDS(
+                step=self.step, hist_step_n=hist_step_n, lr=0.001, max_epoch=30, seed=43, alpha=self.alpha
+            )
+            meta_model.fit(meta_dataset)
+            R.save_objects(model=meta_model)
+
+    @property
+    def _task_path(self):
+        """Path, inside `working_dir`, of the pickled task list; the file name embeds `self.step`."""
+        file_name = f"tasks_s{self.step}.pkl"
+        return self.working_dir / file_name
+
+    def get_task_list(self):
+        """
+        Leverage the meta model for inference and return the resulting tasks.
+
+        Given
+        - the baseline tasks (from `Rolling.get_task_list`),
+        - the input of the meta model (the pickled internal data),
+        - the meta model (its learnt knowledge on the proxy forecasting model is expected to
+          transfer to the normal forecasting model),
+        the meta model produces new tasks. They are pickled to `self._task_path` and returned.
+        """
+        # 1) Get the meta model from the first recorder listed for the meta experiment.
+        exp = R.get_exp(experiment_name=self.meta_exp_name)
+        rec = exp.list_recorders(rtype=exp.RT_L)[0]
+        meta_model: MetaModelDS = rec.load_object("model")
+
+        # 2) Transfer the knowledge of the meta model to the final forecasting tasks.
+        #    A meta dataset is created for the final forecasting tasks; its settings must be
+        #    aligned with the ones used when the meta model was trained.
+
+        # 2.1) Read the settings that were logged during training.
+        param = rec.list_params()
+        trunc_days, step, hist_step_n = (int(param[key]) for key in ("trunc_days", "step", "hist_step_n"))
+        fill_method = param.get("fill_method", "max")
+
+        task_l = super().get_task_list()
+
+        # 2.2) Create the meta dataset for the final tasks.
+        kwargs = {
+            "task_tpl": task_l,
+            "step": step,
+            "segments": 0.0,  # all the tasks are for testing
+            "trunc_days": trunc_days,
+            "hist_step_n": hist_step_n,
+            "fill_method": fill_method,
+            "task_mode": MetaTask.PROC_MODE_TRANSFER,
+        }
+
+        with self._internal_data_path.open("rb") as in_file:
+            internal_data = pickle.load(in_file)
+        meta_dataset = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) Let the meta model make inference to get the new qlib tasks, and keep a copy on disk.
+        new_tasks = meta_model.inference(meta_dataset)
+        with self._task_path.open("wb") as out_file:
+            pickle.dump(new_tasks, out_file)
+        return new_tasks
+
+    def run(self):
+        """
+        Prepare the meta model, then run the rolling of the base class.
+
+        Preparation stages, in order:
+        1) `_dump_data_for_proxy_model`: writes the proxy handler file (`self.proxy_hd`);
+        2) `_dump_meta_ipt`: writes the internal data file (`internal_data_s<step>.pkl`) and
+           trains the models for calculating the meta input (experiment `data_sim_s<step>`);
+        3) `_train_meta_model`: the meta model is stored in the experiment `DDG-DA`.
+
+        Rolling:
+        4) the meta inference is done when `get_task_list` is called by the base class;
+           the new tasks are saved in `tasks_s<step>.pkl`;
+        5) the tasks are trained by `Rolling.run`.
+        """
+        for prepare_stage in (self._dump_data_for_proxy_model, self._dump_meta_ipt, self._train_meta_model):
+            prepare_stage()
+
+        super().run()
diff --git a/qlib/contrib/strategy/optimizer/optimizer.py b/qlib/contrib/strategy/optimizer/optimizer.py
index a70929e27..a5fb76312 100644
--- a/qlib/contrib/strategy/optimizer/optimizer.py
+++ b/qlib/contrib/strategy/optimizer/optimizer.py
@@ -112,7 +112,6 @@ class PortfolioOptimizer(BaseOptimizer):
return w
def _optimize(self, S: np.ndarray, r: Optional[np.ndarray] = None, w0: Optional[np.ndarray] = None) -> np.ndarray:
-
# inverse volatility
if self.method == self.OPT_INV:
if r is not None:
diff --git a/qlib/contrib/strategy/rule_strategy.py b/qlib/contrib/strategy/rule_strategy.py
index 30facf3a3..f2b919739 100644
--- a/qlib/contrib/strategy/rule_strategy.py
+++ b/qlib/contrib/strategy/rule_strategy.py
@@ -522,7 +522,6 @@ class ACStrategy(BaseStrategy):
_order_amount = min(_order_amount, self.trade_amount[order.stock_id])
if _order_amount > 1e-5:
-
_order = Order(
stock_id=order.stock_id,
amount=_order_amount,
diff --git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py
index cb94017cd..9ba960eeb 100644
--- a/qlib/contrib/strategy/signal_strategy.py
+++ b/qlib/contrib/strategy/signal_strategy.py
@@ -435,7 +435,6 @@ class EnhancedIndexingStrategy(WeightStrategyBase):
self._riskdata_cache = {}
def get_risk_data(self, date):
-
if date in self._riskdata_cache:
return self._riskdata_cache[date]
@@ -462,7 +461,6 @@ class EnhancedIndexingStrategy(WeightStrategyBase):
return self._riskdata_cache[date]
def generate_target_weight_position(self, score, current, trade_start_time, trade_end_time):
-
trade_date = trade_start_time
pre_date = get_pre_trading_date(trade_date, future=True) # previous trade date
diff --git a/qlib/contrib/tuner/config.py b/qlib/contrib/tuner/config.py
index 6e37f0097..7a8534a20 100644
--- a/qlib/contrib/tuner/config.py
+++ b/qlib/contrib/tuner/config.py
@@ -11,7 +11,6 @@ import os
class TunerConfigManager:
def __init__(self, config_path):
-
if not config_path:
raise ValueError("Config path is invalid.")
self.config_path = config_path
@@ -58,7 +57,6 @@ class PipelineExperimentConfig:
class OptimizationConfig:
def __init__(self, config, TUNER_CONFIG_MANAGER):
-
self.report_type = config.get("report_type", "pred_long")
if self.report_type not in [
"pred_long",
diff --git a/qlib/contrib/tuner/pipeline.py b/qlib/contrib/tuner/pipeline.py
index db48c46cf..34977fa55 100644
--- a/qlib/contrib/tuner/pipeline.py
+++ b/qlib/contrib/tuner/pipeline.py
@@ -15,11 +15,9 @@ from ...utils import get_module_by_module_path
class Pipeline:
-
GLOBAL_BEST_PARAMS_NAME = "global_best_params.json"
def __init__(self, tuner_config_manager):
-
self.logger = get_module_logger("Pipeline", sh_level=logging.INFO)
self.tuner_config_manager = tuner_config_manager
@@ -37,7 +35,6 @@ class Pipeline:
self.best_tuner_index = None
def run(self):
-
TimeInspector.set_time_mark()
for tuner_index, tuner_config in enumerate(self.pipeline_config):
tuner = self.init_tuner(tuner_index, tuner_config)
@@ -77,7 +74,6 @@ class Pipeline:
return tuner_class(tuner_config, self.optim_config)
def save_tuner_exp_info(self):
-
TimeInspector.set_time_mark()
save_path = os.path.join(self.pipeline_ex_config.tuner_ex_dir, Pipeline.GLOBAL_BEST_PARAMS_NAME)
with open(save_path, "w") as fp:
diff --git a/qlib/contrib/tuner/tuner.py b/qlib/contrib/tuner/tuner.py
index c183b28ae..7705ce8b7 100644
--- a/qlib/contrib/tuner/tuner.py
+++ b/qlib/contrib/tuner/tuner.py
@@ -24,7 +24,6 @@ from hyperopt import STATUS_OK, STATUS_FAIL
class Tuner:
def __init__(self, tuner_config, optim_config):
-
self.logger = get_module_logger("Tuner", sh_level=logging.INFO)
self.tuner_config = tuner_config
@@ -42,7 +41,6 @@ class Tuner:
self.space = self.setup_space()
def tune(self):
-
TimeInspector.set_time_mark()
fmin(
fn=self.objective,
@@ -84,7 +82,6 @@ class Tuner:
class QLibTuner(Tuner):
-
ESTIMATOR_CONFIG_NAME = "estimator_config.yaml"
EXP_INFO_NAME = "exp_info.json"
EXP_RESULT_DIR = "sacred/{}"
@@ -92,7 +89,6 @@ class QLibTuner(Tuner):
LOCAL_BEST_PARAMS_NAME = "local_best_params.json"
def objective(self, params):
-
# 1. Setup an config for a specific estimator process
estimator_path = self.setup_estimator_config(params)
self.logger.info("Searching params: {} ".format(params))
@@ -120,7 +116,6 @@ class QLibTuner(Tuner):
return {"loss": res, "status": status}
def fetch_result(self):
-
# 1. Get experiment information
exp_info_path = os.path.join(self.ex_dir, QLibTuner.EXP_INFO_NAME)
with open(exp_info_path) as fp:
@@ -155,7 +150,6 @@ class QLibTuner(Tuner):
return np.abs(res.values[0] - 1)
def setup_estimator_config(self, params):
-
estimator_config = copy.deepcopy(self.tuner_config)
estimator_config["model"].update({"args": params["model_space"]})
estimator_config["strategy"].update({"args": params["strategy_space"]})
@@ -212,7 +206,6 @@ class QLibTuner(Tuner):
return space
def save_local_best_params(self):
-
TimeInspector.set_time_mark()
local_best_params_path = os.path.join(self.ex_dir, QLibTuner.LOCAL_BEST_PARAMS_NAME)
with open(local_best_params_path, "w") as fp:
diff --git a/qlib/data/cache.py b/qlib/data/cache.py
index addd28871..3264dcd02 100644
--- a/qlib/data/cache.py
+++ b/qlib/data/cache.py
@@ -583,7 +583,6 @@ class DiskExpressionCache(ExpressionCache):
r.tofile(str(cache_path))
def update(self, sid, cache_uri, freq: str = "day"):
-
cp_cache_uri = self.get_cache_dir(freq).joinpath(sid).joinpath(cache_uri)
meta_path = cp_cache_uri.with_suffix(".meta")
if not self.check_cache_exists(cp_cache_uri, suffix_list=[".meta"]):
@@ -696,7 +695,6 @@ class DiskDatasetCache(DatasetCache):
def _dataset(
self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[]
):
-
if disk_cache == 0:
# In this case, data_set cache is configured but will not be used.
return self.provider.dataset(
@@ -801,7 +799,6 @@ class DiskDatasetCache(DatasetCache):
KEY = "df"
def __init__(self, cache_path: Union[str, Path]):
-
self.index_path = cache_path.with_suffix(".index")
self._data = None
self.logger = get_module_logger(self.__class__.__name__)
@@ -1126,7 +1123,6 @@ class DatasetURICache(DatasetCache):
def dataset(
self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[]
):
-
if "local" in C.dataset_provider.lower():
# use LocalDatasetProvider
return self.provider.dataset(
@@ -1189,7 +1185,6 @@ class MemoryCalendarCache(CalendarCache):
uri = self._uri(start_time, end_time, freq, future)
result, expire = MemCacheExpire.get_cache(H["c"], uri)
if result is None or expire:
-
result = self.provider.calendar(start_time, end_time, freq, future)
MemCacheExpire.set_cache(H["c"], uri, result)
diff --git a/qlib/data/data.py b/qlib/data/data.py
index 809b8d1c3..116827f23 100644
--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -1096,7 +1096,6 @@ class ClientDatasetProvider(DatasetProvider):
else:
return data
else:
-
"""
Call the server to generate the data-set cache, get the uri of the cache file.
Then load the data from the file on NFS directly.
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
index f7204cf78..63acd937e 100644
--- a/qlib/data/dataset/processor.py
+++ b/qlib/data/dataset/processor.py
@@ -132,7 +132,6 @@ class FilterCol(Processor):
self.col_list = col_list
def __call__(self, df):
-
cols = get_group_columns(df, self.fields_group)
all_cols = df.columns
diff_cols = np.setdiff1d(all_cols.get_level_values(-1), cols.get_level_values(-1))
diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py
index 4761fb383..f19dfe08f 100644
--- a/qlib/data/dataset/utils.py
+++ b/qlib/data/dataset/utils.py
@@ -71,15 +71,11 @@ def fetch_df_by_index(
if fetch_orig:
for slc in idx_slc:
if slc != slice(None, None):
- return df.loc[
- pd.IndexSlice[idx_slc],
- ]
+ return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231
else: # pylint: disable=W0120
return df
else:
- return df.loc[
- pd.IndexSlice[idx_slc],
- ]
+ return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231
def fetch_df_by_col(df: pd.DataFrame, col_set: Union[str, List[str]]) -> pd.DataFrame:
diff --git a/qlib/data/pit.py b/qlib/data/pit.py
index 093b98cab..33d5e0c5c 100644
--- a/qlib/data/pit.py
+++ b/qlib/data/pit.py
@@ -22,7 +22,6 @@ from .data import Cal
class P(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
-
_calendar = Cal.calendar(freq=freq)
resample_data = np.empty(end_index - start_index + 1, dtype="float32")
diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py
index 288500c55..8a100a2d1 100644
--- a/qlib/data/storage/file_storage.py
+++ b/qlib/data/storage/file_storage.py
@@ -190,7 +190,6 @@ class FileCalendarStorage(FileStorageMixin, CalendarStorage):
class FileInstrumentStorage(FileStorageMixin, InstrumentStorage):
-
INSTRUMENT_SEP = "\t"
INSTRUMENT_START_FIELD = "start_datetime"
INSTRUMENT_END_FIELD = "end_datetime"
@@ -260,7 +259,6 @@ class FileInstrumentStorage(FileStorageMixin, InstrumentStorage):
return self._read_instrument()[k]
def update(self, *args, **kwargs) -> None:
-
if len(args) > 1:
raise TypeError(f"update expected at most 1 arguments, got {len(args)}")
inst = self._read_instrument()
@@ -358,7 +356,6 @@ class FileFeatureStorage(FileStorageMixin, FeatureStorage):
storage_end_index = self.end_index
with self.uri.open("rb") as fp:
if isinstance(i, int):
-
if storage_start_index > i:
raise IndexError(f"{i}: start index is {storage_start_index}")
fp.seek(4 * (i - storage_start_index) + 4)
diff --git a/qlib/log.py b/qlib/log.py
index 115abc137..f7683d511 100644
--- a/qlib/log.py
+++ b/qlib/log.py
@@ -84,7 +84,6 @@ get_module_logger = _QLibLoggerManager()
class TimeInspector:
-
timer_logger = get_module_logger("timer")
time_marks = []
diff --git a/qlib/model/riskmodel/poet.py b/qlib/model/riskmodel/poet.py
index 8946b2ac5..42388d84c 100644
--- a/qlib/model/riskmodel/poet.py
+++ b/qlib/model/riskmodel/poet.py
@@ -43,7 +43,6 @@ class POETCovEstimator(RiskModel):
self.thresh_method = thresh_method
def _predict(self, X: np.ndarray) -> np.ndarray:
-
Y = X.T # NOTE: to match POET's implementation
p, n = Y.shape
diff --git a/qlib/tests/__init__.py b/qlib/tests/__init__.py
index 52c924918..97ff00c57 100644
--- a/qlib/tests/__init__.py
+++ b/qlib/tests/__init__.py
@@ -14,7 +14,6 @@ from qlib.data.storage import CalendarStorage, InstrumentStorage, FeatureStorage
class TestAutoData(unittest.TestCase):
-
_setup_kwargs = {}
provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir
provider_uri_1day = "~/.qlib/qlib_data/cn_data" # target_dir
@@ -286,6 +285,5 @@ class TestMockData(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
-
provider_uri = "Not necessary."
init(region=REG_TW, provider_uri=provider_uri, expression_cache=None, dataset_cache=None, **cls._setup_kwargs)
diff --git a/qlib/tests/data.py b/qlib/tests/data.py
index 2163b4bf7..f6bd78090 100644
--- a/qlib/tests/data.py
+++ b/qlib/tests/data.py
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
+import os
import re
import sys
import qlib
@@ -11,13 +12,15 @@ import datetime
from tqdm import tqdm
from pathlib import Path
from loguru import logger
+from cryptography.fernet import Fernet
from qlib.utils import exists_qlib_data
class GetData:
- DATASET_VERSION = "v2"
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
- QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
+ # "?" is not included in the token.
+ TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
+ KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
def __init__(self, delete_zip_file=False):
"""
@@ -29,24 +32,44 @@ class GetData:
"""
self.delete_zip_file = delete_zip_file
- def normalize_dataset_version(self, dataset_version: str = None):
- if dataset_version is None:
- dataset_version = self.DATASET_VERSION
- return dataset_version
+ def merge_remote_url(self, file_name: str):
+ fernet = Fernet(self.KEY)
+ token = fernet.decrypt(self.TOKEN).decode()
+ return f"{self.REMOTE_URL}/{file_name}?{token}"
- def merge_remote_url(self, file_name: str, dataset_version: str = None):
- return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
+ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
+ """
+ Download the specified file to the target folder.
- def _download_data(
- self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
- ):
+ Parameters
+ ----------
+ target_dir: str
+ data save directory
+ file_name: str
+ dataset name, needs to end with .zip, value from [rl_data.zip, csv_data_cn.zip, ...]
+ may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip
+ delete_old: bool
+ delete an existing directory, by default True
+
+ Examples
+ ---------
+ # get rl data
+ python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data
+ When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token}
+
+ # get cn csv data
+ python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data
+ When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token}
+ -------
+
+ """
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
- _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
+ _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name)
target_path = target_dir.joinpath(_target_file_name)
- url = self.merge_remote_url(file_name, dataset_version)
+ url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
@@ -56,7 +79,7 @@ class GetData:
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
- logger.info(f"{file_name} downloading......")
+ logger.info(f"{os.path.basename(file_name)} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
@@ -67,8 +90,8 @@ class GetData:
if self.delete_zip_file:
target_path.unlink()
- def check_dataset(self, file_name: str, dataset_version: str = None):
- url = self.merge_remote_url(file_name, dataset_version)
+ def check_dataset(self, file_name: str):
+ url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
status = True
if resp.status_code == 404:
@@ -140,9 +163,11 @@ class GetData:
---------
# get 1d data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
+ When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token}
# get 1min data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn
+ When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token}
-------
"""
@@ -155,29 +180,12 @@ class GetData:
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
- def _get_file_name(v):
- return self.QLIB_DATA_NAME.format(
- dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
- )
+ def _get_file_name_with_version(qlib_version, dataset_version):
+ dataset_version = "v2" if dataset_version is None else dataset_version
+ file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
+ return file_name_with_version
- file_name = _get_file_name(qlib_version)
- if not self.check_dataset(file_name, version):
- file_name = _get_file_name("latest")
- self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
-
- def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
- """download cn csv data from remote
-
- Parameters
- ----------
- target_dir: str
- data save directory
-
- Examples
- ---------
- python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
- -------
-
- """
- file_name = "csv_data_cn.zip"
- self._download_data(file_name, target_dir)
+ file_name = _get_file_name_with_version(qlib_version, dataset_version=version)
+ if not self.check_dataset(file_name):
+ file_name = _get_file_name_with_version("latest", dataset_version=version)
+ self.download_data(file_name.lower(), target_dir, delete_old)
diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py
index 910a4c08b..9e63c104a 100644
--- a/qlib/utils/__init__.py
+++ b/qlib/utils/__init__.py
@@ -7,12 +7,9 @@ from __future__ import division
from __future__ import print_function
import os
-import pickle
import re
-import sys
import copy
import json
-from qlib.typehint import InstConf
import yaml
import redis
import bisect
@@ -22,15 +19,11 @@ import inspect
import hashlib
import datetime
import requests
-import importlib
-import contextlib
import collections
import numpy as np
import pandas as pd
from pathlib import Path
-from typing import List, Dict, Union, Tuple, Any, Optional, Callable
-from types import ModuleType
-from urllib.parse import urlparse
+from typing import List, Union, Optional, Callable
from packaging import version
from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer
from ..config import C
@@ -288,182 +281,6 @@ def parse_field(field):
return field
-def get_module_by_module_path(module_path: Union[str, ModuleType]):
- """Load module path
-
- :param module_path:
- :return:
- :raises: ModuleNotFoundError
- """
- if module_path is None:
- raise ModuleNotFoundError("None is passed in as parameters as module_path")
-
- if isinstance(module_path, ModuleType):
- module = module_path
- else:
- if module_path.endswith(".py"):
- module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_")))
- module_spec = importlib.util.spec_from_file_location(module_name, module_path)
- module = importlib.util.module_from_spec(module_spec)
- sys.modules[module_name] = module
- module_spec.loader.exec_module(module)
- else:
- module = importlib.import_module(module_path)
- return module
-
-
-def split_module_path(module_path: str) -> Tuple[str, str]:
- """
-
- Parameters
- ----------
- module_path : str
- e.g. "a.b.c.ClassName"
-
- Returns
- -------
- Tuple[str, str]
- e.g. ("a.b.c", "ClassName")
- """
- *m_path, cls = module_path.split(".")
- m_path = ".".join(m_path)
- return m_path, cls
-
-
-def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict):
- """
- extract class/func and kwargs from config info
-
- Parameters
- ----------
- config : [dict, str]
- similar to config
- please refer to the doc of init_instance_by_config
-
- default_module : Python module or str
- It should be a python module to load the class type
- This function will load class from the config['module_path'] first.
- If config['module_path'] doesn't exists, it will load the class from default_module.
-
- Returns
- -------
- (type, dict):
- the class/func object and it's arguments.
-
- Raises
- ------
- ModuleNotFoundError
- """
- if isinstance(config, dict):
- key = "class" if "class" in config else "func"
- if isinstance(config[key], str):
- # 1) get module and class
- # - case 1): "a.b.c.ClassName"
- # - case 2): {"class": "ClassName", "module_path": "a.b.c"}
- m_path, cls = split_module_path(config[key])
- if m_path == "":
- m_path = config.get("module_path", default_module)
- module = get_module_by_module_path(m_path)
-
- # 2) get callable
- _callable = getattr(module, cls) # may raise AttributeError
- else:
- _callable = config[key] # the class type itself is passed in
- kwargs = config.get("kwargs", {})
- elif isinstance(config, str):
- # a.b.c.ClassName
- m_path, cls = split_module_path(config)
- module = get_module_by_module_path(default_module if m_path == "" else m_path)
-
- _callable = getattr(module, cls)
- kwargs = {}
- else:
- raise NotImplementedError(f"This type of input is not supported")
- return _callable, kwargs
-
-
-get_cls_kwargs = get_callable_kwargs # NOTE: this is for compatibility for the previous version
-
-
-def init_instance_by_config(
- config: InstConf,
- default_module=None,
- accept_types: Union[type, Tuple[type]] = (),
- try_kwargs: Dict = {},
- **kwargs,
-) -> Any:
- """
- get initialized instance with config
-
- Parameters
- ----------
- config : InstConf
-
- default_module : Python module
- Optional. It should be a python module.
- NOTE: the "module_path" will be override by `module` arguments
-
- This function will load class from the config['module_path'] first.
- If config['module_path'] doesn't exists, it will load the class from default_module.
-
- accept_types: Union[type, Tuple[type]]
- Optional. If the config is a instance of specific type, return the config directly.
- This will be passed into the second parameter of isinstance.
-
- try_kwargs: Dict
- Try to pass in kwargs in `try_kwargs` when initialized the instance
- If error occurred, it will fail back to initialization without try_kwargs.
-
- Returns
- -------
- object:
- An initialized object based on the config info
- """
- if isinstance(config, accept_types):
- return config
-
- if isinstance(config, (str, Path)):
- if isinstance(config, str):
- # path like 'file:////obj.pkl'
- pr = urlparse(config)
- if pr.scheme == "file":
- pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc
- with open(os.path.normpath(pr_path), "rb") as f:
- return pickle.load(f)
- else:
- with config.open("rb") as f:
- return pickle.load(f)
-
- klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module)
-
- try:
- return klass(**cls_kwargs, **try_kwargs, **kwargs)
- except (TypeError,):
- # TypeError for handling errors like
- # 1: `XXX() got multiple values for keyword argument 'YYY'`
- # 2: `XXX() got an unexpected keyword argument 'YYY'
- return klass(**cls_kwargs, **kwargs)
-
-
-@contextlib.contextmanager
-def class_casting(obj: object, cls: type):
- """
- Python doesn't provide the downcasting mechanism.
- We use the trick here to downcast the class
-
- Parameters
- ----------
- obj : object
- the object to be cast
- cls : type
- the target class type
- """
- orig_cls = obj.__class__
- obj.__class__ = cls
- yield
- obj.__class__ = orig_cls
-
-
def compare_dict_value(src_data: dict, dst_data: dict):
"""Compare dict value
@@ -744,7 +561,6 @@ def exists_qlib_data(qlib_dir):
return False
# check calendar bin
for _calendar in calendars_dir.iterdir():
-
if ("_future" not in _calendar.name) and (
not list(features_dir.rglob(f"*.{_calendar.name.split('.')[0]}.bin"))
):
@@ -872,9 +688,9 @@ def get_item_from_obj(config: dict, name_path: str) -> object:
cur_cfg = config
for k in name_path.split("."):
if isinstance(cur_cfg, dict):
- cur_cfg = cur_cfg[k]
+ cur_cfg = cur_cfg[k] # may raise KeyError
elif k.isdigit():
- cur_cfg = cur_cfg[int(k)]
+ cur_cfg = cur_cfg[int(k)] # may raise IndexError
else:
raise ValueError(f"Error when getting {k} from cur_cfg")
return cur_cfg
@@ -910,6 +726,21 @@ def fill_placeholder(config: dict, config_extend: dict):
top = 0
tail = 1
item_queue = [config]
+
+ def try_replace_placeholder(value):
+ if value in config_extend.keys():
+ value = config_extend[value]
+ else:
+ m = re.match(r"<(?P<name_path>[^<>]+)>", value)
+ if m is not None:
+ try:
+ value = get_item_from_obj(config, m.groupdict()["name_path"])
+ except (KeyError, ValueError, IndexError):
+ get_module_logger("fill_placeholder").info(
+ f"{value} lookes like a placeholder, but it can't match to any given values"
+ )
+ return value
+
while top < tail:
now_item = item_queue[top]
top += 1
@@ -917,17 +748,13 @@ def fill_placeholder(config: dict, config_extend: dict):
item_keys = range(len(now_item))
elif isinstance(now_item, dict):
item_keys = now_item.keys()
- for key in item_keys:
+ for key in item_keys: # noqa
if isinstance(now_item[key], (list, dict)):
item_queue.append(now_item[key])
tail += 1
elif isinstance(now_item[key], str):
- if now_item[key] in config_extend.keys():
- now_item[key] = config_extend[now_item[key]]
- else:
- m = re.match(r"<(?P<name_path>[^<>]+)>", now_item[key])
- if m is not None:
- now_item[key] = get_item_from_obj(config, m.groupdict()["name_path"])
+ # If it is a string, treat it as a possible placeholder and try to replace it with the value it refers to
+ now_item[key] = try_replace_placeholder(now_item[key])
return config
@@ -1049,6 +876,15 @@ def fname_to_code(fname: str):
return fname
+from .mod import (
+ get_module_by_module_path,
+ split_module_path,
+ get_callable_kwargs,
+ get_cls_kwargs,
+ init_instance_by_config,
+ class_casting,
+)
+
__all__ = [
"get_or_create_path",
"save_multiple_parts_file",
@@ -1056,4 +892,10 @@ __all__ = [
"get_tmp_file_with_buffer",
"set_log_with_config",
"init_instance_by_config",
+ "get_module_by_module_path",
+ "split_module_path",
+ "get_callable_kwargs",
+ "get_cls_kwargs",
+ "init_instance_by_config",
+ "class_casting",
]
diff --git a/qlib/utils/index_data.py b/qlib/utils/index_data.py
index b62bc02ce..113f9802d 100644
--- a/qlib/utils/index_data.py
+++ b/qlib/utils/index_data.py
@@ -351,7 +351,6 @@ class IndexData(metaclass=index_data_ops_creator):
loc_idx_cls = LocIndexer
def __init__(self, data: np.ndarray, *indices: Union[List, pd.Index, Index]):
-
self.data = data
self.indices = indices
diff --git a/qlib/utils/mod.py b/qlib/utils/mod.py
new file mode 100644
index 000000000..e53957260
--- /dev/null
+++ b/qlib/utils/mod.py
@@ -0,0 +1,235 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+All module related class, e.g. :
+- importing a module, class
+- walking a module
+- operations on class or module...
+"""
+
+import contextlib
+import importlib
+import os
+from pathlib import Path
+import pickle
+import pkgutil
+import re
+import sys
+from types import ModuleType
+from typing import Any, Dict, List, Tuple, Union
+from urllib.parse import urlparse
+
+from qlib.typehint import InstConf
+
+
+def get_module_by_module_path(module_path: Union[str, ModuleType]):
+ """Load module path
+
+ :param module_path:
+ :return:
+ :raises: ModuleNotFoundError
+ """
+ if module_path is None:
+ raise ModuleNotFoundError("None is passed in as parameters as module_path")
+
+ if isinstance(module_path, ModuleType):
+ module = module_path
+ else:
+ if module_path.endswith(".py"):
+ module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_")))
+ module_spec = importlib.util.spec_from_file_location(module_name, module_path)
+ module = importlib.util.module_from_spec(module_spec)
+ sys.modules[module_name] = module
+ module_spec.loader.exec_module(module)
+ else:
+ module = importlib.import_module(module_path)
+ return module
+
+
+def split_module_path(module_path: str) -> Tuple[str, str]:
+ """
+
+ Parameters
+ ----------
+ module_path : str
+ e.g. "a.b.c.ClassName"
+
+ Returns
+ -------
+ Tuple[str, str]
+ e.g. ("a.b.c", "ClassName")
+ """
+ *m_path, cls = module_path.split(".")
+ m_path = ".".join(m_path)
+ return m_path, cls
+
+
+def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict):
+ """
+ extract class/func and kwargs from config info
+
+ Parameters
+ ----------
+ config : [dict, str]
+ similar to config
+ please refer to the doc of init_instance_by_config
+
+ default_module : Python module or str
+ It should be a python module to load the class type
+ This function will load class from the config['module_path'] first.
+ If config['module_path'] doesn't exist, it will load the class from default_module.
+
+ Returns
+ -------
+ (type, dict):
+ the class/func object and its arguments.
+
+ Raises
+ ------
+ ModuleNotFoundError
+ """
+ if isinstance(config, dict):
+ key = "class" if "class" in config else "func"
+ if isinstance(config[key], str):
+ # 1) get module and class
+ # - case 1): "a.b.c.ClassName"
+ # - case 2): {"class": "ClassName", "module_path": "a.b.c"}
+ m_path, cls = split_module_path(config[key])
+ if m_path == "":
+ m_path = config.get("module_path", default_module)
+ module = get_module_by_module_path(m_path)
+
+ # 2) get callable
+ _callable = getattr(module, cls) # may raise AttributeError
+ else:
+ _callable = config[key] # the class type itself is passed in
+ kwargs = config.get("kwargs", {})
+ elif isinstance(config, str):
+ # a.b.c.ClassName
+ m_path, cls = split_module_path(config)
+ module = get_module_by_module_path(default_module if m_path == "" else m_path)
+
+ _callable = getattr(module, cls)
+ kwargs = {}
+ else:
+ raise NotImplementedError(f"This type of input is not supported")
+ return _callable, kwargs
+
+
+get_cls_kwargs = get_callable_kwargs # NOTE: this is for compatibility for the previous version
+
+
+def init_instance_by_config(
+ config: InstConf,
+ default_module=None,
+ accept_types: Union[type, Tuple[type]] = (),
+ try_kwargs: Dict = {},
+ **kwargs,
+) -> Any:
+ """
+ get initialized instance with config
+
+ Parameters
+ ----------
+ config : InstConf
+
+ default_module : Python module
+ Optional. It should be a python module.
+ NOTE: the "module_path" will be overridden by `module` arguments
+
+ This function will load class from the config['module_path'] first.
+ If config['module_path'] doesn't exist, it will load the class from default_module.
+
+ accept_types: Union[type, Tuple[type]]
+ Optional. If the config is an instance of specific type, return the config directly.
+ This will be passed into the second parameter of isinstance.
+
+ try_kwargs: Dict
+ Try to pass in kwargs in `try_kwargs` when initializing the instance
+ If an error occurs, it will fall back to initialization without try_kwargs.
+
+ Returns
+ -------
+ object:
+ An initialized object based on the config info
+ """
+ if isinstance(config, accept_types):
+ return config
+
+ if isinstance(config, (str, Path)):
+ if isinstance(config, str):
+ # path like 'file:////obj.pkl'
+ pr = urlparse(config)
+ if pr.scheme == "file":
+ pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc
+ with open(os.path.normpath(pr_path), "rb") as f:
+ return pickle.load(f)
+ else:
+ with config.open("rb") as f:
+ return pickle.load(f)
+
+ klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module)
+
+ try:
+ return klass(**cls_kwargs, **try_kwargs, **kwargs)
+ except (TypeError,):
+ # TypeError for handling errors like
+ # 1: `XXX() got multiple values for keyword argument 'YYY'`
+ # 2: `XXX() got an unexpected keyword argument 'YYY'
+ return klass(**cls_kwargs, **kwargs)
+
+
+@contextlib.contextmanager
+def class_casting(obj: object, cls: type):
+ """
+ Python doesn't provide the downcasting mechanism.
+ We use the trick here to downcast the class
+
+ Parameters
+ ----------
+ obj : object
+ the object to be cast
+ cls : type
+ the target class type
+ """
+ orig_cls = obj.__class__
+ obj.__class__ = cls
+ yield
+ obj.__class__ = orig_cls
+
+
+def find_all_classes(module_path: Union[str, ModuleType], cls: type) -> List[type]:
+ """
+ Find all the classes recursively that inherit from `cls` in a given module.
+ - `cls` itself is also included
+
+ >>> from qlib.data.dataset.handler import DataHandler
+ >>> find_all_classes("qlib.contrib.data.handler", DataHandler)
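+ [<class 'qlib.contrib.data.handler.Alpha158'>, <class 'qlib.contrib.data.handler.Alpha158vwap'>, <class 'qlib.contrib.data.handler.Alpha360'>, <class 'qlib.contrib.data.handler.Alpha360vwap'>, <class 'qlib.data.dataset.handler.DataHandlerLP'>]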
+
+ TODO:
+ - skip import error
+
+ """
+ if isinstance(module_path, ModuleType):
+ mod = module_path
+ else:
+ mod = importlib.import_module(module_path)
+
+ cls_list = []
+
+ def _append_cls(obj):
+ # Leverage the closure trick to reuse code
+ if isinstance(obj, type) and issubclass(obj, cls) and cls not in cls_list:
+ cls_list.append(obj)
+
+ for attr in dir(mod):
+ _append_cls(getattr(mod, attr))
+
+ if hasattr(mod, "__path__"):
+ # if the module is a package
+ for _, modname, _ in pkgutil.iter_modules(mod.__path__):
+ sub_mod = importlib.import_module(f"{mod.__package__}.{modname}")
+ for m_cls in find_all_classes(sub_mod, cls):
+ _append_cls(m_cls)
+ return cls_list
diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py
index 552848395..4f0413274 100644
--- a/qlib/workflow/record_temp.py
+++ b/qlib/workflow/record_temp.py
@@ -136,7 +136,6 @@ class RecordTemp:
whether the records are stored properly.
"""
if include_self:
-
# Some mlflow backend will not list the directly recursively.
# So we force to the directly
artifacts = {}
diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py
index 77bd2cbc1..bd98e501d 100644
--- a/qlib/workflow/task/gen.py
+++ b/qlib/workflow/task/gen.py
@@ -339,7 +339,6 @@ class MultiHorizonGenBase(TaskGen):
def generate(self, task: dict):
res = []
for hr in self.horizon:
-
# Add horizon
t = copy.deepcopy(task)
self.set_horizon(t, hr)
diff --git a/qlib/workflow/task/utils.py b/qlib/workflow/task/utils.py
index a914ea54f..19837b3c7 100644
--- a/qlib/workflow/task/utils.py
+++ b/qlib/workflow/task/utils.py
@@ -1,23 +1,25 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
-
"""
Some tools for task management.
"""
import bisect
+from copy import deepcopy
import pandas as pd
from qlib.data import D
+from qlib.utils import hash_args
+from qlib.utils.mod import init_instance_by_config
from qlib.workflow import R
from qlib.config import C
from qlib.log import get_module_logger
from pymongo import MongoClient
from pymongo.database import Database
from typing import Union
+from pathlib import Path
def get_mongodb() -> Database:
-
"""
Get database in MongoDB, which means you need to declare the address and the name of a database at first.
@@ -276,3 +278,31 @@ class TimeAdjuster:
return self.get(start_idx), self.get(end_idx)
else:
raise NotImplementedError(f"This type of input is not supported")
+
+
+def replace_task_handler_with_cache(task: dict, cache_dir: Union[str, Path] = ".") -> dict:
+    """
+    Return a copy of `task` whose handler config is swapped for a pickled cache file.
+
+    When the handler under ``task["dataset"]["kwargs"]["handler"]`` is a dict
+    config, it is instantiated and pickled into `cache_dir` (unless the target
+    file already exists) and the config is replaced by a ``file://`` reference
+    to that pickle. A handler that is not a dict is left as it is.
+    The input `task` is never modified; a deep copy is returned.
+
+    >>> import qlib
+    >>> qlib.auto_init()
+    >>> import datetime
+    >>> # it is simplified task
+    >>> task = {"dataset": {"kwargs":{'handler': {'class': 'Alpha158', 'module_path': 'qlib.contrib.data.handler', 'kwargs': {'start_time': datetime.date(2008, 1, 1), 'end_time': datetime.date(2020, 8, 1), 'fit_start_time': datetime.date(2008, 1, 1), 'fit_end_time': datetime.date(2014, 12, 31), 'instruments': 'CSI300'}}}}}
+    >>> new_task = replace_task_handler_with_cache(task)
+    >>> print(new_task)
+    {'dataset': {'kwargs': {'handler': 'file...Alpha158.3584f5f8b4.pkl'}}}
+
+    """
+    cache_root = Path(cache_dir)
+    new_task = deepcopy(task)
+    dataset_kwargs = new_task["dataset"]["kwargs"]
+    handler_conf = dataset_kwargs["handler"]
+
+    # Nothing to cache unless the handler is still a config dict.
+    if not isinstance(handler_conf, dict):
+        return new_task
+
+    # The file name combines the handler class name with a 10-character
+    # prefix of the config hash.
+    digest = hash_args(handler_conf)
+    pickle_path = cache_root / f"{handler_conf['class']}.{digest[:10]}.pkl"
+    if not pickle_path.exists():
+        init_instance_by_config(handler_conf).to_pickle(pickle_path, dump_all=True)
+    dataset_kwargs["handler"] = f"file://{pickle_path}"
+    return new_task
diff --git a/scripts/check_dump_bin.py b/scripts/check_dump_bin.py
index ef8023219..7ae8a26ab 100644
--- a/scripts/check_dump_bin.py
+++ b/scripts/check_dump_bin.py
@@ -15,7 +15,6 @@ from loguru import logger
class CheckBin:
-
NOT_IN_FEATURES = "not in features"
COMPARE_FALSE = "compare False"
COMPARE_TRUE = "compare True"
diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index e3cf1fcac..386bb1b2c 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -18,7 +18,6 @@ from qlib.utils import code_to_fname
class BaseCollector(abc.ABC):
-
CACHE_FLAG = "CACHED"
NORMAL_FLAG = "NORMAL"
@@ -185,7 +184,6 @@ class BaseCollector(abc.ABC):
return self.NORMAL_FLAG
def _collector(self, instrument_list):
-
error_symbol = []
res = Parallel(n_jobs=self.max_workers)(
delayed(self._simple_collector)(_inst) for _inst in tqdm(instrument_list)
diff --git a/scripts/data_collector/br_index/collector.py b/scripts/data_collector/br_index/collector.py
index 0dc12eff6..7d32170f0 100644
--- a/scripts/data_collector/br_index/collector.py
+++ b/scripts/data_collector/br_index/collector.py
@@ -21,7 +21,6 @@ quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"}
class IBOVIndex(IndexBase):
-
ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"
years_4_month_periods = []
diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 97cbce825..cb0c3fc95 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -143,7 +143,6 @@ class WIKIIndex(IndexBase):
class NASDAQ100Index(WIKIIndex):
-
HISTORY_COMPANIES_URL = (
"https://indexes.nasdaqomx.com/Index/WeightingData?id=NDX&tradeDate={trade_date}T00%3A00%3A00.000&timeOfDay=SOD"
)
diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py
index cda872c09..c328eb67a 100644
--- a/scripts/dump_pit.py
+++ b/scripts/dump_pit.py
@@ -237,7 +237,6 @@ class DumpPitData:
pass
with open(data_file, "rb+") as fd, open(index_file, "rb+") as fi:
-
# update index if needed
for i, row in df_sub.iterrows():
# get index
diff --git a/setup.py b/setup.py
index bf533cfe4..86d11dd61 100644
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,7 @@ REQUIRED = [
"gym",
# Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail.
"protobuf<=3.20.1;python_version<='3.8'",
+ "cryptography",
]
# Numpy include
diff --git a/tests/backtest/test_high_freq_trading.py b/tests/backtest/test_high_freq_trading.py
index fd934914d..a538464db 100644
--- a/tests/backtest/test_high_freq_trading.py
+++ b/tests/backtest/test_high_freq_trading.py
@@ -27,7 +27,6 @@ class TestHFBacktest(TestAutoData):
return pd.DataFrame(orders, columns=headers)
def test_trading(self):
-
# date = "2020-02-03"
# inst = "SH600068"
# pos = 2.0167
diff --git a/tests/data_mid_layer_tests/test_handler_storage.py b/tests/data_mid_layer_tests/test_handler_storage.py
index 0d8ad4d57..a8bb730f7 100644
--- a/tests/data_mid_layer_tests/test_handler_storage.py
+++ b/tests/data_mid_layer_tests/test_handler_storage.py
@@ -21,7 +21,6 @@ class TestHandler(DataHandlerLP):
fit_end_time=None,
drop_raw=True,
):
-
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time)
@@ -51,7 +50,6 @@ class TestHandler(DataHandlerLP):
class TestHandlerStorage(TestAutoData):
-
market = "all"
start_time = "2010-01-01"
@@ -82,7 +80,6 @@ class TestHandlerStorage(TestAutoData):
)
with TimeInspector.logt("random fetch with DataFrame Storage"):
-
# single stock
for i in range(100):
random_index = np.random.randint(len(instruments), size=1)[0]
@@ -96,7 +93,6 @@ class TestHandlerStorage(TestAutoData):
data_handler.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None)
with TimeInspector.logt("random fetch with HashingStock Storage"):
-
# single stock
for i in range(100):
random_index = np.random.randint(len(instruments), size=1)[0]
diff --git a/tests/misc/test_sepdf.py b/tests/misc/test_sepdf.py
index 9fdc0bb2d..76bd0e6bd 100644
--- a/tests/misc/test_sepdf.py
+++ b/tests/misc/test_sepdf.py
@@ -11,7 +11,6 @@ class SepDF(unittest.TestCase):
return "".join(str(obj).split())
def test_index_data(self):
-
np.random.seed(42)
index = [
diff --git a/tests/rolling_tests/test_update_pred.py b/tests/rolling_tests/test_update_pred.py
index 324611948..b3ca2e036 100644
--- a/tests/rolling_tests/test_update_pred.py
+++ b/tests/rolling_tests/test_update_pred.py
@@ -77,7 +77,6 @@ class TestRolling(TestAutoData):
@pytest.mark.slow
def test_update_label(self):
-
task = copy.deepcopy(CSI300_GBDT_TASK)
task["record"] = {
diff --git a/tests/storage_tests/test_storage.py b/tests/storage_tests/test_storage.py
index 50b16a041..92fed34ec 100644
--- a/tests/storage_tests/test_storage.py
+++ b/tests/storage_tests/test_storage.py
@@ -22,7 +22,6 @@ QLIB_DIR.mkdir(exist_ok=True, parents=True)
class TestStorage(TestAutoData):
def test_calendar_storage(self):
-
calendar = CalendarStorage(freq="day", future=False, provider_uri=self.provider_uri)
assert isinstance(calendar[:], Iterable), f"{calendar.__class__.__name__}.__getitem__(s: slice) is not Iterable"
assert isinstance(calendar.data, Iterable), f"{calendar.__class__.__name__}.data is not Iterable"
diff --git a/tests/test_dump_data.py b/tests/test_dump_data.py
index dfa7f8556..33cae4e80 100644
--- a/tests/test_dump_data.py
+++ b/tests/test_dump_data.py
@@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
- GetData().csv_data_cn(SOURCE_DIR)
+ GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS)
TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
provider_uri = str(QLIB_DIR.resolve())
diff --git a/tests/test_get_data.py b/tests/test_get_data.py
index 93a852f55..125b9203e 100644
--- a/tests/test_get_data.py
+++ b/tests/test_get_data.py
@@ -33,7 +33,6 @@ class TestGetData(unittest.TestCase):
shutil.rmtree(str(DATA_DIR.resolve()))
def test_0_qlib_data(self):
-
GetData().qlib_data(
name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False, exists_skip=True
)
@@ -42,7 +41,7 @@ class TestGetData(unittest.TestCase):
self.assertFalse(df.dropna().empty, "get qlib data failed")
def test_1_csv_data(self):
- GetData().csv_data_cn(SOURCE_DIR)
+ GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
self.assertEqual(len(stock_name), 85, "get csv data failed")