1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 14:01:28 +08:00

Compare commits

...

8 Commits
6cma ... v0.9.2

Author SHA1 Message Date
you-n-g
27f476b311 Update __init__.py 2023-06-26 00:00:46 +08:00
you-n-g
0e61cac6a8 Update release-drafter.yml (#1569)
* Update release-drafter.yml

* Update release-drafter.yml
2023-06-25 23:48:37 +08:00
Linlang
21f0b394e7 change get_data url (#1558)
* change_url

* fix_CI

* fix_CI_2

* fix_CI_3

* fix_CI_4

* fix_CI_5

* fix_CI_6

* fix_CI_7

* fix_CI_8

* fix_CI_9

* fix_CI_10

* fix_CI_11

* fix_CI_12

* fix_CI_13

* fix_CI_13

* fix_CI_14

* fix_CI_15

* fix_CI_16

* fix_CI_17

* fix_CI_18

* fix_CI_19

* fix_CI_20

* fix_CI_21

* fix_CI_22

* fix_CI_23

* fix_CI_24

* fix_CI_25

* fix_CI_26

* fix_CI_27

* fix_get_data_error

* fix_get_data_error2

* modify_get_data

* modify_get_data2

* modify_get_data3

* modify_get_data4

* fix_CI_28

* fix_CI_29

* fix_CI_30

---------

Co-authored-by: Linlang <v-linlanglv@microsoft.com>
2023-06-25 23:39:11 +08:00
Wendi Li
cd4ab998fb Update on Dynamic Benchmark (#1539)
* move config file to benchmark_dynamic & switch default sim task model to GBDT

* Update benchmark_dynamic results

* Change the default value of alpha of DDG-DA
2023-06-03 08:42:24 +08:00
you-n-g
0e9ac9dce7 Fix CI (#1529) 2023-05-31 08:39:52 +08:00
yaxuan999
efffb2819a added KRNN and Sandwich models and their example results based on Alpha360 (#1414)
* Update README.md

updated the result of KRNN and Sandwich models based on Alpha360

* Update README.md

* Update README.md

* Add files via upload

* Update README.md

* Update README.md

* Update README.md

* Add files via upload

* Delete pytorch_krnn.py

* Delete pytorch_sandwich.py

* Add files via upload

* Update pytorch_sandwich.py

* Update pytorch_krnn.py

* Update pytorch_sandwich.py

* Update pytorch_krnn.py

* Update README.md

* Update README.md

* Update requirements.txt

* Update requirements.txt

* Update README.md

* Update README.md

* Update pytorch_sandwich.py

* Update link on index

---------

Co-authored-by: Young <afe.young@gmail.com>
2023-05-26 18:42:58 +08:00
Fivele-Li
19a0eb78bc Fix TCN model input dimension mismatch (#1520)
* transpose dimension 1 and 2 to match nn.Conv1d input

* 1.update TCN benchmarks;
2.Emphasize updating the benchmark table;

* replace specific version with main

---------

Co-authored-by: lijinhui <362237642@qq.com>
2023-05-26 14:44:34 +08:00
Fivele-Li
370477288d fix_DDG-DA_workflow_bug (#1516)
* 1.specify group_keys=False to avoid FutureWarning;
2.fix get train_start from dict unexpected problem;

* fix black

* Add comments

* Add make file

---------

Co-authored-by: Young <afe.young@gmail.com>
2023-05-24 15:49:58 +08:00
26 changed files with 1367 additions and 73 deletions

View File

@@ -14,6 +14,9 @@ categories:
label:
- 'doc'
- 'documentation'
- title: '🧹 Maintenance'
label:
- 'maintenance'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
version-resolver:
@@ -30,4 +33,4 @@ version-resolver:
template: |
## Changes
$CHANGES
$CHANGES

View File

@@ -20,18 +20,28 @@ jobs:
steps:
- name: Test qlib from source
uses: actions/checkout@v2
uses: actions/checkout@v3
# Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error".
# So we make the version number of python 3.7 for MacOS more specific.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Update pip to the latest version
# pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0
- name: Installing pytorch for macos
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}
@@ -129,8 +139,7 @@ jobs:
- name: Test data downloads
run: |
python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive
mv /tmp/qlibpublic/data tests/.data
python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl
- name: Install Lightgbm for MacOS
if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }}

View File

@@ -20,18 +20,28 @@ jobs:
steps:
- name: Test qlib from source slow
uses: actions/checkout@v2
uses: actions/checkout@v3
# Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error".
# So we make the version number of python 3.7 for MacOS more specific.
# refs: https://github.com/actions/setup-python/issues/682
- name: Set up Python ${{ matrix.python-version }}
if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7')
uses: actions/setup-python@v4
with:
python-version: "3.7.16"
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7')
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Set up Python tools
# pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs.
# The pip version has been temporarily fixed to 23.0.1
# The pip version has been temporarily fixed to 23.0
run: |
python -m pip install pip==23.0.1
python -m pip install pip==23.0
pip install --upgrade cython numpy
pip install -e .[dev]

View File

@@ -11,6 +11,7 @@
Recent released features
| Feature | Status |
| -- | ------ |
| KRNN and Sandwich models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1414/) on May 26, 2023 |
| Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 |
| RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. [#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)|
| HIST and IGMTF models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1040) on Apr 10, 2022 |
@@ -353,6 +354,8 @@ Here is a list of models built on `Qlib`.
- [ADD based on pytorch (Hongshun Tang, et al.2020)](examples/benchmarks/ADD/)
- [IGMTF based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/IGMTF/)
- [HIST based on pytorch (Wentao Xu, et al.2021)](examples/benchmarks/HIST/)
- [KRNN based on pytorch](examples/benchmarks/KRNN/)
- [Sandwich based on pytorch](examples/benchmarks/Sandwich/)
Your PR of new Quant models is highly welcomed.

View File

@@ -119,7 +119,7 @@ Here are some example:
for daily data:
.. code-block:: bash
python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
python scripts/get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data
for 1min data:
.. code-block:: bash

View File

@@ -0,0 +1,8 @@
# KRNN
* Code: [https://github.com/microsoft/FOST/blob/main/fostool/model/krnn.py](https://github.com/microsoft/FOST/blob/main/fostool/model/krnn.py)
# Introductions about the settings/configs.
* Torch_geometric is used in the original model in FOST, but we didn't use it.
* make use your CUDA version matches the torch version to allow the usage of GPU, we use CUDA==10.2 and torch.__version__==1.12.1

View File

@@ -0,0 +1,2 @@
numpy==1.23.4
pandas==1.5.2

View File

@@ -0,0 +1,91 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
signal:
- <MODEL>
- <DATASET>
topk: 50
n_drop: 5
backtest:
start_time: 2017-01-01
end_time: 2020-08-01
account: 100000000
benchmark: *benchmark
exchange_kwargs:
limit_threshold: 0.095
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: KRNN
module_path: qlib.contrib.model.pytorch_krnn
kwargs:
fea_dim: 6
cnn_dim: 8
cnn_kernel_size: 3
rnn_dim: 8
rnn_dups: 2
rnn_layers: 2
n_epochs: 200
lr: 0.001
early_stop: 20
batch_size: 2000
metric: loss
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs:
model: <MODEL>
dataset: <DATASET>
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -26,7 +26,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 |
| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0279±0.00 | 0.2181±0.01 | 0.0421±0.00 | 0.3429±0.01 | 0.0262±0.02 | 0.4133±0.25 | -0.1090±0.03 |
| TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 |
| Transformer(Ashish Vaswani, et al.) | Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 |
| GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 |
@@ -68,6 +68,8 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
| TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 |
| IGMTF(Wentao Xu, et al.) | Alpha360 | 0.0480±0.00 | 0.3589±0.02 | 0.0606±0.00 | 0.4773±0.01 | 0.0946±0.02 | 1.3509±0.25 | -0.0716±0.02 |
| HIST(Wentao Xu, et al.) | Alpha360 | 0.0522±0.00 | 0.3530±0.01 | 0.0667±0.00 | 0.4576±0.01 | 0.0987±0.02 | 1.3726±0.27 | -0.0681±0.01 |
| KRNN | Alpha360 | 0.0173±0.01 | 0.1210±0.06 | 0.0270±0.01 | 0.2018±0.04 | -0.0465±0.05 | -0.5415±0.62 | -0.2919±0.13 |
| Sandwich | Alpha360 | 0.0258±0.00 | 0.1924±0.04 | 0.0337±0.00 | 0.2624±0.03 | 0.0005±0.03 | 0.0001±0.33 | -0.1752±0.05 |
- The selected 20 features are based on the feature importance of a lightgbm-based model.
@@ -134,7 +136,7 @@ If you want to contribute your new models, you can follow the steps below.
- `README.md`: a brief introduction to your models
- `workflow_config_<model name>_<dataset>.yaml`: a configuration which can read by `qrun`. You are encouraged to run your model in all datasets.
3. You can integrate your model as a module [in this folder](https://github.com/microsoft/qlib/tree/main/qlib/contrib/model).
4. Please updated your results in the benchmark tables, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on 20 runs with different random seeds, if you don't have enough computational resource, you can ask for help in the PR).
4. Please update your results in the above **Benchmark Tables**, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on **20 Runs** with different random seeds. You can accomplish the above operations through the automated [script](https://github.com/microsoft/qlib/blob/main/examples/run_all_model.py#LL286C22-L286C22) provided by Qlib, and get the final result in the .md file. if you don't have enough computational resource, you can ask for help in the PR).
5. Update the info in the index page in the [news list](https://github.com/microsoft/qlib#newspaper-whats-new----sparkling_heart) and [model list](https://github.com/microsoft/qlib#quant-model-paper-zoo).
Finally, you can send PR for review. ([here is an example](https://github.com/microsoft/qlib/pull/1040))

View File

@@ -0,0 +1,8 @@
# Sandwich
* Code: [https://github.com/microsoft/FOST/blob/main/fostool/model/sandwich.py](https://github.com/microsoft/FOST/blob/main/fostool/model/sandwich.py)
# Introductions about the settings/configs.
* Torch_geometric is used in the original model in FOST, but we didn't use it.
make use your CUDA version matches the torch version to allow the usage of GPU, we use CUDA==10.2 and torch.version==1.12.1

View File

@@ -0,0 +1,2 @@
numpy==1.23.4
pandas==1.5.2

View File

@@ -0,0 +1,93 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
signal:
- <MODEL>
- <DATASET>
topk: 50
n_drop: 5
backtest:
start_time: 2017-01-01
end_time: 2020-08-01
account: 100000000
benchmark: *benchmark
exchange_kwargs:
limit_threshold: 0.095
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: Sandwich
module_path: qlib.contrib.model.pytorch_sandwich
kwargs:
fea_dim: 6
cnn_dim_1: 16
cnn_dim_2: 16
cnn_kernel_size: 3
rnn_dim_1: 8
rnn_dim_2: 8
rnn_dups: 2
rnn_layers: 2
n_epochs: 200
lr: 0.001
early_stop: 20
batch_size: 2000
metric: loss
GPU: 0
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha360
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs:
model: <MODEL>
dataset: <DATASET>
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,4 @@
.PHONY: clean
clean:
-rm -r *.pkl mlruns || true

View File

@@ -34,14 +34,14 @@ class DDGDA:
def __init__(
self,
sim_task_model: Literal["linear", "gbdt"] = "linear",
sim_task_model: Literal["linear", "gbdt"] = "gbdt",
forecast_model: Literal["linear", "gbdt"] = "linear",
h_path: Optional[str] = None,
test_end: Optional[str] = None,
train_start: Optional[str] = None,
meta_1st_train_end: Optional[str] = None,
task_ext_conf: Optional[dict] = None,
alpha: float = 0.0,
alpha: float = 0.01,
proxy_hd: str = "handler_proxy.pkl",
):
"""
@@ -116,7 +116,9 @@ class DDGDA:
feature_selected = feature_df.loc[:, col_selected.index]
feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std()))
feature_selected = feature_selected.groupby("datetime", group_keys=False).apply(
lambda df: (df - df.mean()).div(df.std())
)
feature_selected = feature_selected.fillna(0.0)
df_all = {
@@ -168,7 +170,8 @@ class DDGDA:
# - Only the dataset part is important, in current version of meta model will integrate the
rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs)
sim_task = rb.basic_task()
train_start = self.rb_kwargs.get("train_start", "2008-01-01")
# the train_start for training meta model does not necessarily align with final rolling
train_start = "2008-01-01" if self.rb_kwargs.get("train_start") is None else self.rb_kwargs.get("train_start")
train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end
test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
proxy_forecast_model_task = {
@@ -212,7 +215,7 @@ class DDGDA:
with R.start(experiment_name=self.meta_exp_name):
R.log_params(**kwargs)
mm = MetaModelDS(
step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=100, seed=43, alpha=self.alpha
step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha
)
mm.fit(md)
R.save_objects(model=mm)

View File

@@ -8,15 +8,17 @@ The table below shows the performances of different solutions on different forec
Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
rm -f qlib_bin.tar.gz
```
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
| RR[Linear] |Alpha158 |0.089|0.577|0.102 |0.627 |0.093 |1.458 |-0.073 |
| DDG-DA[Linear] |Alpha158 |0.096|0.636|0.107 |0.677 |0.067 |0.996 |-0.091 |
| RR[LightGBM] |Alpha158 |0.082|0.589|0.091 |0.626 |0.077 |1.320 |-0.091 |
| DDG-DA[LightGBM] |Alpha158 |0.085|0.658|0.094 |0.686 |0.115 |1.792 |-0.068 |
|------------------|---------|------|------|---------|-----------|-------------------|-------------------|--------------|
| RR[Linear] |Alpha158 |0.0945|0.5989|0.1069 |0.6495 |0.0857 |1.3682 |-0.0986 |
| DDG-DA[Linear] |Alpha158 |0.0983|0.6157|0.1108 |0.6646 |0.0764 |1.1904 |-0.0769 |
| RR[LightGBM] |Alpha158 |0.0816|0.5887|0.0912 |0.6263 |0.0771 |1.3196 |-0.0909 |
| DDG-DA[LightGBM] |Alpha158 |0.0878|0.6185|0.0975 |0.6524 |0.1261 |2.0096 |-0.0744 |
- The label horizon of the `Alpha158` dataset is set to 20.
- The rolling time intervals are set to 20 trading days.

View File

@@ -67,11 +67,12 @@ class RollingBenchmark:
def basic_task(self):
"""For fast training rolling"""
if self.model_type == "gbdt":
conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
conf_path = DIRNAME / "workflow_config_lightgbm_Alpha158.yaml"
# dump the processed data on to disk for later loading to speed up the processing
h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon)
elif self.model_type == "linear":
conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
# We use ridge regression to stabilize the performance
conf_path = DIRNAME / "workflow_config_linear_Alpha158.yaml"
h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon)
else:
raise AssertionError("Model type is not supported!")

View File

@@ -0,0 +1,72 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
model: <MODEL>
dataset: <DATASET>
topk: 50
n_drop: 5
backtest:
start_time: 2017-01-01
end_time: 2020-08-01
account: 100000000
benchmark: *benchmark
exchange_kwargs:
limit_threshold: 0.095
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: LGBModel
module_path: qlib.contrib.model.gbdt
kwargs:
loss: mse
colsample_bytree: 0.8879
learning_rate: 0.2
subsample: 0.8789
lambda_l1: 205.6999
lambda_l2: 580.9768
max_depth: 8
num_leaves: 210
num_threads: 20
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs:
model: <MODEL>
dataset: <DATASET>
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -0,0 +1,79 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy
kwargs:
signal:
- <MODEL>
- <DATASET>
topk: 50
n_drop: 5
backtest:
start_time: 2017-01-01
end_time: 2020-08-01
account: 100000000
benchmark: *benchmark
exchange_kwargs:
limit_threshold: 0.095
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: LinearModel
module_path: qlib.contrib.model.linear
kwargs:
estimator: ridge
alpha: 0.05
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs:
model: <MODEL>
dataset: <DATASET>
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: True
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
from pathlib import Path
__version__ = "0.9.1.99"
__version__ = "0.9.2"
__version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version
import os
from typing import Union

View File

@@ -0,0 +1,511 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
from typing import Text, Union
import copy
from ...utils import get_or_create_path
from ...log import get_module_logger
import torch
import torch.nn as nn
import torch.optim as optim
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
########################################################################
########################################################################
########################################################################
class CNNEncoderBase(nn.Module):
def __init__(self, input_dim, output_dim, kernel_size, device):
"""Build a basic CNN encoder
Parameters
----------
input_dim : int
The input dimension
output_dim : int
The output dimension
kernel_size : int
The size of convolutional kernels
"""
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.kernel_size = kernel_size
self.device = device
# set padding to ensure the same length
# it is correct only when kernel_size is odd, dilation is 1, stride is 1
self.conv = nn.Conv1d(input_dim, output_dim, kernel_size, padding=(kernel_size - 1) // 2)
def forward(self, x):
"""
Parameters
----------
x : torch.Tensor
input data
Returns
-------
torch.Tensor
Updated representations
"""
# input shape: [batch_size, seq_len*input_dim]
# output shape: [batch_size, seq_len, input_dim]
x = x.view(x.shape[0], -1, self.input_dim).permute(0, 2, 1).to(self.device)
y = self.conv(x) # [batch_size, output_dim, conved_seq_len]
y = y.permute(0, 2, 1) # [batch_size, conved_seq_len, output_dim]
return y
class KRNNEncoderBase(nn.Module):
def __init__(self, input_dim, output_dim, dup_num, rnn_layers, dropout, device):
"""Build K parallel RNNs
Parameters
----------
input_dim : int
The input dimension
output_dim : int
The output dimension
dup_num : int
The number of parallel RNNs
rnn_layers: int
The number of RNN layers
"""
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.dup_num = dup_num
self.rnn_layers = rnn_layers
self.dropout = dropout
self.device = device
self.rnn_modules = nn.ModuleList()
for _ in range(dup_num):
self.rnn_modules.append(nn.GRU(input_dim, output_dim, num_layers=self.rnn_layers, dropout=dropout))
def forward(self, x):
"""
Parameters
----------
x : torch.Tensor
Input data
n_id : torch.Tensor
Node indices
Returns
-------
torch.Tensor
Updated representations
"""
# input shape: [batch_size, seq_len, input_dim]
# output shape: [batch_size, seq_len, output_dim]
# [seq_len, batch_size, input_dim]
batch_size, seq_len, input_dim = x.shape
x = x.permute(1, 0, 2).to(self.device)
hids = []
for rnn in self.rnn_modules:
h, _ = rnn(x) # [seq_len, batch_size, output_dim]
hids.append(h)
# [seq_len, batch_size, output_dim, num_dups]
hids = torch.stack(hids, dim=-1)
hids = hids.view(seq_len, batch_size, self.output_dim, self.dup_num)
hids = hids.mean(dim=3)
hids = hids.permute(1, 0, 2)
return hids
class CNNKRNNEncoder(nn.Module):
def __init__(
self, cnn_input_dim, cnn_output_dim, cnn_kernel_size, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device
):
"""Build an encoder composed of CNN and KRNN
Parameters
----------
cnn_input_dim : int
The input dimension of CNN
cnn_output_dim : int
The output dimension of CNN
cnn_kernel_size : int
The size of convolutional kernels
rnn_output_dim : int
The output dimension of KRNN
rnn_dup_num : int
The number of parallel duplicates for KRNN
rnn_layers : int
The number of RNN layers
"""
super().__init__()
self.cnn_encoder = CNNEncoderBase(cnn_input_dim, cnn_output_dim, cnn_kernel_size, device)
self.krnn_encoder = KRNNEncoderBase(cnn_output_dim, rnn_output_dim, rnn_dup_num, rnn_layers, dropout, device)
def forward(self, x):
"""
Parameters
----------
x : torch.Tensor
Input data
n_id : torch.Tensor
Node indices
Returns
-------
torch.Tensor
Updated representations
"""
cnn_out = self.cnn_encoder(x)
krnn_out = self.krnn_encoder(cnn_out)
return krnn_out
class KRNNModel(nn.Module):
def __init__(self, fea_dim, cnn_dim, cnn_kernel_size, rnn_dim, rnn_dups, rnn_layers, dropout, device, **params):
"""Build a KRNN model
Parameters
----------
fea_dim : int
The feature dimension
cnn_dim : int
The hidden dimension of CNN
cnn_kernel_size : int
The size of convolutional kernels
rnn_dim : int
The hidden dimension of KRNN
rnn_dups : int
The number of parallel duplicates
rnn_layers: int
The number of RNN layers
"""
super().__init__()
self.encoder = CNNKRNNEncoder(
cnn_input_dim=fea_dim,
cnn_output_dim=cnn_dim,
cnn_kernel_size=cnn_kernel_size,
rnn_output_dim=rnn_dim,
rnn_dup_num=rnn_dups,
rnn_layers=rnn_layers,
dropout=dropout,
device=device,
)
self.out_fc = nn.Linear(rnn_dim, 1)
self.device = device
def forward(self, x):
# x: [batch_size, node_num, seq_len, input_dim]
encode = self.encoder(x)
out = self.out_fc(encode[:, -1, :]).squeeze().to(self.device)
return out
class KRNN(Model):
"""KRNN Model
Parameters
----------
d_feat : int
input dimension for each time step
metric: str
the evaluation metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
fea_dim=6,
cnn_dim=64,
cnn_kernel_size=3,
rnn_dim=64,
rnn_dups=3,
rnn_layers=2,
dropout=0,
n_epochs=200,
lr=0.001,
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
GPU=0,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("KRNN")
self.logger.info("KRNN pytorch version...")
# set hyper-parameters.
self.fea_dim = fea_dim
self.cnn_dim = cnn_dim
self.cnn_kernel_size = cnn_kernel_size
self.rnn_dim = rnn_dim
self.rnn_dups = rnn_dups
self.rnn_layers = rnn_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
self.seed = seed
self.logger.info(
"KRNN parameters setting:"
"\nfea_dim : {}"
"\ncnn_dim : {}"
"\ncnn_kernel_size : {}"
"\nrnn_dim : {}"
"\nrnn_dups : {}"
"\nrnn_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size: {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
fea_dim,
cnn_dim,
cnn_kernel_size,
rnn_dim,
rnn_dups,
rnn_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.krnn_model = KRNNModel(
fea_dim=self.fea_dim,
cnn_dim=self.cnn_dim,
cnn_kernel_size=self.cnn_kernel_size,
rnn_dim=self.rnn_dim,
rnn_dups=self.rnn_dups,
rnn_layers=self.rnn_layers,
dropout=self.dropout,
device=self.device,
)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.krnn_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.krnn_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self.fitted = False
self.krnn_model.to(self.device)
@property
def use_gpu(self):
return self.device != torch.device("cpu")
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def get_daily_inter(self, df, shuffle=False):
# organize the train data into daily batches
daily_count = df.groupby(level=0).size().values
daily_index = np.roll(np.cumsum(daily_count), 1)
daily_index[0] = 0
if shuffle:
# shuffle data
daily_shuffle = list(zip(daily_index, daily_count))
np.random.shuffle(daily_shuffle)
daily_index, daily_count = zip(*daily_shuffle)
return daily_index, daily_count
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
self.krnn_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.krnn_model(feature)
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.krnn_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
self.krnn_model.eval()
scores = []
losses = []
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.krnn_model(feature)
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset: DatasetH,
evals_result=dict(),
save_path=None,
):
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
data_key=DataHandlerLP.DK_L,
)
if df_train.empty or df_valid.empty:
raise ValueError("Empty data from dataset, please check your dataset config.")
x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_valid["feature"], df_valid["label"]
save_path = get_or_create_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self.fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(x_train, y_train)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(x_train, y_train)
val_loss, val_score = self.test_epoch(x_valid, y_valid)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.krnn_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.krnn_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
if not self.fitted:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
index = x_test.index
self.krnn_model.eval()
x_values = x_test.values
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
with torch.no_grad():
pred = self.krnn_model(x_batch).detach().cpu().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=index)

View File

@@ -0,0 +1,381 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
from typing import Text, Union
import copy
from ...utils import get_or_create_path
from ...log import get_module_logger
import torch
import torch.nn as nn
import torch.optim as optim
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from .pytorch_krnn import CNNKRNNEncoder
class SandwichModel(nn.Module):
def __init__(
self,
fea_dim,
cnn_dim_1,
cnn_dim_2,
cnn_kernel_size,
rnn_dim_1,
rnn_dim_2,
rnn_dups,
rnn_layers,
dropout,
device,
**params
):
"""Build a Sandwich model
Parameters
----------
fea_dim : int
The feature dimension
cnn_dim_1 : int
The hidden dimension of the first CNN
cnn_dim_2 : int
The hidden dimension of the second CNN
cnn_kernel_size : int
The size of convolutional kernels
rnn_dim_1 : int
The hidden dimension of the first KRNN
rnn_dim_2 : int
The hidden dimension of the second KRNN
rnn_dups : int
The number of parallel duplicates
rnn_layers: int
The number of RNN layers
"""
super().__init__()
self.first_encoder = CNNKRNNEncoder(
cnn_input_dim=fea_dim,
cnn_output_dim=cnn_dim_1,
cnn_kernel_size=cnn_kernel_size,
rnn_output_dim=rnn_dim_1,
rnn_dup_num=rnn_dups,
rnn_layers=rnn_layers,
dropout=dropout,
device=device,
)
self.second_encoder = CNNKRNNEncoder(
cnn_input_dim=rnn_dim_1,
cnn_output_dim=cnn_dim_2,
cnn_kernel_size=cnn_kernel_size,
rnn_output_dim=rnn_dim_2,
rnn_dup_num=rnn_dups,
rnn_layers=rnn_layers,
dropout=dropout,
device=device,
)
self.out_fc = nn.Linear(rnn_dim_2, 1)
self.device = device
def forward(self, x):
# x: [batch_size, node_num, seq_len, input_dim]
encode = self.first_encoder(x)
encode = self.second_encoder(encode)
out = self.out_fc(encode[:, -1, :]).squeeze().to(self.device)
return out
class Sandwich(Model):
"""Sandwich Model
Parameters
----------
d_feat : int
input dimension for each time step
metric: str
the evaluation metric used in early stop
optimizer : str
optimizer name
GPU : str
the GPU ID(s) used for training
"""
def __init__(
self,
fea_dim=6,
cnn_dim_1=64,
cnn_dim_2=32,
cnn_kernel_size=3,
rnn_dim_1=16,
rnn_dim_2=8,
rnn_dups=3,
rnn_layers=2,
dropout=0,
n_epochs=200,
lr=0.001,
metric="",
batch_size=2000,
early_stop=20,
loss="mse",
optimizer="adam",
GPU=0,
seed=None,
**kwargs
):
# Set logger.
self.logger = get_module_logger("Sandwich")
self.logger.info("Sandwich pytorch version...")
# set hyper-parameters.
self.fea_dim = fea_dim
self.cnn_dim_1 = cnn_dim_1
self.cnn_dim_2 = cnn_dim_2
self.cnn_kernel_size = cnn_kernel_size
self.rnn_dim_1 = rnn_dim_1
self.rnn_dim_2 = rnn_dim_2
self.rnn_dups = rnn_dups
self.rnn_layers = rnn_layers
self.dropout = dropout
self.n_epochs = n_epochs
self.lr = lr
self.metric = metric
self.batch_size = batch_size
self.early_stop = early_stop
self.optimizer = optimizer.lower()
self.loss = loss
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
self.seed = seed
self.logger.info(
"Sandwich parameters setting:"
"\nfea_dim : {}"
"\ncnn_dim_1 : {}"
"\ncnn_dim_2 : {}"
"\ncnn_kernel_size : {}"
"\nrnn_dim_1 : {}"
"\nrnn_dim_2 : {}"
"\nrnn_dups : {}"
"\nrnn_layers : {}"
"\ndropout : {}"
"\nn_epochs : {}"
"\nlr : {}"
"\nmetric : {}"
"\nbatch_size: {}"
"\nearly_stop : {}"
"\noptimizer : {}"
"\nloss_type : {}"
"\nvisible_GPU : {}"
"\nuse_GPU : {}"
"\nseed : {}".format(
fea_dim,
cnn_dim_1,
cnn_dim_2,
cnn_kernel_size,
rnn_dim_1,
rnn_dim_2,
rnn_dups,
rnn_layers,
dropout,
n_epochs,
lr,
metric,
batch_size,
early_stop,
optimizer.lower(),
loss,
GPU,
self.use_gpu,
seed,
)
)
if self.seed is not None:
np.random.seed(self.seed)
torch.manual_seed(self.seed)
self.sandwich_model = SandwichModel(
fea_dim=self.fea_dim,
cnn_dim_1=self.cnn_dim_1,
cnn_dim_2=self.cnn_dim_2,
cnn_kernel_size=self.cnn_kernel_size,
rnn_dim_1=self.rnn_dim_1,
rnn_dim_2=self.rnn_dim_2,
rnn_dups=self.rnn_dups,
rnn_layers=self.rnn_layers,
dropout=self.dropout,
device=self.device,
)
if optimizer.lower() == "adam":
self.train_optimizer = optim.Adam(self.sandwich_model.parameters(), lr=self.lr)
elif optimizer.lower() == "gd":
self.train_optimizer = optim.SGD(self.sandwich_model.parameters(), lr=self.lr)
else:
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
self.fitted = False
self.sandwich_model.to(self.device)
@property
def use_gpu(self):
return self.device != torch.device("cpu")
def mse(self, pred, label):
loss = (pred - label) ** 2
return torch.mean(loss)
def loss_fn(self, pred, label):
mask = ~torch.isnan(label)
if self.loss == "mse":
return self.mse(pred[mask], label[mask])
raise ValueError("unknown loss `%s`" % self.loss)
def metric_fn(self, pred, label):
mask = torch.isfinite(label)
if self.metric in ("", "loss"):
return -self.loss_fn(pred[mask], label[mask])
raise ValueError("unknown metric `%s`" % self.metric)
def train_epoch(self, x_train, y_train):
x_train_values = x_train.values
y_train_values = np.squeeze(y_train.values)
self.sandwich_model.train()
indices = np.arange(len(x_train_values))
np.random.shuffle(indices)
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.sandwich_model(feature)
loss = self.loss_fn(pred, label)
self.train_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_value_(self.sandwich_model.parameters(), 3.0)
self.train_optimizer.step()
def test_epoch(self, data_x, data_y):
# prepare training data
x_values = data_x.values
y_values = np.squeeze(data_y.values)
self.sandwich_model.eval()
scores = []
losses = []
indices = np.arange(len(x_values))
for i in range(len(indices))[:: self.batch_size]:
if len(indices) - i < self.batch_size:
break
feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float().to(self.device)
label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float().to(self.device)
pred = self.sandwich_model(feature)
loss = self.loss_fn(pred, label)
losses.append(loss.item())
score = self.metric_fn(pred, label)
scores.append(score.item())
return np.mean(losses), np.mean(scores)
def fit(
self,
dataset: DatasetH,
evals_result=dict(),
save_path=None,
):
df_train, df_valid, df_test = dataset.prepare(
["train", "valid", "test"],
col_set=["feature", "label"],
data_key=DataHandlerLP.DK_L,
)
if df_train.empty or df_valid.empty:
raise ValueError("Empty data from dataset, please check your dataset config.")
x_train, y_train = df_train["feature"], df_train["label"]
x_valid, y_valid = df_valid["feature"], df_valid["label"]
save_path = get_or_create_path(save_path)
stop_steps = 0
train_loss = 0
best_score = -np.inf
best_epoch = 0
evals_result["train"] = []
evals_result["valid"] = []
# train
self.logger.info("training...")
self.fitted = True
for step in range(self.n_epochs):
self.logger.info("Epoch%d:", step)
self.logger.info("training...")
self.train_epoch(x_train, y_train)
self.logger.info("evaluating...")
train_loss, train_score = self.test_epoch(x_train, y_train)
val_loss, val_score = self.test_epoch(x_valid, y_valid)
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
evals_result["train"].append(train_score)
evals_result["valid"].append(val_score)
if val_score > best_score:
best_score = val_score
stop_steps = 0
best_epoch = step
best_param = copy.deepcopy(self.sandwich_model.state_dict())
else:
stop_steps += 1
if stop_steps >= self.early_stop:
self.logger.info("early stop")
break
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
self.sandwich_model.load_state_dict(best_param)
torch.save(best_param, save_path)
if self.use_gpu:
torch.cuda.empty_cache()
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
if not self.fitted:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
index = x_test.index
self.sandwich_model.eval()
x_values = x_test.values
sample_num = x_values.shape[0]
preds = []
for begin in range(sample_num)[:: self.batch_size]:
if sample_num - begin < self.batch_size:
end = sample_num
else:
end = begin + self.batch_size
x_batch = torch.from_numpy(x_values[begin:end]).float().to(self.device)
with torch.no_grad():
pred = self.sandwich_model(x_batch).detach().cpu().numpy()
preds.append(pred)
return pd.Series(np.concatenate(preds), index=index)

View File

@@ -168,7 +168,8 @@ class TCN(Model):
self.TCN_model.train()
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
data = torch.transpose(data, 1, 2)
feature = data[:, 0:-1, :].to(self.device)
label = data[:, -1, -1].to(self.device)
pred = self.TCN_model(feature.float())
@@ -187,8 +188,8 @@ class TCN(Model):
losses = []
for data in data_loader:
feature = data[:, :, 0:-1].to(self.device)
data = torch.transpose(data, 1, 2)
feature = data[:, 0:-1, :].to(self.device)
# feature[torch.isnan(feature)] = 0
label = data[:, -1, -1].to(self.device)

View File

@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import re
import sys
import qlib
@@ -11,13 +12,15 @@ import datetime
from tqdm import tqdm
from pathlib import Path
from loguru import logger
from cryptography.fernet import Fernet
from qlib.utils import exists_qlib_data
class GetData:
DATASET_VERSION = "v2"
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
# "?" is not included in the token.
TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
def __init__(self, delete_zip_file=False):
"""
@@ -29,24 +32,44 @@ class GetData:
"""
self.delete_zip_file = delete_zip_file
def normalize_dataset_version(self, dataset_version: str = None):
if dataset_version is None:
dataset_version = self.DATASET_VERSION
return dataset_version
def merge_remote_url(self, file_name: str):
fernet = Fernet(self.KEY)
token = fernet.decrypt(self.TOKEN).decode()
return f"{self.REMOTE_URL}/{file_name}?{token}"
def merge_remote_url(self, file_name: str, dataset_version: str = None):
return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
"""
Download the specified file to the target folder.
def _download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
):
Parameters
----------
target_dir: str
data save directory
file_name: str
dataset name, needs to endwith .zip, value from [rl_data.zip, csv_data_cn.zip, ...]
may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip
delete_old: bool
delete an existing directory, by default True
Examples
---------
# get rl data
python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token}
# get cn csv data
python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token}
-------
"""
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name)
target_path = target_dir.joinpath(_target_file_name)
url = self.merge_remote_url(file_name, dataset_version)
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
resp.raise_for_status()
if resp.status_code != 200:
@@ -56,7 +79,7 @@ class GetData:
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{file_name} downloading......")
logger.info(f"{os.path.basename(file_name)} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chunk in resp.iter_content(chunk_size=chunk_size):
@@ -67,8 +90,8 @@ class GetData:
if self.delete_zip_file:
target_path.unlink()
def check_dataset(self, file_name: str, dataset_version: str = None):
url = self.merge_remote_url(file_name, dataset_version)
def check_dataset(self, file_name: str):
url = self.merge_remote_url(file_name)
resp = requests.get(url, stream=True, timeout=60)
status = True
if resp.status_code == 404:
@@ -140,9 +163,11 @@ class GetData:
---------
# get 1d data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token}
# get 1min data
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token}
-------
"""
@@ -155,29 +180,12 @@ class GetData:
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
def _get_file_name(v):
return self.QLIB_DATA_NAME.format(
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
)
def _get_file_name_with_version(qlib_version, dataset_version):
dataset_version = "v2" if dataset_version is None else dataset_version
file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
return file_name_with_version
file_name = _get_file_name(qlib_version)
if not self.check_dataset(file_name, version):
file_name = _get_file_name("latest")
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
"""download cn csv data from remote
Parameters
----------
target_dir: str
data save directory
Examples
---------
python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
-------
"""
file_name = "csv_data_cn.zip"
self._download_data(file_name, target_dir)
file_name = _get_file_name_with_version(qlib_version, dataset_version=version)
if not self.check_dataset(file_name):
file_name = _get_file_name_with_version("latest", dataset_version=version)
self.download_data(file_name.lower(), target_dir, delete_old)

View File

@@ -80,6 +80,7 @@ REQUIRED = [
"gym",
# Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail.
"protobuf<=3.20.1;python_version<='3.8'",
"cryptography",
]
# Numpy include

View File

@@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
GetData().csv_data_cn(SOURCE_DIR)
GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS)
TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
provider_uri = str(QLIB_DIR.resolve())

View File

@@ -42,7 +42,7 @@ class TestGetData(unittest.TestCase):
self.assertFalse(df.dropna().empty, "get qlib data failed")
def test_1_csv_data(self):
GetData().csv_data_cn(SOURCE_DIR)
GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
self.assertEqual(len(stock_name), 85, "get csv data failed")