1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-29 09:01:18 +08:00

Compare commits

..

148 Commits

Author SHA1 Message Date
Young
97d354fa73 update version for releasing 2021-02-02 09:24:33 +00:00
Young
a87fb5a68c fix contrib data freq 2021-02-02 16:52:50 +08:00
Young
835b47a7e7 simplify parameters 2021-02-02 16:52:50 +08:00
Young
802dac81c9 move freq params to dataloader 2021-02-02 16:52:50 +08:00
Wendi Li
bdc70c192a Update pytorch_nn.py 2021-02-02 14:48:12 +08:00
Wendi Li
213f809148 Update pytorch_alstm_ts.py 2021-02-02 14:47:41 +08:00
Wendi Li
f3fd5e0773 Update pytorch_gats.py 2021-02-02 14:47:31 +08:00
Wendi Li
decf74cbdf Update pytorch_gru.py 2021-02-02 14:47:20 +08:00
Wendi Li
b4a92d55f8 Update pytorch_gru_ts.py 2021-02-02 14:47:00 +08:00
Wendi Li
ebc31b9bdb Update pytorch_lstm.py 2021-02-02 14:46:49 +08:00
Wendi Li
56ebe9bf36 Update pytorch_lstm_ts.py 2021-02-02 14:46:21 +08:00
Wendi Li
ddd68fc761 Update pytorch_alstm.py 2021-02-02 14:34:57 +08:00
bxdd
f50463aca9 Fix bug in alpha360 2021-02-01 18:33:51 +08:00
Jactus
c0e7cbc983 Add filter_pipe API 2021-01-29 12:47:04 +08:00
you-n-g
828993b397 Merge pull request #222 from bxdd/rl-highfreq-include-examples
Qlib Highfreq Support & Highfreq DataHanlder/Operator/Processor Examples
2021-01-29 00:08:10 +08:00
bxdd
8ef89b4fa8 update 2021-01-28 15:01:07 +00:00
bxdd
76cf9dad99 update 2021-01-28 14:30:20 +00:00
bxdd
f3eb02a0bd update docstring 2021-01-28 14:26:30 +00:00
bxdd
ffa68fd010 update 2021-01-28 14:25:55 +00:00
bxdd
f6dd006c35 update 2021-01-28 11:31:15 +00:00
you-n-g
8c29105bca Update cache.py 2021-01-27 19:52:33 +08:00
bxdd
948b829ff4 add get_data in highfreq 2021-01-27 10:34:31 +00:00
Jactus
304a0c3d7a Add paper year 2021-01-27 18:15:52 +08:00
bxdd
02dea2aeb6 update paused 2021-01-27 07:42:00 +00:00
bxdd
6fc4f2b249 fix a bug 2021-01-27 07:02:59 +00:00
bxdd
2a5f06ee9e update dataset test 2021-01-27 06:25:40 +00:00
zhupr
7f9216dc90 Fix the number of minutes on the first and last trading day of high frequency 2021-01-27 10:59:46 +08:00
zhupr
263ccdfe6f US stock code supports Windows 2021-01-27 10:59:46 +08:00
zhupr
1a8f1bfc57 support collecting yahoo 1min data 2021-01-27 10:59:46 +08:00
bxdd
9dc11a9e3c Merge github.com:microsoft/qlib into qlib_register_ops 2021-01-26 17:12:33 +00:00
bxdd
3bdd54308b update some little code 2021-01-26 17:02:30 +00:00
bxdd
1b569d371d simpson vwap 2021-01-26 14:32:08 +00:00
you-n-g
36e5c601de Merge pull request #78 from zhupr/main
Fix the error when the stock code is a number
2021-01-26 21:50:21 +08:00
zhupr
ae45711e2b Merge remote-tracking branch 'qlib/main' into save_inst 2021-01-26 19:42:59 +08:00
you-n-g
bcc47aa4cb Merge pull request #92 from bxdd/qlib_register_ops
Support Register of Custom Feature Operators Easily
2021-01-26 18:53:43 +08:00
bxdd
ee94634b23 black 2021-01-26 08:47:53 +00:00
bxdd
2016ebbbb2 update tests 2021-01-26 08:47:07 +00:00
zhupr
1eaf09cce1 version removed .dev 2021-01-26 16:29:26 +08:00
zhupr
7579f4b4c0 Merge remote-tracking branch 'qlib/main' into save_inst 2021-01-26 16:14:11 +08:00
zhupr
1a1c45981c US stock code supports Windows 2021-01-26 16:06:38 +08:00
bxdd
e4ecea55e4 fix 2021-01-26 07:41:22 +00:00
bxdd
58616fced9 black format 2021-01-26 07:33:50 +00:00
bxdd
8e9ca22b07 del some print 2021-01-26 07:33:26 +00:00
bxdd
6a145df87c fix bug 2021-01-26 07:32:06 +00:00
bxdd
06dbd02b99 black format 2021-01-25 17:59:48 +00:00
bxdd
ffedb6382f add highfreq example 2021-01-25 17:58:45 +00:00
zhupr
3f9f295a87 add register in config 2021-01-24 11:22:02 +08:00
Wendi Li
84d77f4585 Update pytorch_nn.py 2021-01-24 10:40:47 +08:00
you-n-g
afdf58b4fa Update serial.py 2021-01-24 10:36:56 +08:00
Alex Wang
2b6d16feb1 fix naming 2021-01-22 19:16:57 +08:00
Alex Wang
0a86a6f392 update format 2021-01-22 19:16:57 +08:00
Alex Wang
5da5ad4b9f tabnet 2021-01-22 19:16:57 +08:00
you-n-g
dd07810b66 Update README.md 2021-01-22 12:53:05 +08:00
bxdd
a762248d98 update test&docs 2021-01-22 01:06:32 +09:00
bxdd
80c9a47e51 Merge github.com:microsoft/qlib into qlib_register_ops 2021-01-22 00:52:30 +09:00
王雪
784e73bceb black formatting 2021-01-21 00:07:03 +08:00
王雪
5ad1b4cc33 for IDE auto-complete with global Wrapper
R, D, Cal, Inst, FeatureD, ExpressionD, DatasetD, D
2021-01-21 00:07:03 +08:00
王雪
e85646762c Update .gitignore 2021-01-20 22:12:35 +08:00
Young
fc81a39317 Add dataset standalone usage example 2021-01-20 21:14:27 +08:00
you-n-g
d44c5bb2b2 Update README.md 2021-01-20 21:14:03 +08:00
bxdd
c622d3f6f8 Update data.rst 2021-01-20 18:55:30 +08:00
bxdd
6daaa79519 add register ops config 2021-01-20 18:44:53 +09:00
zhupr
3dda2cb379 Merge remote-tracking branch 'qlib/main' into qlib_register_ops 2021-01-20 15:16:06 +08:00
zhupr
4fcfde7cfb Initialization is split into: set_config and config_based_on_C 2021-01-20 15:06:18 +08:00
bxdd
3403c00b6b Update requirements.txt
fix readthedocs cant find cmake error
2021-01-19 20:35:11 +08:00
bxdd
ecdfe49fd1 del custom ops test for check the CI status 2021-01-19 20:39:15 +09:00
bxdd
cc214a3462 black format 2021-01-19 09:14:17 +08:00
bxdd
65d8af41e7 restructure backtest 2021-01-19 09:14:17 +08:00
bxdd
0e0970f06e update backtest 2021-01-19 09:14:17 +08:00
bxdd
917261dbf6 update backtest 2021-01-19 09:14:17 +08:00
bxdd
6a9105e065 add highfreq_backtest 2021-01-19 09:14:17 +08:00
王雪
570bb272eb fix setup error
why required pymongo
2021-01-18 19:37:24 +08:00
Wendi Li
0524a47cf4 Update pytorch_lstm_ts.py 2021-01-18 12:20:40 +08:00
Wendi Li
9abc0b0d4f Update pytorch_gru_ts.py 2021-01-18 12:20:31 +08:00
Wendi Li
fe60e40927 Update pytorch_gats_ts.py 2021-01-18 12:20:20 +08:00
Wendi Li
740c297618 Update pytorch_alstm_ts.py 2021-01-18 12:20:00 +08:00
Anon-Artist
b4a088efe8 Update cli.py 2021-01-14 18:42:33 +08:00
Jactus
b34890772f Make note more clear 2021-01-13 19:19:48 +08:00
Jactus
054ffa29f6 Update readme 2021-01-13 19:19:48 +08:00
Jactus
74e08c9e37 Add deepcopy to config 2021-01-13 19:19:48 +08:00
Jactus
ea96c9e22d Update docs and support Python 3.9 2021-01-13 19:19:48 +08:00
王雪
86e7c44c6b Update initialization.rst
need line changing
2021-01-13 15:28:05 +08:00
you-n-g
64cf2e2df8 Update data.rst 2021-01-12 18:43:05 +08:00
Jactus
4361a4049a Fix create_recorder bug 2021-01-07 18:30:18 +08:00
Zhichong Fang
231f37376b Fix unrecognized config bug 2021-01-07 18:28:17 +08:00
you-n-g
328cdeda4a Update README.md 2021-01-07 11:12:49 +08:00
Zhichong Fang
4dbc8e52ec Update data.py
Fix some typo
2021-01-06 16:36:23 +08:00
Young
ba447d3448 update valute 2021-01-06 14:43:14 +08:00
zhupr
df556532d0 Fix the error when the stock code is a number 2021-01-06 11:21:33 +08:00
Wendi Li
18e040f506 Update workflow_config_gru_Alpha158.yaml
Delete a redundant parameter.
2021-01-04 17:05:21 +08:00
Wendi Li
aefc98b1d7 Update workflow_config_lstm_Alpha158.yaml
Delete a redundant parameter.
2021-01-04 17:05:13 +08:00
Jactus
46c8d791ac Fix doc bugs 2020-12-30 23:51:05 +08:00
Young
afcd91a2d0 black format 2020-12-28 12:04:03 +00:00
Young
4a30d9d1ec update github issue template 2020-12-28 12:02:01 +00:00
you-n-g
2da2e9bd9e Update README.md 2020-12-26 20:21:30 +08:00
you-n-g
3e6877ff0f Update README.md 2020-12-25 22:01:18 +08:00
zhupr
a0f32036a6 Fix the first trading day of the calendar extra in report_df 2020-12-24 11:22:48 +08:00
bxdd
d8f36df7f4 debug on macos 2020-12-23 18:28:05 +00:00
bxdd
cb3b6c5bde black format 2020-12-23 16:41:32 +00:00
bxdd
b11712fa54 fix cant find ops error on Windows 2020-12-23 16:39:17 +00:00
Jactus
660edeb94f Remove fm in recorder 2020-12-23 21:14:53 +08:00
Jactus
95de4088df Fix recorder temp dir bug 2020-12-23 21:14:53 +08:00
hadrianl
e8d7a22651 fix _adjust_size 2020-12-23 17:39:04 +08:00
hadrianl
4a62b929ad add _get_value_size and remove _limit_flag 2020-12-23 17:39:04 +08:00
hadrianl
5efe82fb56 make code cleaner 2020-12-23 17:39:04 +08:00
hadrianl
40bbafcaab black format 2020-12-23 17:39:04 +08:00
hadrianl
4c4f0f3c5e black format 2020-12-23 17:39:04 +08:00
hadrianl
ae0e0eca3d better MemCacheUnit implement 2020-12-23 17:39:04 +08:00
bxdd
7e37fa710a update alpha.rst 2020-12-21 23:31:31 +08:00
bxdd
e0c460c33c Update alpha.rst 2020-12-21 23:31:31 +08:00
bxdd
53f501ac19 del import 2020-12-21 12:44:27 +00:00
bxdd
132df027a5 update format 2020-12-21 12:09:25 +00:00
bxdd
7d97fd39ce update ops register 2020-12-21 12:06:42 +00:00
Young
995fa98fc6 add more doc to PortAnaRecord 2020-12-20 16:11:07 +08:00
Maciej Domagała
824de921d1 fixing typos #4 2020-12-19 11:59:23 +08:00
Maciej Domagała
66d9bd1a68 fixing typos #3
I just randomly find these by the way. Good work on the framework!
2020-12-18 20:16:54 +08:00
you-n-g
1c0bb2f827 Merge pull request #97 from Derek-Wds/main
Update benchmark performance
2020-12-17 17:12:40 +08:00
Maciej Domagała
ea018ed4dc fixing typos #2 2020-12-17 17:12:18 +08:00
hadrianl
f3f1867b14 fix wrong attribute 2020-12-17 15:04:07 +08:00
hadrianl
8bbfd8810c formatting 2020-12-17 15:04:07 +08:00
hadrianl
3f84c3768a Make __getattr__ to raise AttributeError instead of return it.Avoid using try except. 2020-12-17 15:04:07 +08:00
Dingsu Wang
7372a3a598 Merge branch 'main' into main 2020-12-17 14:43:21 +08:00
Jactus
4b4cd38ca6 Update benchmark results 2020-12-17 14:41:12 +08:00
you-n-g
7d40ba753a Update README.md 2020-12-17 00:35:35 +08:00
Young
9b60214e0c make info more friendly 2020-12-16 02:16:06 +00:00
Young
f7e775f941 make message more friendly 2020-12-16 02:14:38 +00:00
Young
aefbf3b5f1 update collect info 2020-12-15 13:24:29 +00:00
G_will
3f85af05e5 Refactor to Python3 style 2020-12-15 20:37:43 +08:00
Jactus
192c2dc5ef Add demo 2020-12-15 20:33:32 +08:00
Jactus
911edd7839 Add stale bot 2020-12-15 20:31:38 +08:00
Maciej Domagała
3d47dd78c8 Typo fix 2020-12-15 20:29:30 +08:00
Jactus
8f6ab0af54 Format 2020-12-14 19:23:43 +08:00
Jactus
cb0b6fcdaa Update CI and script 2020-12-14 19:23:43 +08:00
Yifan Deng (FA Talent)
6b8824dd29 Update Sign in ops.py 2020-12-14 16:55:23 +08:00
Yifan Deng
c217e7c479 Update ops.py
Fix the bug when Sign followed by True/False
2020-12-14 16:55:23 +08:00
you-n-g
ea4fe1577b Update README.md 2020-12-14 13:05:12 +08:00
you-n-g
1bab07e419 Update README.md 2020-12-13 22:45:07 +08:00
bxdd
422d1d8c93 Update README.md 2020-12-12 19:41:16 +08:00
bxdd
c8f9b1162d Update README.md 2020-12-12 19:01:00 +08:00
Young
e2bdef7ffe update version number to dev 2020-12-12 10:09:18 +00:00
Jactus
c10955d026 Update tft 2020-12-11 14:33:16 +08:00
Jactus
d642c7b6ea Update benchmark performance 2020-12-11 09:55:37 +08:00
bxdd
0cdc5e125a update docs 2020-12-10 10:08:29 +00:00
bxdd
2de812f262 update ops docs 2020-12-10 10:04:09 +00:00
bxdd
16450c2876 fix import 2020-12-10 09:54:05 +00:00
bxdd
729b57e4a7 add example script 2020-12-10 09:11:12 +00:00
bxdd
87cc52cd05 black format 2020-12-10 09:02:43 +00:00
bxdd
0be57d51be support register custom feature ops easily 2020-12-10 09:00:00 +00:00
86 changed files with 3415 additions and 1034 deletions

View File

@@ -28,7 +28,8 @@ Steps to reproduce the behavior:
## Environment
**Note**: One could run `python scripts/collect_info.py` under the `qlib` directory to get the following information.
**Note**: User could run `cd scripts && python collect_info.py all` under project directory to get system information
and paste them here directly.
- Qlib version:
- Python version:
@@ -37,4 +38,4 @@ Steps to reproduce the behavior:
## Additional Notes
<!-- Add any other information about the problem here. -->
<!-- Add any other information about the problem here. -->

62
.github/stale.yml vendored Normal file
View File

@@ -0,0 +1,62 @@
# Configuration for probot-stale - https://github.com/probot/stale
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 60
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 7
# Only issues or pull requests with all of these labels are checked for staleness. Defaults to `[]` (disabled)
onlyLabels: []
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
exemptLabels:
- bug
- pinned
- security
- "[Status] Maybe Later"
# Set to true to ignore issues in a project (defaults to false)
exemptProjects: false
# Set to true to ignore issues in a milestone (defaults to false)
exemptMilestones: false
# Set to true to ignore issues with an assignee (defaults to false)
exemptAssignees: false
# Label to use when marking as stale
staleLabel: wontfix
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when removing the stale label.
# unmarkComment: >
# Your comment here.
# Comment to post when closing a stale Issue or Pull Request.
# closeComment: >
# Your comment here.
# Limit the number of actions per hour, from 1-30. Default is 30
limitPerRun: 30
# Limit to only `issues` or `pulls`
# only: issues
# Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
# pulls:
# daysUntilStale: 30
# markComment: >
# This pull request has been automatically marked as stale because it has not had
# recent activity. It will be closed if no further activity occurs. Thank you
# for your contributions.
# issues:
# exemptLabels:
# - confirmed

View File

@@ -13,7 +13,7 @@ jobs:
strategy:
matrix:
os: [windows-latest, ubuntu-16.04, ubuntu-18.04, ubuntu-20.04, macos-latest]
python-version: [3.6, 3.7, 3.8]
python-version: [3.6, 3.7, 3.8, 3.9]
steps:
- uses: actions/checkout@v2
@@ -22,9 +22,58 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Lint with Black
run: |
cd ..
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install black
$CONDA\\python.exe -m black qlib -l 120 --check --diff
else
sudo $CONDA/bin/python -m pip install black
$CONDA/bin/python -m black qlib -l 120 --check --diff
fi
shell: bash
# Test Qlib installed with pip
- name: Install Qlib with pip
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install pyqlib --ignore-installed ruamel.yaml --user
else
sudo $CONDA/bin/python -m pip install pyqlib --ignore-installed ruamel.yaml
fi
shell: bash
- name: Install dependencies
run: |
- name: Install Lightgbm for MacOS
if: runner.os == 'macOS'
run: |
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
- name: Test data downloads
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
else
$CONDA/bin/python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
fi
shell: bash
- name: Test workflow by config (install from pip)
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
$CONDA\\python.exe -m pip uninstall -y pyqlib
else
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
sudo $CONDA/bin/python -m pip uninstall -y pyqlib
fi
shell: bash
# Test Qlib installed from source
- name: Install Qlib from source
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m pip install --upgrade cython
$CONDA\\python.exe -m pip install numpy jupyter jupyter_contrib_nbextensions
@@ -36,13 +85,7 @@ jobs:
sudo $CONDA/bin/python -m pip install -U scipy scikit-learn # installing without this line will cause errors on GitHub Actions, while installing locally won't
sudo $CONDA/bin/python setup.py install
fi
shell: bash
- name: Install Lightgbm for MacOS
if: runner.os == 'macOS'
run: |
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
shell: bash
- name: Install test dependencies
run: |
@@ -54,16 +97,6 @@ jobs:
sudo $CONDA/bin/python -m pip install black pytest
fi
shell: bash
- name: Lint with Black
run: |
cd ..
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe -m black qlib -l 120 --check --diff
else
$CONDA/bin/python -m black qlib -l 120 --check --diff
fi
shell: bash
- name: Unit tests with Pytest
run: |
@@ -73,22 +106,13 @@ jobs:
else
$CONDA/bin/python -m pytest . --durations=0
fi
shell: bash
shell: bash
- name: Test data downloads
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
else
$CONDA/bin/python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
fi
shell: bash
- name: Test workflow by config
- name: Test workflow by config (install from source)
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
else
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
fi
shell: bash
shell: bash

1
.gitignore vendored
View File

@@ -2,6 +2,7 @@
__pycache__/
*.pyc
*.pyd
*.so
*.ipynb
.ipynb_checkpoints

View File

@@ -34,6 +34,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
- [More About Qlib](#more-about-qlib)
- [Offline Mode and Online Mode](#offline-mode-and-online-mode)
- [Performance of Qlib Data Server](#performance-of-qlib-data-server)
- [Related Reports](#related-reports)
- [Contributing](#contributing)
@@ -61,11 +62,27 @@ At the module level, Qlib is a platform that consists of the above components. T
This quick start guide tries to demonstrate
1. It's very easy to build a complete Quant research workflow and try your ideas with _Qlib_.
1. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
2. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
Here is a quick **[demo](https://terminalizer.com/view/3f24561a4470)** that shows how to install ``Qlib`` and run LightGBM with ``qrun``. **But**, please make sure you have already prepared the data following the [instruction](#data-preparation).
## Installation
Users can easily install ``Qlib`` by pip according to the following command
This table demonstrates the supported Python version of `Qlib`:
| | install with pip | install from source | plot |
| ------------- |:---------------------:|:--------------------:|:----:|
| Python 3.6 | :heavy_check_mark: | :heavy_check_mark: (only with `Anaconda`) | :heavy_check_mark: |
| Python 3.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.8 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.9 | :x: | :heavy_check_mark: | :x: |
**Note**:
1. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
2. For Python 3.9, `Qlib` supports running workflows such as training models, doing backtests and plotting most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting the *model performance* is not supported for now; we will fix this when the dependent packages are upgraded in the future.
### Install with pip
Users can easily install ``Qlib`` by pip according to the following command.
```bash
pip install pyqlib
@@ -73,6 +90,7 @@ Users can easily install ``Qlib`` by pip according to the following command
**Note**: pip will install the latest stable qlib. However, the main branch of qlib is in active development. If you want to test the latest scripts or functions in the main branch, please install qlib with the methods below.
### Install from source
Also, users can install the latest dev version ``Qlib`` by the source code according to the following steps:
* Before installing ``Qlib`` from source, users need to install some dependencies:
@@ -81,7 +99,6 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
pip install numpy
pip install --upgrade cython
```
**Note**: Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
* Clone the repository and install ``Qlib`` as follows.
* If you haven't installed qlib by the command ``pip install pyqlib`` before:
@@ -94,7 +111,9 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
git clone https://github.com/microsoft/qlib.git && cd qlib
pip install .
```
**Note**: **Only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, which the command ``python setup.py install`` **can't**.
**Note**: **Only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**.
**Tips**: If you fail to install `Qlib` or run the examples in your environment, comparing your steps and the [CI workflow](.github/workflows/test.yml) may help you find the problem.
## Data Preparation
Load and prepare data by running the following code:
@@ -138,12 +157,16 @@ Users could create the same dataset with it.
## Auto Quant Research Workflow
Qlib provides a tool named `qrun` to run the whole workflow automatically (including building dataset, training models, backtest and evaluation). You can start an auto quant research workflow and have a graphical reports analysis according to the following steps:
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm.yaml)) as following.
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm_Alpha158.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml)) as follows.
```bash
cd examples # Avoid running program under the directory contains `qlib`
qrun benchmarks/LightGBM/workflow_config_lightgbm.yaml
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
The result of `qrun` is as follows, please refer to please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
If users want to use `qrun` under debug mode, please use the following command:
```bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
The result of `qrun` is as follows, please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
```bash
@@ -198,16 +221,17 @@ The automatic workflow may not suite the research workflow of all Quant research
# [Quant Model Zoo](examples/benchmarks)
Here is a list of models built on `Qlib`.
- [GBDT based on LightGBM (Guolin Ke, et al.)](qlib/contrib/model/gbdt.py)
- [GBDT based on Catboost (Liudmila Prokhorenkova, et al.)](qlib/contrib/model/catboost_model.py)
- [GBDT based on XGBoost (Tianqi Chen, et al.)](qlib/contrib/model/xgboost.py)
- [GBDT based on XGBoost (Tianqi Chen, et al. 2016)](qlib/contrib/model/xgboost.py)
- [GBDT based on LightGBM (Guolin Ke, et al. 2017)](qlib/contrib/model/gbdt.py)
- [GBDT based on Catboost (Liudmila Prokhorenkova, et al. 2017)](qlib/contrib/model/catboost_model.py)
- [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py)
- [GRU based on pytorch (Kyunghyun Cho, et al.)](qlib/contrib/model/pytorch_gru.py)
- [LSTM based on pytorcn (Sepp Hochreiter, et al.)](qlib/contrib/model/pytorch_lstm.py)
- [ALSTM based on pytorcn (Yao Qin, et al.)](qlib/contrib/model/pytorch_alstm.py)
- [GATs based on pytorch (Petar Velickovic, et al.)](qlib/contrib/model/pytorch_gats.py)
- [SFM based on pytorch (Liheng Zhang, et al.)](qlib/contrib/model/pytorch_sfm.py)
- [TFT based on tensorflow (Bryan Lim, et al.)](examples/benchmarks/TFT/tft.py)
- [LSTM based on pytorch (Sepp Hochreiter, et al. 1997)](qlib/contrib/model/pytorch_lstm.py)
- [GRU based on pytorch (Kyunghyun Cho, et al. 2014)](qlib/contrib/model/pytorch_gru.py)
- [ALSTM based on pytorch (Yao Qin, et al. 2017)](qlib/contrib/model/pytorch_alstm.py)
- [GATs based on pytorch (Petar Velickovic, et al. 2017)](qlib/contrib/model/pytorch_gats.py)
- [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
- [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
- [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
Your PR of new Quant models is highly welcomed.
@@ -288,7 +312,11 @@ Such overheads greatly slow down the data loading process.
Qlib data are stored in a compact format, which is efficient to be combined into arrays for scientific computation.
# Related Reports
- [Guide To Qlib: Microsoft's AI Investment Platform](https://analyticsindiamag.com/qlib/)
- [【华泰金工林晓明团队】微软AI量化投资平台Qlib体验——华泰人工智能系列之四十](https://mp.weixin.qq.com/s/Brcd7im4NibJOJzZfMn6tQ)
- [微软也搞AI量化平台还是开源的](https://mp.weixin.qq.com/s/47bP5YwxfTp2uTHjUBzJQQ)
- [微矿Qlib业内首个AI量化投资开源平台](https://mp.weixin.qq.com/s/vsJv7lsgjEi-ALYUz4CvtQ)
# Contributing

12
docs/_static/demo.sh vendored Normal file
View File

@@ -0,0 +1,12 @@
#!/bin/sh
git clone https://github.com/microsoft/qlib.git
cd qlib
ls
pip install pyqlib
# or
# pip install numpy
# pip install --upgrade cython
# python setup.py install
cd examples
ls
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

View File

@@ -50,57 +50,37 @@ Users can use ``Data Handler`` to build formulaic alphas `MACD` in qlib:
.. code-block:: python
>> from qlib.data.dataset.handler import QLibDataHandler
>> from qlib.data.dataset.loader import QlibDataLoader
>> MACD_EXP = '(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'
>> fields = [MACD_EXP] # MACD
>> names = ['MACD']
>> labels = ['$close'] # label
>> labels = ['Ref($close, -2)/Ref($close, -1) - 1'] # label
>> label_names = ['LABEL']
>> data_handler = QLibDataHandler(start_date='2010-01-01', end_date='2017-12-31', fields=fields, names=names, labels=labels, label_names=label_names)
>> TRAINER_CONFIG = {
.. "train_start_date": "2007-01-01",
.. "train_end_date": "2014-12-31",
.. "validate_start_date": "2015-01-01",
.. "validate_end_date": "2016-12-31",
.. "test_start_date": "2017-01-01",
.. "test_end_date": "2020-08-01",
>> data_loader_config = {
.. "feature": (fields, names),
.. "label": (labels, label_names)
.. }
>> feature_train, label_train, feature_validate, label_validate, feature_test, label_test = data_handler.get_split_data(**TRAINER_CONFIG)
>> print(feature_train, label_train)
MACD
instrument datetime
SH600000 2010-01-04 -0.008625
2010-01-05 -0.007234
2010-01-06 -0.007693
2010-01-07 -0.009633
2010-01-08 -0.009891
... ...
SZ300251 2014-12-25 0.043072
2014-12-26 0.041345
2014-12-29 0.042733
2014-12-30 0.042066
2014-12-31 0.036299
[322025 rows x 1 columns]
LABEL
instrument datetime
SH600000 2010-01-04 4.260015
2010-01-05 4.292182
2010-01-06 4.207747
2010-01-07 4.113258
2010-01-08 4.159496
... ...
SZ300251 2014-12-25 4.343212
2014-12-26 4.470587
2014-12-29 4.762474
2014-12-30 4.369748
2014-12-31 4.182222
[322025 rows x 1 columns]
>> data_loader = QlibDataLoader(config=data_loader_config)
>> df = data_loader.load(instruments='csi300', start_time='2010-01-01', end_time='2017-12-31')
>> print(df)
feature label
MACD LABEL
datetime instrument
2010-01-04 SH600000 -0.011547 -0.019672
SH600004 0.002745 -0.014721
SH600006 0.010133 0.002911
SH600008 -0.001113 0.009818
SH600009 0.025878 -0.017758
... ... ...
2017-12-29 SZ300124 0.007306 -0.005074
SZ300136 -0.013492 0.056352
SZ300144 -0.000966 0.011853
SZ300251 0.004383 0.021739
SZ300315 -0.030557 0.012455
Reference
===========
To learn more about ``Data Handler``, please refer to `Data Handler <../component/data.html>`_
To learn more about ``Data Loader``, please refer to `Data Loader <../component/data.html#data-loader>`_
To learn more about ``Data API``, please refer to `Data API <../component/data.html>`_

View File

@@ -126,17 +126,17 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` should include open, close, high, low, volume and factor at least.
- `open`
The opening price
The adjusted opening price
- `close`
The closing price
The adjusted closing price
- `high`
The highest price
The adjusted highest price
- `low`
The lowest price
The adjusted lowest price
- `volume`
The trading volume
The adjusted trading volume
- `factor`
The Restoration factor
The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
@@ -195,6 +195,7 @@ Feature
- `ExpressionOps`
`ExpressionOps` will use operator for feature construction.
To know more about ``Operator``, please refer to `Operator API <../reference/api.html#module-qlib.data.ops>`_.
Also, ``Qlib`` supports users to define their own custom ``Operator``, an example has been given in ``tests/test_register_ops.py``.
To know more about ``Feature``, please refer to `Feature API <../reference/api.html#module-qlib.data.base>`_.
@@ -295,6 +296,7 @@ The ``Processor`` module in ``Qlib`` is designed to be learnable and it is respo
- ``RobustZScoreNorm``: `processor` that applies robust z-score normalization.
- ``CSZScoreNorm``: `processor` that applies cross sectional z-score normalization.
- ``CSRankNorm``: `processor` that applies cross sectional rank normalization.
- ``CSZFillna``: `processor` that fills N/A values in a cross sectional way by the mean of the column.
Users can also create their own `processor` by inheriting the base class of ``Processor``. Please refer to the implementation of all the processors for more information (`Processor Link <https://github.com/microsoft/qlib/blob/main/qlib/data/dataset/processor.py>`_).

View File

@@ -34,8 +34,9 @@ Here is a general view of the structure of the system:
- Recorder 2
- ...
- ...
This experiment management system defines a set of interface and provided a concrete implementation based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
This experiment management system defines a set of interfaces and provides a concrete implementation ``MLflowExpManager``, which is based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
If users set the implementation of ``ExpManager`` to be ``MLflowExpManager``, they can use the command `mlflow ui` to visualize and check the experiment results. For more information, please refer to the related documents `here <https://www.mlflow.org/docs/latest/cli.html#mlflow-ui>`_.
Qlib Recorder
===================
@@ -91,7 +92,7 @@ Record Template
The ``RecordTemp`` class is a class that enables generating experiment results such as IC and backtest in a certain format. We have provided three different `Record Template` classes:
- ``SignalRecord``: This class generates the `preidction` results of the model.
- ``SignalRecord``: This class generates the `prediction` results of the model.
- ``SigAnaRecord``: This class generates the `IC`, `ICIR`, `Rank IC` and `Rank ICIR` of the model.
- ``PortAnaRecord``: This class generates the results of `backtest`. The detailed information about `backtest` as well as the available `strategy`, users can refer to `Strategy <../component/strategy.html>`_ and `Backtest <../component/backtest.html>`_.

View File

@@ -103,6 +103,12 @@ After saving the config into `configuration.yaml`, users could start the workflo
qrun configuration.yaml
If users want to use ``qrun`` under debug mode, please use the following command:
.. code-block:: bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
.. note::
`qrun` will be placed in your $PATH directory when installing ``Qlib``.

View File

@@ -226,3 +226,8 @@ epub_exclude_files = ["search.html"]
autodoc_member_order = "bysource"
autodoc_default_flags = ["members"]
autodoc_default_options = {
"members": True,
"member-order": "bysource",
"special-members": "__init__",
}

View File

@@ -1,4 +1,5 @@
Cython
cmake
numpy
scipy
scikit-learn
scikit-learn

View File

@@ -63,6 +63,7 @@ Besides `provider_uri` and `region`, `qlib.init` has other parameters. The follo
If Qlib fails to connect redis via `redis_host` and `redis_port`, cache mechanism will not be used! Please refer to `Cache <../component/data.html#cache>`_ for details.
- `exp_manager`
Type: dict, optional parameter, the setting of `experiment manager` to be used in qlib. Users can specify an experiment manager class, as well as the tracking URI for all the experiments. However, please be aware that we only support input of a dictionary in the following style for `exp_manager`. For more information about `exp_manager`, users can refer to `Recorder: Experiment Management <../component/recorder.html>`_.
.. code-block:: Python
# For example, if you want to set your tracking_uri to a <specific folder>, you can initialize qlib below

View File

@@ -1,6 +1,6 @@
# Requirements
Here is the minimal hardware requirements to run the example.
Here are the minimal hardware requirements to run the `workflow_by_code` example.
- Memory: 16G
- Free Disk: 5G

View File

@@ -64,7 +64,6 @@ task:
loss: mse
n_jobs: 20
GPU: 0
rnn_type: GRU
dataset:
class: TSDatasetH
module_path: qlib.data.dataset

View File

@@ -64,7 +64,6 @@ task:
loss: mse
n_jobs: 20
GPU: 0
rnn_type: GRU
dataset:
class: TSDatasetH
module_path: qlib.data.dataset

View File

@@ -1,32 +1,35 @@
# Benchmarks Performance
Here are the results of each benchmark model running on Qlib's `Alpha360` and `Alpha158` dataset with China's A shared-stock & CSI300 data respectively. The values of each metric are the mean and std calculated based on 10 runs.
Here are the results of each benchmark model running on Qlib's `Alpha360` and `Alpha158` dataset with China's A shared-stock & CSI300 data respectively. The values of each metric are the mean and std calculated based on 20 runs.
The numbers shown below demonstrate the performance of the entire `workflow` of each model. We will update the `workflow` as well as models in the near future for better results.
## Alpha360 dataset
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|---|---|---|---|---|---|---|---|---|
| Linear | Alpha360 | 0.0150±0.00 | 0.1049±0.00| 0.0284±0.00 | 0.1970±0.00 | -0.0655±0.00 | -0.6985±0.00| -0.2961±0.00 |
| Linear | Alpha360 | 0.0150±0.00 | 0.1049±0.00| 0.0284±0.00 | 0.1970±0.00 | -0.0659±0.00 | -0.7072±0.00| -0.2955±0.00 |
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha360 | 0.0397±0.00 | 0.2878±0.00| 0.0470±0.00 | 0.3703±0.00 | 0.0342±0.00 | 0.4092±0.00| -0.1057±0.00 |
| XGBoost (Tianqi Chen, et al.) | Alpha360 | 0.0400±0.00 | 0.3031±0.00| 0.0461±0.00 | 0.3862±0.00 | 0.0528±0.00 | 0.6307±0.00| -0.1113±0.00 |
| LightGBM (Guolin Ke, et al.) | Alpha360 | 0.0399±0.00 | 0.3075±0.00| 0.0492±0.00 | 0.4019±0.00 | 0.0323±0.00 | 0.4370±0.00| -0.0917±0.00 |
| MLP | Alpha360 | 0.0253±0.01 | 0.1954±0.05| 0.0329±0.00 | 0.2687±0.04 | 0.0161±0.01 | 0.1989±0.19| -0.1275±0.03 |
| GRU (Kyunghyun Cho, et al.) | Alpha360 | 0.0503±0.01 | 0.3946±0.06| 0.0588±0.00 | 0.4737±0.05 | 0.0799±0.02 | 1.0940±0.26| -0.0810±0.03 |
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0466±0.01 | 0.3644±0.06| 0.0555±0.00 | 0.4451±0.04 | 0.0783±0.05 | 1.0539±0.65| -0.0844±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0472±0.00 | 0.3558±0.04| 0.0577±0.00 | 0.4522±0.04 | 0.0522±0.02 | 0.7090±0.32| -0.1059±0.03 |
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0480±0.00 | 0.3555±0.02| 0.0598±0.00 | 0.4616±0.01 | 0.0857±0.03 | 1.1317±0.42| -0.0917±0.01 |
| MLP | Alpha360 | 0.0285±0.00 | 0.1981±0.02| 0.0402±0.00 | 0.2993±0.02 | 0.0073±0.02 | 0.0880±0.22| -0.1446±0.03 |
| GRU (Kyunghyun Cho, et al.) | Alpha360 | 0.0490±0.01 | 0.3787±0.05| 0.0581±0.00 | 0.4664±0.04 | 0.0726±0.02 | 0.9817±0.34| -0.0902±0.03 |
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
## Alpha158 dataset
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|---|---|---|---|---|---|---|---|---|
| Linear | Alpha158 | 0.0393±0.00 | 0.2980±0.00| 0.0475±0.00 | 0.3546±0.00 | 0.0795±0.00 | 1.0712±0.00| -0.1449±0.00 |
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0503±0.00 | 0.3586±0.00| 0.0483±0.00 | 0.3667±0.00 | 0.1080±0.00 | 1.1567±0.00| -0.0787±0.00 |
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0503±0.00 | 0.3586±0.00| 0.0483±0.00 | 0.3667±0.00 | 0.1080±0.00 | 1.1561±0.00| -0.0787±0.00 |
| XGBoost (Tianqi Chen, et al.) | Alpha158 | 0.0481±0.00 | 0.3659±0.00| 0.0495±0.00 | 0.4033±0.00 | 0.1111±0.00 | 1.2915±0.00| -0.0893±0.00 |
| LightGBM (Guolin Ke, et al.) | Alpha158 | 0.0475±0.00 | 0.3979±0.00| 0.0485±0.00 | 0.4123±0.00 | 0.1143±0.00 | 1.2744±0.00| -0.0800±0.00 |
| MLP | Alpha158 | 0.0363±0.00 | 0.2770±0.02| 0.0421±0.00 | 0.3167±0.01 | 0.0856±0.01 | 1.0397±0.12| -0.1134±0.01 |
| TFT (Bryan Lim, et al.) | Alpha158 (with selected 20 features) | 0.0344±0.00 | 0.2071±0.02| 0.0103±0.00 | 0.0632±0.01 | 0.0638±0.00 | 0.5845±0.08| -0.1754±0.02 |
| GRU (Kyunghyun Cho, et al.) | Alpha158 (with selected 20 features) | 0.0302±0.00 | 0.2353±0.03| 0.0411±0.00 | 0.3309±0.03 | 0.0302±0.02 | 0.4353±0.28| -0.1140±0.02 |
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0359±0.01 | 0.2774±0.06| 0.0448±0.01 | 0.3597±0.05 | 0.0402±0.03 | 0.5743±0.41| -0.1152±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0329±0.01 | 0.2465±0.07| 0.0450±0.01 | 0.3485±0.06 | 0.0288±0.04 | 0.4163±0.50| -0.1269±0.04 |
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2526±0.01| 0.0454±0.00 | 0.3531±0.01 | 0.0561±0.01 | 0.7992±0.19| -0.0751±0.02 |
| MLP | Alpha158 | 0.0358±0.00 | 0.2738±0.03| 0.0425±0.00 | 0.3221±0.01 | 0.0836±0.02 | 1.0323±0.25| -0.1127±0.02 |
| TabNet with pretrain (Sercan O. Arik, et al.) | Alpha158 | 0.0344±0.00 | 0.205±0.11| 0.0398±0.00 | 0.3479±0.01 | 0.0827±0.02 | 1.1141±0.32| -0.0925±0.02 |
| TFT (Bryan Lim, et al.) | Alpha158 (with selected 20 features) | 0.0343±0.00 | 0.2071±0.02| 0.0107±0.00 | 0.0660±0.02 | 0.0623±0.02 | 0.5818±0.20| -0.1762±0.01 |
| GRU (Kyunghyun Cho, et al.) | Alpha158 (with selected 20 features) | 0.0311±0.00 | 0.2418±0.04| 0.0425±0.00 | 0.3434±0.02 | 0.0330±0.02 | 0.4805±0.30| -0.1021±0.02 |
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 |
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 |
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 |
- The selected 20 features are based on the feature importance of a lightgbm-based model.

View File

@@ -25,7 +25,7 @@ import os
import data_formatters.qlib_Alpha158
class ExperimentConfig(object):
class ExperimentConfig:
"""Defines experiment configs and paths to outputs.
Attributes:

View File

@@ -320,7 +320,7 @@ class InterpretableMultiHeadAttention:
return outputs, attn
class TFTDataCache(object):
class TFTDataCache:
"""Caches data for the TFT."""
_data_cache = {}
@@ -348,7 +348,7 @@ class TFTDataCache(object):
# TFT model definitions.
class TemporalFusionTransformer(object):
class TemporalFusionTransformer:
"""Defines Temporal Fusion Transformer.
Attributes:
@@ -972,7 +972,7 @@ class TemporalFusionTransformer(object):
valid_quantiles = self.quantiles
output_size = self.output_size
class QuantileLossCalculator(object):
class QuantileLossCalculator:
"""Computes the combined quantile loss for prespecified quantiles.
Attributes:

Binary file not shown.

View File

@@ -0,0 +1,4 @@
pandas==1.1.2
numpy==1.17.4
scikit_learn==0.23.2
torch==1.7.0

View File

@@ -0,0 +1,74 @@
qlib_init:
provider_uri: "~/.qlib/qlib_data/cn_data"
region: cn
market: &market csi300
benchmark: &benchmark SH000300
data_handler_config: &data_handler_config
start_time: 2008-01-01
end_time: 2020-08-01
fit_start_time: 2008-01-01
fit_end_time: 2014-12-31
instruments: *market
infer_processors:
- class: RobustZScoreNorm
kwargs:
fields_group: feature
clip_outlier: true
- class: Fillna
kwargs:
fields_group: feature
learn_processors:
- class: DropnaLabel
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
module_path: qlib.contrib.strategy.strategy
kwargs:
topk: 50
n_drop: 5
backtest:
verbose: False
limit_threshold: 0.095
account: 100000000
benchmark: *benchmark
deal_price: close
open_cost: 0.0005
close_cost: 0.0015
min_cost: 5
task:
model:
class: TabnetModel
module_path: qlib.contrib.model.pytorch_tabnet
kwargs:
pretrain: True
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
handler:
class: Alpha158
module_path: qlib.contrib.data.handler
kwargs: *data_handler_config
segments:
pretrain: [2008-01-01, 2014-12-31]
pretrain_validation: [2015-01-01, 2020-08-01]
train: [2008-01-01, 2014-12-31]
valid: [2015-01-01, 2016-12-31]
test: [2017-01-01, 2020-08-01]
record:
- class: SignalRecord
module_path: qlib.workflow.record_temp
kwargs: {}
- class: SigAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
ana_long_short: False
ann_scaler: 252
- class: PortAnaRecord
module_path: qlib.workflow.record_temp
kwargs:
config: *port_analysis_config

View File

View File

@@ -0,0 +1,172 @@
from qlib.data.dataset.handler import DataHandler, DataHandlerLP
from qlib.data.dataset.processor import Processor
from qlib.utils import get_cls_kwargs
from qlib.log import TimeInspector
class HighFreqHandler(DataHandlerLP):
    """Learn/infer data handler that builds 1-minute price and volume features.

    The feature expressions are produced by :meth:`get_feature_config` and
    loaded through a ``QlibDataLoader`` configured with ``freq="1min"``.
    """

    def __init__(
        self,
        instruments="csi300",
        start_time=None,
        end_time=None,
        infer_processors=None,
        learn_processors=None,
        fit_start_time=None,
        fit_end_time=None,
        drop_raw=True,
    ):
        """Build the data loader config and initialise the parent handler.

        Parameters
        ----------
        instruments : str
            Instrument universe forwarded to the parent handler.
        start_time, end_time
            Time range forwarded to the parent handler.
        infer_processors, learn_processors : list of dict, optional
            Processor configs. ``None`` (the default) means "no processors".
            ``fit_start_time`` / ``fit_end_time`` are injected into the
            ``kwargs`` entry of every config.
        fit_start_time, fit_end_time
            Fit window handed to every processor.
        drop_raw : bool
            Forwarded to the parent handler.
        """

        def check_transform_proc(proc_l):
            """Return copies of the processor configs with the fit window injected.

            The incoming dicts are not modified, so a config shared by several
            handlers (e.g. a class-level constant) is left untouched, and a
            config without a ``"kwargs"`` entry is accepted.
            """
            new_l = []
            for p in proc_l or []:
                new_p = dict(p)
                new_p["kwargs"] = {
                    **(p.get("kwargs") or {}),
                    "fit_start_time": fit_start_time,
                    "fit_end_time": fit_end_time,
                }
                new_l.append(new_p)
            return new_l

        infer_processors = check_transform_proc(infer_processors)
        learn_processors = check_transform_proc(learn_processors)

        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": self.get_feature_config(),
                "swap_level": False,
                "freq": "1min",
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            drop_raw=drop_raw,
        )

    def get_feature_config(self):
        """Return ``(fields, names)``: the feature expressions and their column names.

        The columns are, in order: five normalised prices, the same five prices
        shifted by 240 bars, two volume features, and a ``date`` column.
        """
        fields = []
        names = []

        template_if = "If(IsNull({1}), {0}, {1})"
        template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
        template_fillnan = "BFillNan(FFillNan({0}))"
        # Because there is no vwap field in the yahoo data, a method similar to Simpson integration is used to approximate vwap
        simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"

        def get_normalized_price_feature(price_field, shift=0):
            """Get normalized price feature ops"""
            if shift == 0:
                template_norm = "{0}/Ref(DayLast({1}), 240)"
            else:
                template_norm = "Ref({0}, " + str(shift) + ")/Ref(DayLast({1}), 240)"

            # Numerator: the price itself, falling back to the filled close when
            # the price is null. Denominator: DayLast of the filled close, Ref'd by 240.
            feature_ops = template_norm.format(
                template_if.format(
                    template_fillnan.format(template_paused.format("$close")),
                    template_paused.format(price_field),
                ),
                template_fillnan.format(template_paused.format("$close")),
            )
            return feature_ops

        fields += [get_normalized_price_feature("$open", 0)]
        fields += [get_normalized_price_feature("$high", 0)]
        fields += [get_normalized_price_feature("$low", 0)]
        fields += [get_normalized_price_feature("$close", 0)]
        fields += [get_normalized_price_feature(simpson_vwap, 0)]
        names += ["$open", "$high", "$low", "$close", "$vwap"]

        fields += [get_normalized_price_feature("$open", 240)]
        fields += [get_normalized_price_feature("$high", 240)]
        fields += [get_normalized_price_feature("$low", 240)]
        fields += [get_normalized_price_feature("$close", 240)]
        fields += [get_normalized_price_feature(simpson_vwap, 240)]
        names += ["$open_1", "$high_1", "$low_1", "$close_1", "$vwap_1"]

        # Volume is zeroed when it is null or when the approximated vwap lies
        # outside [0.999 * low, 1.001 * high]. Both volume features share this
        # cleaned expression.
        cleaned_volume = "If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
            template_paused.format("$volume"),
            template_paused.format(simpson_vwap),
            template_paused.format("$low"),
            template_paused.format("$high"),
        )

        fields += ["{0}/Ref(DayLast(Mean({0}, 7200)), 240)".format(cleaned_volume)]
        names += ["$volume"]
        fields += ["Ref({0}, 240)/Ref(DayLast(Mean({0}, 7200)), 240)".format(cleaned_volume)]
        names += ["$volume_1"]

        fields += [template_paused.format("Date($close)")]
        names += ["date"]

        return fields, names
class HighFreqBacktestHandler(DataHandler):
    """Data handler exposing 1-minute close, approximated vwap and volume columns."""

    def __init__(
        self,
        instruments="csi300",
        start_time=None,
        end_time=None,
    ):
        """Configure a ``QlibDataLoader`` at ``freq="1min"`` and initialise the parent."""
        loader_kwargs = {
            "config": self.get_feature_config(),
            "swap_level": False,
            "freq": "1min",
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader={"class": "QlibDataLoader", "kwargs": loader_kwargs},
        )

    def get_feature_config(self):
        """Return ``(fields, names)`` for the ``$close0``, ``$vwap0`` and ``$volume0`` columns."""
        template_if = "If(IsNull({1}), {0}, {1})"
        template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
        template_fillnan = "BFillNan(FFillNan({0}))"
        # Because there is no vwap field in the yahoo data, a method similar to Simpson integration is used to approximate vwap
        simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"

        # Close, forward- then backward-filled.
        filled_close = template_fillnan.format(template_paused.format("$close"))

        # Approximated vwap, falling back to the filled close where it is null.
        vwap_or_close = template_if.format(
            template_fillnan.format(template_paused.format("$close")),
            template_paused.format(simpson_vwap),
        )

        # Volume, zeroed when null or when the vwap is outside [0.999*low, 1.001*high].
        cleaned_volume = "If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
            template_paused.format("$volume"),
            template_paused.format(simpson_vwap),
            template_paused.format("$low"),
            template_paused.format("$high"),
        )

        fields = [filled_close, vwap_or_close, cleaned_volume]
        names = ["$close0", "$vwap0", "$volume0"]
        return fields, names

View File

@@ -0,0 +1,56 @@
import numpy as np
import pandas as pd
import importlib
from qlib.data.ops import ElemOperator, PairOperator
from qlib.config import C
from qlib.data.cache import H
from qlib.data.data import Cal
def get_calendar_day(freq="day", future=False):
    """Return an array holding the ``date()`` of every entry of the ``freq`` calendar.

    The array is memoised in ``H["c"]`` under a key built from ``freq`` and
    ``future``; a cached array is returned as-is.
    """
    cache_key = f"{freq}_future_{future}_day"
    if cache_key in H["c"]:
        return H["c"][cache_key]

    day_calendar = np.array([stamp.date() for stamp in Cal.load_calendar(freq, future)])
    H["c"][cache_key] = day_calendar
    return day_calendar
class DayLast(ElemOperator):
    """Operator that replaces every value by the last value of its calendar day."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and broadcast each day's last value over that day."""
        day_calendar = get_calendar_day(freq=freq)
        values = self.feature.load(instrument, start_index, end_index, freq)
        # The series index is used to look up the calendar day of each row.
        day_of_row = day_calendar[values.index]
        return values.groupby(day_of_row).transform("last")
class FFillNan(ElemOperator):
    """Operator that forward-fills missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and return it with NaNs forward-filled."""
        series = self.feature.load(instrument, start_index, end_index, freq)
        # ``Series.ffill()`` is the non-deprecated spelling of
        # ``fillna(method="ffill")`` and returns the same result.
        return series.ffill()
class BFillNan(ElemOperator):
    """Operator that backward-fills missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and return it with NaNs backward-filled."""
        series = self.feature.load(instrument, start_index, end_index, freq)
        # ``Series.bfill()`` is the non-deprecated spelling of
        # ``fillna(method="bfill")`` and returns the same result.
        return series.bfill()
class Date(ElemOperator):
    """Operator that yields the calendar day of every row of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Return a series of calendar days aligned with the wrapped feature's index."""
        day_calendar = get_calendar_day(freq=freq)
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        row_index = loaded.index
        return pd.Series(day_calendar[row_index], index=row_index)
class Select(PairOperator):
    """Operator that keeps the rows of the right feature selected by the left feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load both operands and index the right one with the left one via ``.loc``."""
        condition = self.feature_left.load(instrument, start_index, end_index, freq)
        candidate = self.feature_right.load(instrument, start_index, end_index, freq)
        selected = candidate.loc[condition]
        return selected
class IsNull(ElemOperator):
    """Operator that flags the missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Return the result of ``isnull()`` on the loaded feature."""
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        null_mask = loaded.isnull()
        return null_mask

View File

@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from qlib.data.dataset.processor import Processor
from qlib.data.dataset.utils import fetch_df_by_index
class HighFreqNorm(Processor):
    """Robust normalisation of high-frequency price and volume features.

    ``fit`` estimates, for the "price" and "volume" column groups, a median and
    a median-absolute-deviation based scale (plus the extreme standardised
    values) on the fit window. ``__call__`` standardises with those statistics,
    compresses the tails into [-3.5, 3.5] and reshapes the values into wide rows.

    NOTE(review): columns 0-9 are treated as prices and columns 10-11 as
    volumes; this presumably mirrors the column order produced by the
    high-frequency handler -- confirm against the handler in use.
    """

    def __init__(self, fit_start_time, fit_end_time):
        # Bounds of the datetime slice that ``fit`` takes its statistics from.
        self.fit_start_time = fit_start_time
        self.fit_end_time = fit_end_time

    def fit(self, df_features):
        """Compute the per-group statistics on the fit window of ``df_features``."""
        # Restrict the data to the fit window on the "datetime" index level.
        fetch_df = fetch_df_by_index(df_features, slice(self.fit_start_time, self.fit_end_time), level="datetime")
        # Drop the local reference to the full frame; only the slice is needed below.
        del df_features
        df_values = fetch_df.values
        # Column ranges of the two feature groups.
        names = {
            "price": slice(0, 10),
            "volume": slice(10, 12),
        }
        self.feature_med = {}
        self.feature_std = {}
        self.feature_vmax = {}
        self.feature_vmin = {}
        for name, name_val in names.items():
            part_values = df_values[:, name_val].astype(np.float32)
            if name == "volume":
                # Volumes are normalised in log1p space.
                part_values = np.log1p(part_values)
            # One scalar median over the whole group (all columns together), NaNs ignored.
            self.feature_med[name] = np.nanmedian(part_values)
            part_values = part_values - self.feature_med[name]
            # Median absolute deviation scaled by 1.4826; 1e-12 keeps the scale non-zero.
            self.feature_std[name] = np.nanmedian(np.absolute(part_values)) * 1.4826 + 1e-12
            part_values = part_values / self.feature_std[name]
            # Extremes of the standardised values; used by __call__ to rescale the tails.
            self.feature_vmax[name] = np.nanmax(part_values)
            self.feature_vmin[name] = np.nanmin(part_values)

    def __call__(self, df_features):
        """Normalise ``df_features`` and return the reshaped feature frame.

        The "date" column of ``df_features`` is moved into its index in place,
        so the caller's frame is modified.
        """
        df_features.set_index("date", append=True, drop=True, inplace=True)
        df_values = df_features.values
        # Same column ranges as in ``fit``.
        names = {
            "price": slice(0, 10),
            "volume": slice(10, 12),
        }

        for name, name_val in names.items():
            if name == "volume":
                df_values[:, name_val] = np.log1p(df_values[:, name_val])
            df_values[:, name_val] -= self.feature_med[name]
            df_values[:, name_val] /= self.feature_std[name]
            # All four masks are taken BEFORE any tail value is rewritten, so
            # slice1/slice3 refer to the original standardised values.
            slice0 = df_values[:, name_val] > 3.0
            slice1 = df_values[:, name_val] > 3.5
            slice2 = df_values[:, name_val] < -3.0
            slice3 = df_values[:, name_val] < -3.5

            # Upper tail: map (3.0, vmax] linearly onto (3.0, 3.5] ...
            df_values[:, name_val][slice0] = (
                3.0 + (df_values[:, name_val][slice0] - 3.0) / (self.feature_vmax[name] - 3) * 0.5
            )
            # ... then pin everything that was originally above 3.5 to exactly 3.5.
            df_values[:, name_val][slice1] = 3.5
            # Lower tail: the mirror image, mapping [vmin, -3.0) onto [-3.5, -3.0) ...
            df_values[:, name_val][slice2] = (
                -3.0 - (df_values[:, name_val][slice2] + 3.0) / (self.feature_vmin[name] + 3) * 0.5
            )
            # ... then pin everything that was originally below -3.5 to exactly -3.5.
            df_values[:, name_val][slice3] = -3.5
        # One output row per distinct remaining index entry once "datetime" is dropped;
        # the remaining two levels are renamed to ("instrument", "datetime").
        idx = df_features.index.droplevel("datetime").drop_duplicates()
        idx.set_names(["instrument", "datetime"], inplace=True)

        # Reshape is specifically for adapting to RL high-freq executor
        # NOTE(review): 240 is presumably the number of 1-minute bars per
        # trading day, with rows ordered so 240 consecutive rows form one day -- confirm.
        feat = df_values[:, [0, 1, 2, 3, 4, 10]].reshape(-1, 6 * 240)
        feat_1 = df_values[:, [5, 6, 7, 8, 9, 11]].reshape(-1, 6 * 240)
        df_new_features = pd.DataFrame(
            data=np.concatenate((feat, feat_1), axis=1),
            index=idx,
            columns=["FEATURE_%d" % i for i in range(12 * 240)],
        ).sort_index()

        return df_new_features

View File

@@ -0,0 +1,166 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import fire
from pathlib import Path
import qlib
import pickle
import numpy as np
import pandas as pd
from qlib.config import HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import init_instance_by_config, exists_qlib_data
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.ops import Operators
from qlib.data.data import Cal
from qlib.tests.data import GetData
from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull
class HighfreqWorkflow:
    """Example workflow that builds high-frequency (1-minute) datasets with qlib.

    Intended to be driven from the command line through ``fire``; the public
    commands are :meth:`get_data` and :meth:`dump_and_load_dataset`.
    """

    # Extra ``qlib.init`` settings: register the custom operators used by the
    # high-frequency handlers and switch the expression cache off.
    SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}

    MARKET = "all"
    BENCHMARK = "SH000300"

    start_time = "2020-09-14 00:00:00"
    end_time = "2021-01-18 16:00:00"
    train_end_time = "2020-11-30 16:00:00"
    test_start_time = "2020-12-01 00:00:00"

    # NOTE(review): both handler configs carry a "freq" key. Confirm that the
    # handler classes named in ``task`` accept a ``freq`` keyword argument.
    DATA_HANDLER_CONFIG0 = {
        "start_time": start_time,
        "end_time": end_time,
        "freq": "1min",
        "fit_start_time": start_time,
        "fit_end_time": train_end_time,
        "instruments": MARKET,
        "infer_processors": [{"class": "HighFreqNorm", "module_path": "highfreq_processor", "kwargs": {}}],
    }
    DATA_HANDLER_CONFIG1 = {
        "start_time": start_time,
        "end_time": end_time,
        "freq": "1min",
        "instruments": MARKET,
    }

    task = {
        "dataset": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqHandler",
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG0,
                },
                "segments": {
                    "train": (start_time, train_end_time),
                    "test": (
                        test_start_time,
                        end_time,
                    ),
                },
            },
        },
        "dataset_backtest": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqBacktestHandler",
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG1,
                },
                "segments": {
                    "train": (start_time, train_end_time),
                    "test": (
                        test_start_time,
                        end_time,
                    ),
                },
            },
        },
    }

    def _init_qlib(self):
        """initialize qlib"""
        # ``REG_CN`` is not imported at module level (only ``HIGH_FREQ_CONFIG``
        # is), so it is imported here; otherwise the download branch below
        # would raise ``NameError``.
        from qlib.config import REG_CN

        # use yahoo_cn_1min data
        QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG, **self.SPEC_CONF}
        provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
        if not exists_qlib_data(provider_uri):
            print(f"Qlib data is not found in {provider_uri}")
            GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)
        qlib.init(**QLIB_INIT_CONFIG)

    def _prepare_calender_cache(self):
        """preload the calendar for cache"""
        # This code relies on the copy-on-write feature of Linux to avoid calculating the calendar multiple times in the subprocess
        # It may speed things up, but may not be useful on Windows and macOS
        Cal.calendar(freq="1min")
        get_calendar_day(freq="1min")

    def get_data(self):
        """use dataset to get highfreq data"""
        self._init_qlib()
        self._prepare_calender_cache()

        dataset = init_instance_by_config(self.task["dataset"])
        xtrain, xtest = dataset.prepare(["train", "test"])
        print(xtrain, xtest)

        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
        print(backtest_train, backtest_test)

        del xtrain, xtest
        del backtest_train, backtest_test

    def dump_and_load_dataset(self):
        """dump and load dataset state on disk"""
        self._init_qlib()
        self._prepare_calender_cache()
        dataset = init_instance_by_config(self.task["dataset"])
        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

        ##=============dump dataset=============
        dataset.to_pickle(path="dataset.pkl")
        dataset_backtest.to_pickle(path="dataset_backtest.pkl")

        del dataset, dataset_backtest
        ##=============reload dataset=============
        # NOTE: these pickles were written by this method a few lines above;
        # never unpickle files from an untrusted source.
        with open("dataset.pkl", "rb") as file_dataset:
            dataset = pickle.load(file_dataset)

        with open("dataset_backtest.pkl", "rb") as file_dataset_backtest:
            dataset_backtest = pickle.load(file_dataset_backtest)

        self._prepare_calender_cache()
        ##=============re-initialize dataset=============
        dataset.init(init_type=DataHandlerLP.IT_LS)
        dataset_backtest.init()

        ##=============get data=============
        xtrain, xtest = dataset.prepare(["train", "test"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])

        print(xtrain, xtest)
        print(backtest_train, backtest_test)
        del xtrain, xtest
        del backtest_train, backtest_test
if __name__ == "__main__":
    # Expose the workflow class on the command line through ``fire``.
    fire.Fire(HighfreqWorkflow)

View File

@@ -69,9 +69,9 @@ def handler(signum, frame):
os.system("kill -9 %d" % os.getpid())
signal.signal(signal.SIGTSTP, handler)
signal.signal(signal.SIGINT, handler)
# function to calculate the mean and std of a list in the results dictionary
def cal_mean_std(results) -> dict:
mean_std = dict()

View File

@@ -17,7 +17,7 @@ from qlib.contrib.evaluate import (
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.tests.data import GetData
if __name__ == "__main__":
@@ -25,9 +25,6 @@ if __name__ == "__main__":
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)
@@ -98,6 +95,7 @@ if __name__ == "__main__":
"open_cost": 0.0005,
"close_cost": 0.0015,
"min_cost": 5,
"return_order": True,
},
}
@@ -105,6 +103,11 @@ if __name__ == "__main__":
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
# NOTE: This line is optional
# It demonstrates that the dataset can be used standalone.
example_df = dataset.prepare("train")
print(example_df.head())
# start exp
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))

View File

@@ -2,92 +2,49 @@
# Licensed under the MIT License.
__version__ = "0.6.1"
__version__ = "0.6.2"
import os
import re
import sys
import copy
import yaml
import logging
import platform
import subprocess
from pathlib import Path
from .utils import can_use_cache, init_instance_by_config, get_module_by_module_path
from .workflow.utils import experiment_exit_handler
# init qlib
def init(default_conf="client", **kwargs):
from .config import C, REG_CN, REG_US, QlibConfig
from .data.data import register_all_wrappers
from .log import get_module_logger, set_log_with_config
from .config import C
from .log import get_module_logger
from .data.cache import H
from .workflow import R, QlibRecorder
C.reset()
H.clear()
_logging_config = C.logging_config
if "logging_config" in kwargs:
_logging_config = kwargs["logging_config"]
# set global config
if _logging_config:
set_log_with_config(_logging_config)
# FIXME: this logger ignored the level in config
LOG = get_module_logger("Initialization", level=logging.INFO)
LOG.info(f"default_conf: {default_conf}.")
logger = get_module_logger("Initialization", level=logging.INFO)
C.set_mode(default_conf)
C.set_region(kwargs.get("region", C["region"] if "region" in C else REG_CN))
for k, v in kwargs.items():
C[k] = v
if k not in C:
LOG.warning("Unrecognized config %s" % k)
C.resolve_path()
if not (C["expression_cache"] is None and C["dataset_cache"] is None):
# check redis
if not can_use_cache():
LOG.warning(
f"redis connection failed(host={C['redis_host']} port={C['redis_port']}), cache will not be used!"
)
C["expression_cache"] = None
C["dataset_cache"] = None
C.set(default_conf, **kwargs)
# check path if server/local
if C.get_uri_type() == QlibConfig.LOCAL_URI:
if C.get_uri_type() == C.LOCAL_URI:
if not os.path.exists(C["provider_uri"]):
if C["auto_mount"]:
LOG.error(
logger.error(
f"Invalid provider uri: {C['provider_uri']}, please check if a valid provider uri has been set. This path does not exist."
)
else:
LOG.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
elif C.get_uri_type() == QlibConfig.NFS_URI:
logger.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
elif C.get_uri_type() == C.NFS_URI:
_mount_nfs_uri(C)
else:
raise NotImplementedError(f"This type of URI is not supported")
LOG.info("qlib successfully initialized based on %s settings." % default_conf)
register_all_wrappers()
LOG.info(f"data_path={C.get_data_path()}")
C.register()
if "flask_server" in C:
LOG.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
# set up QlibRecorder
exp_manager = init_instance_by_config(C["exp_manager"])
qr = QlibRecorder(exp_manager)
R.register(qr)
# clean up experiment when python program ends
experiment_exit_handler()
logger.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
logger.info("qlib successfully initialized based on %s settings." % default_conf)
logger.info(f"data_path={C.get_data_path()}")
def _mount_nfs_uri(C):

View File

@@ -11,26 +11,27 @@ Two modes are supported
"""
import copy
from pathlib import Path
import re
import os
import re
import copy
import logging
import multiprocessing
from pathlib import Path
class Config:
def __init__(self, default_conf):
self.__dict__["_default_config"] = default_conf # avoiding conflictions with __getattr__
self.__dict__["_default_config"] = copy.deepcopy(default_conf) # avoiding conflictions with __getattr__
self.reset()
def __getitem__(self, key):
return self.__dict__["_config"][key]
def __getattr__(self, attr):
try:
if attr in self.__dict__["_config"]:
return self.__dict__["_config"][attr]
except KeyError:
return AttributeError(f"No such {attr} in self._config")
raise AttributeError(f"No such {attr} in self._config")
def __setitem__(self, key, value):
self.__dict__["_config"][key] = value
@@ -59,6 +60,9 @@ class Config:
def update(self, *args, **kwargs):
self.__dict__["_config"].update(*args, **kwargs)
def set_conf_from_C(self, config_c):
    """Update this config with every entry of ``config_c``'s ``_config`` mapping."""
    self.update(**config_c.__dict__["_config"])
# REGION CONST
REG_CN = "cn"
@@ -86,7 +90,6 @@ _default_config = {
# How many tasks belong to one process. Recommend 1 for high-frequency data and None for daily data.
"maxtasksperchild": None,
"default_disk_cache": 1, # 0:skip/1:use
"disable_disk_cache": False, # disable disk cache; if High-frequency data generally disable_disk_cache=True
"mem_cache_size_limit": 500,
# memory cache expire second, only in used 'DatasetURICache' and 'client D.calendar'
# default 1 hour
@@ -184,9 +187,17 @@ MODE_CONF = {
"timeout": 100,
"logging_level": "INFO",
"region": REG_CN,
## Custom Operator
"custom_ops": [],
},
}
HIGH_FREQ_CONFIG = {
"provider_uri": "~/.qlib/qlib_data/yahoo_cn_1min",
"dataset_cache": None,
"expression_cache": "DiskExpressionCache",
"region": REG_CN,
}
_default_region_config = {
REG_CN: {
@@ -207,6 +218,10 @@ class QlibConfig(Config):
LOCAL_URI = "local"
NFS_URI = "nfs"
def __init__(self, default_conf):
    """Initialise the base config and mark this instance as not yet registered."""
    super().__init__(default_conf)
    # Set to True at the end of ``register()``.
    self._registered = False
def set_mode(self, mode):
# raise KeyError
self.update(MODE_CONF[mode])
@@ -243,6 +258,64 @@ class QlibConfig(Config):
else:
raise NotImplementedError(f"This type of uri is not supported")
def set(self, default_conf="client", **kwargs):
    """Reset the config, then apply the mode, region and keyword overrides.

    Parameters
    ----------
    default_conf : str
        Mode name handed to ``set_mode``.
    **kwargs
        Config overrides. Keys not already present in the config are still
        stored, after a warning is logged.
    """
    from .utils import set_log_with_config, get_module_logger, can_use_cache

    self.reset()

    # A ``logging_config`` keyword takes precedence over the freshly reset value.
    log_conf = kwargs.get("logging_config", self.logging_config)

    # set global config
    if log_conf:
        set_log_with_config(log_conf)

    # FIXME: this logger ignored the level in config
    logger = get_module_logger("Initialization", level=logging.INFO)
    logger.info(f"default_conf: {default_conf}.")

    self.set_mode(default_conf)
    fallback_region = self["region"] if "region" in self else REG_CN
    self.set_region(kwargs.get("region", fallback_region))

    for conf_key, conf_value in kwargs.items():
        if conf_key not in self:
            logger.warning("Unrecognized config %s" % conf_key)
        self[conf_key] = conf_value

    self.resolve_path()

    # check redis: only relevant when at least one cache is configured
    cache_requested = self["expression_cache"] is not None or self["dataset_cache"] is not None
    if cache_requested and not can_use_cache():
        logger.warning(
            f"redis connection failed(host={self['redis_host']} port={self['redis_port']}), cache will not be used!"
        )
        self["expression_cache"] = None
        self["dataset_cache"] = None
def register(self):
    """Register operators, wrappers and the experiment recorder, then mark as registered."""
    from .utils import init_instance_by_config
    from .data.ops import register_all_ops
    from .data.data import register_all_wrappers
    from .workflow import R, QlibRecorder
    from .workflow.utils import experiment_exit_handler

    register_all_ops(self)
    register_all_wrappers(self)

    # set up QlibRecorder
    manager = init_instance_by_config(self["exp_manager"])
    R.register(QlibRecorder(manager))

    # clean up experiment when python program ends
    experiment_exit_handler()

    self._registered = True
@property
def registered(self):
    """Whether `register` has completed on this config (read-only view of ``_registered``)."""
    return self._registered
# global config
C = QlibConfig(_default_config)

View File

@@ -1,9 +1,324 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# -*- coding: utf-8 -*-
from .order import Order
from .account import Account
from .position import Position
from .exchange import Exchange
from .report import Report
from .backtest import backtest as backtest_func, get_date_range
import numpy as np
import inspect
from ...utils import init_instance_by_config
from ...log import get_module_logger
from ...config import C
logger = get_module_logger("backtest caller")
def get_strategy(
    strategy=None,
    topk=50,
    margin=0.5,
    n_drop=5,
    risk_degree=0.95,
    str_type="dropout",
    adjust_dates=None,
):
    """get_strategy

    Return a strategy object in one of three ways:

    1. ``strategy`` is None: build one of the built-in top-k strategies, chosen by
       ``str_type``, from ``topk``, ``margin``, ``n_drop``, ``risk_degree`` and ``adjust_dates``.
    2. ``strategy`` is a ``dict`` or ``str``: build it with ``init_instance_by_config``.
    3. otherwise: use ``strategy`` as given.

    Parameters
    ----------
    strategy : Strategy(), dict or str
        strategy used in backtest, or a config describing it.
    topk : int (Default value: 50)
        top-N stocks to buy.
    margin : int or float(Default value: 0.5)
        - if isinstance(margin, int):
            sell_limit = margin
        - else:
            sell_limit = pred_in_a_day.count() * margin
        buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
        sell_limit should be no less than topk.
    n_drop : int
        number of stocks to be replaced in each trading date.
    risk_degree: float
        0-1, 0.95 for example, use 95% money to trade.
    str_type: 'amount', 'weight' or 'dropout'
        strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
    adjust_dates :
        forwarded unchanged to the strategy constructor (way 1 only).

    Returns
    -------
    :class: Strategy
        an initialized strategy object

    Raises
    ------
    ValueError
        if ``strategy`` is None and ``str_type`` is not one of the known types.
    TypeError
        if the resulting object is not a ``BaseStrategy``.
    """
    if strategy is None:
        # 1) create a built-in strategy from the scalar parameters
        str_cls_dict = {
            "amount": "TopkAmountStrategy",
            "weight": "TopkWeightStrategy",
            "dropout": "TopkDropoutStrategy",
        }
        # Fail early with a readable message; otherwise getattr() is called
        # with None and complains that the attribute name must be a string.
        if str_type not in str_cls_dict:
            raise ValueError(f"Unknown str_type {str_type!r}; expected one of {sorted(str_cls_dict)}")
        logger.info("Create new strategy ")
        from .. import strategy as strategy_pool

        str_cls = getattr(strategy_pool, str_cls_dict[str_type])
        strategy = str_cls(
            topk=topk,
            buffer_margin=margin,
            n_drop=n_drop,
            risk_degree=risk_degree,
            adjust_dates=adjust_dates,
        )
    elif isinstance(strategy, (dict, str)):
        # 2) create strategy with init_instance_by_config
        logger.info("Create new strategy ")
        strategy = init_instance_by_config(strategy)

    from ..strategy.strategy import BaseStrategy

    # 3) otherwise the strategy is used directly; in every case it must be a BaseStrategy
    if not isinstance(strategy, BaseStrategy):
        raise TypeError("Strategy not supported")
    return strategy
def get_exchange(
    pred,
    exchange=None,
    subscribe_fields=None,
    open_cost=0.0015,
    close_cost=0.0025,
    min_cost=5.0,
    trade_unit=None,
    limit_threshold=None,
    deal_price=None,
    extract_codes=False,
    shift=1,
):
    """get_exchange

    Return ``exchange`` if one is given, otherwise build a new ``Exchange``
    covering the dates found in ``pred``.

    Parameters
    ----------
    pred : pandas object
        predictions; its index must have "datetime" and "instrument" levels
        (only read when a new exchange is created).

    # exchange related arguments
    exchange: Exchange().
    subscribe_fields: list
        subscribe fields. ``None`` (the default) means an empty list.
    open_cost : float
        open transaction cost.
    close_cost : float
        close transaction cost.
    min_cost : float
        min transaction cost.
    trade_unit : int
        100 for China A. Defaults to ``C.trade_unit``.
    deal_price: str
        dealing price type: 'close', 'open', 'vwap'. Defaults to ``C.deal_price``.
        A leading "$" is added when missing.
    limit_threshold : float
        limit move 0.1 (10%) for example, long and short with same limit.
        Defaults to ``C.limit_threshold``.
    extract_codes: bool
        will we pass the codes extracted from the pred to the exchange.
        NOTE: This will be faster with offline qlib.
    shift : int
        passed as ``right_shift`` to ``get_date_range`` when the trade dates are
        extended past the last prediction date.

    Returns
    -------
    :class: Exchange
        an initialized Exchange object
    """
    # A fresh list per call: a `[]` default would be shared between calls.
    if subscribe_fields is None:
        subscribe_fields = []

    if trade_unit is None:
        trade_unit = C.trade_unit
    if limit_threshold is None:
        limit_threshold = C.limit_threshold
    if deal_price is None:
        deal_price = C.deal_price

    if exchange is None:
        logger.info("Create new exchange")
        # handle exception for deal_price (startswith is also safe on an empty string)
        if not deal_price.startswith("$"):
            deal_price = "$" + deal_price
        if extract_codes:
            codes = sorted(pred.index.get_level_values("instrument").unique())
        else:
            codes = "all"  # TODO: We must ensure that 'all.txt' includes all the stocks

        dates = sorted(pred.index.get_level_values("datetime").unique())
        dates = np.append(dates, get_date_range(dates[-1], left_shift=1, right_shift=shift))

        exchange = Exchange(
            trade_dates=dates,
            codes=codes,
            deal_price=deal_price,
            subscribe_fields=subscribe_fields,
            limit_threshold=limit_threshold,
            open_cost=open_cost,
            close_cost=close_cost,
            min_cost=min_cost,
            trade_unit=trade_unit,
        )
    return exchange
def get_executor(
    executor=None,
    trade_exchange=None,
    verbose=True,
):
    """get_executor

    Resolve the executor used in a backtest. Three inputs are accepted:
    ``None`` (a ``SimulatorExecutor`` is created), a ``dict``/``str`` config
    (built with ``init_instance_by_config``), or a ready executor object.

    Parameters
    ----------
    executor : BaseExecutor, dict or str
        executor used in backtest, or a config describing it.
    trade_exchange : Exchange
        exchange used in executor (only when a ``SimulatorExecutor`` is created here).
    verbose : bool
        whether to print log (only when a ``SimulatorExecutor`` is created here).

    Returns
    -------
    :class: BaseExecutor
        an initialized BaseExecutor object
    """
    if isinstance(executor, (dict, str)):
        # build the executor from its config
        logger.info("Create new executor ")
        executor = init_instance_by_config(executor)
    elif executor is None:
        # nothing supplied: fall back to the simulator executor
        logger.info("Create new executor ")
        from ..online.executor import SimulatorExecutor

        executor = SimulatorExecutor(trade_exchange=trade_exchange, verbose=verbose)

    from ..online.executor import BaseExecutor

    # whichever way it was obtained, the result must be a BaseExecutor
    if isinstance(executor, BaseExecutor):
        return executor
    raise TypeError("Executor not supported")
# This is the API for compatibility for legacy code
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, return_order=False, **kwargs):
    """This function will help you set a reasonable Exchange and provide default value for strategy

    Parameters
    ----------

    - **backtest workflow related or common arguments**

    pred : pandas.DataFrame
        predict should has <datetime, instrument> index and one `score` column.
    account : float
        init account value.
    shift : int
        whether to shift prediction by one day.
    benchmark : str
        benchmark code, default is SH000905 CSI 500.
    verbose : bool
        whether to print log. Also forwarded to the executor.
    return_order : bool
        whether to return order list

    - **strategy related arguments** (forwarded to ``get_strategy``)

    strategy : Strategy()
        strategy used in backtest.
    topk : int (Default value: 50)
        top-N stocks to buy.
    margin : int or float(Default value: 0.5)
        - if isinstance(margin, int):
            sell_limit = margin
        - else:
            sell_limit = pred_in_a_day.count() * margin
        buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
        sell_limit should be no less than topk.
    n_drop : int
        number of stocks to be replaced in each trading date.
    risk_degree: float
        0-1, 0.95 for example, use 95% money to trade.
    str_type: 'amount', 'weight' or 'dropout'
        strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.

    - **exchange related arguments** (forwarded to ``get_exchange``)

    exchange: Exchange()
        pass the exchange for speeding up.
    subscribe_fields: list
        subscribe fields.
    open_cost : float
        open transaction cost. When omitted, the default of ``get_exchange`` applies.
    close_cost : float
        close transaction cost. When omitted, the default of ``get_exchange`` applies.
    min_cost : float
        min transaction cost.
    trade_unit : int
        100 for China A.
    deal_price: str
        dealing price type: 'close', 'open', 'vwap'.
    limit_threshold : float
        limit move 0.1 (10%) for example, long and short with same limit.
    extract_codes: bool
        will we pass the codes extracted from the pred to the exchange.

        .. note:: This will be faster with offline qlib.

    - **executor related arguments**

    executor : BaseExecutor()
        executor used in backtest.

    Returns
    -------
    dict
        the dict returned by ``backtest_func``; its "positions" entry is replaced
        by ``{key: position_obj.position}`` for compatibility with the old API.
    """
    # check strategy: keep only the kwargs that get_strategy accepts
    spec = inspect.getfullargspec(get_strategy)
    str_args = {k: v for k, v in kwargs.items() if k in spec.args}
    strategy = get_strategy(**str_args)

    # init exchange: keep only the kwargs that get_exchange accepts
    spec = inspect.getfullargspec(get_exchange)
    ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
    # `shift` is a named parameter here, so it can never be in **kwargs. Forward
    # it explicitly so the exchange calendar is extended by the same number of
    # days as the trade dates built by backtest_func.
    trade_exchange = get_exchange(pred, shift=shift, **ex_args)

    # init executor:
    executor = get_executor(executor=kwargs.get("executor"), trade_exchange=trade_exchange, verbose=verbose)

    # run backtest
    report_dict = backtest_func(
        pred=pred,
        strategy=strategy,
        executor=executor,
        trade_exchange=trade_exchange,
        shift=shift,
        verbose=verbose,
        account=account,
        benchmark=benchmark,
        return_order=return_order,
    )
    # for compatibility of the old API. return the dict positions
    positions = report_dict.get("positions")
    report_dict.update({"positions": {k: p.position for k, p in positions.items()}})
    return report_dict

View File

@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
from ...utils import get_date_by_shift, get_date_range
from ..online.executor import SimulatorExecutor
from ...data import D
from .account import Account
from ...config import C
@@ -15,7 +14,7 @@ from ...data.dataset.utils import get_level_index
LOG = get_module_logger("backtest")
def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark):
def backtest(pred, strategy, executor, trade_exchange, shift, verbose, account, benchmark, return_order):
"""Parameters
----------
pred : pandas.DataFrame
@@ -69,9 +68,9 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
bench = _temp_result.groupby(level="datetime")[_temp_result.columns.tolist()[0]].mean()
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
executor = SimulatorExecutor(trade_exchange, verbose=verbose)
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
if return_order:
multi_order_list = []
# trading apart
for pred_date, trade_date in zip(predict_dates, trade_dates):
# for loop predict date and trading date
@@ -103,6 +102,8 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
)
else:
order_list = []
if return_order:
multi_order_list.append((trade_account, order_list, trade_date))
# 4. Get result after executing order list
# NOTE: The following operation will modify order.amount.
# NOTE: If it is buy and the cash is insufficient, the tradable amount will be recalculated
@@ -115,7 +116,11 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
report_df = trade_account.report.generate_report_dataframe()
report_df["bench"] = bench
positions = trade_account.get_positions()
return report_df, positions
report_dict = {"report_df": report_df, "positions": positions}
if return_order:
report_dict.update({"order_list": multi_order_list})
return report_dict
def update_account(trade_account, trade_info, trade_exchange, trade_date):

View File

@@ -49,10 +49,12 @@ class Alpha360(DataHandlerLP):
instruments="csi500",
start_time=None,
end_time=None,
freq="day",
infer_processors=_DEFAULT_INFER_PROCESSORS,
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
filter_pipe=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
@@ -65,13 +67,15 @@ class Alpha360(DataHandlerLP):
"feature": self.get_feature_config(),
"label": kwargs.get("label", self.get_label_config()),
},
"filter_pipe": filter_pipe,
"freq": freq,
},
}
super().__init__(
instruments,
start_time,
end_time,
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
learn_processors=learn_processors,
infer_processors=infer_processors,
@@ -130,11 +134,13 @@ class Alpha158(DataHandlerLP):
instruments="csi500",
start_time=None,
end_time=None,
freq="day",
infer_processors=[],
learn_processors=_DEFAULT_LEARN_PROCESSORS,
fit_start_time=None,
fit_end_time=None,
process_type=DataHandlerLP.PTYPE_A,
filter_pipe=None,
**kwargs,
):
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
@@ -143,13 +149,18 @@ class Alpha158(DataHandlerLP):
data_loader = {
"class": "QlibDataLoader",
"kwargs": {
"config": {"feature": self.get_feature_config(), "label": kwargs.get("label", self.get_label_config())},
"config": {
"feature": self.get_feature_config(),
"label": kwargs.get("label", self.get_label_config()),
},
"filter_pipe": filter_pipe,
"freq": freq,
},
}
super().__init__(
instruments,
start_time,
end_time,
instruments=instruments,
start_time=start_time,
end_time=end_time,
data_loader=data_loader,
infer_processors=infer_processors,
learn_processors=learn_processors,

View File

@@ -6,17 +6,16 @@ from __future__ import print_function
import numpy as np
import pandas as pd
import inspect
import warnings
from ..log import get_module_logger
from . import strategy as strategy_pool
from .strategy.strategy import BaseStrategy
from .backtest.exchange import Exchange
from .backtest.backtest import backtest as backtest_func, get_date_range
from .backtest import get_exchange, backtest as backtest_func
from .backtest.backtest import get_date_range
from ..data import D
from ..config import C
from ..data.dataset.utils import get_level_index
logger = get_module_logger("Evaluate")
@@ -46,144 +45,6 @@ def risk_analysis(r, N=252):
return res
def get_strategy(
    strategy=None,
    topk=50,
    margin=0.5,
    n_drop=5,
    risk_degree=0.95,
    str_type="amount",
    adjust_dates=None,
):
    """get_strategy

    Return ``strategy`` if one is given; otherwise build one of the built-in
    top-k strategies selected by ``str_type``.

    Parameters
    ----------
    strategy : Strategy()
        strategy used in backtest.
    topk : int (Default value: 50)
        top-N stocks to buy.
    margin : int or float(Default value: 0.5)
        - if isinstance(margin, int):
            sell_limit = margin
        - else:
            sell_limit = pred_in_a_day.count() * margin
        buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
        sell_limit should be no less than topk.
    n_drop : int
        number of stocks to be replaced in each trading date.
    risk_degree: float
        0-1, 0.95 for example, use 95% money to trade.
    str_type: 'amount', 'weight' or 'dropout'
        strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
    adjust_dates :
        forwarded unchanged to the strategy constructor.

    Returns
    -------
    :class: Strategy
        an initialized strategy object

    Raises
    ------
    ValueError
        if ``strategy`` is None and ``str_type`` is not one of the known types.
    TypeError
        if the resulting object is not a ``BaseStrategy``.
    """
    if strategy is None:
        str_cls_dict = {
            "amount": "TopkAmountStrategy",
            "weight": "TopkWeightStrategy",
            "dropout": "TopkDropoutStrategy",
        }
        # Fail early with a readable message; otherwise getattr() is called
        # with None and complains that the attribute name must be a string.
        if str_type not in str_cls_dict:
            raise ValueError(f"Unknown str_type {str_type!r}; expected one of {sorted(str_cls_dict)}")
        logger.info("Create new streategy ")
        str_cls = getattr(strategy_pool, str_cls_dict[str_type])
        strategy = str_cls(
            topk=topk,
            buffer_margin=margin,
            n_drop=n_drop,
            risk_degree=risk_degree,
            adjust_dates=adjust_dates,
        )
    if not isinstance(strategy, BaseStrategy):
        raise TypeError("Strategy not supported")
    return strategy
def get_exchange(
    pred,
    exchange=None,
    subscribe_fields=None,
    open_cost=0.0015,
    close_cost=0.0025,
    min_cost=5.0,
    trade_unit=None,
    limit_threshold=None,
    deal_price=None,
    extract_codes=False,
    shift=1,
):
    """get_exchange

    Return ``exchange`` if one is given, otherwise build a new ``Exchange``
    covering the dates found in ``pred``.

    Parameters
    ----------
    pred : pandas object
        predictions; its index must have "datetime" and "instrument" levels
        (only read when a new exchange is created).

    # exchange related arguments
    exchange: Exchange().
    subscribe_fields: list
        subscribe fields. ``None`` (the default) means an empty list.
    open_cost : float
        open transaction cost.
    close_cost : float
        close transaction cost.
    min_cost : float
        min transaction cost.
    trade_unit : int
        100 for China A. Defaults to ``C.trade_unit``.
    deal_price: str
        dealing price type: 'close', 'open', 'vwap'. Defaults to ``C.deal_price``.
        A leading "$" is added when missing.
    limit_threshold : float
        limit move 0.1 (10%) for example, long and short with same limit.
        Defaults to ``C.limit_threshold``.
    extract_codes: bool
        will we pass the codes extracted from the pred to the exchange.
        NOTE: This will be faster with offline qlib.
    shift : int
        passed as ``shift`` to ``get_date_range`` when the trade dates are
        extended past the last prediction date.

    Returns
    -------
    :class: Exchange
        an initialized Exchange object
    """
    # A fresh list per call: a `[]` default would be shared between calls.
    if subscribe_fields is None:
        subscribe_fields = []

    if trade_unit is None:
        trade_unit = C.trade_unit
    if limit_threshold is None:
        limit_threshold = C.limit_threshold
    if deal_price is None:
        deal_price = C.deal_price

    if exchange is None:
        logger.info("Create new exchange")
        # handle exception for deal_price (startswith is also safe on an empty string)
        if not deal_price.startswith("$"):
            deal_price = "$" + deal_price
        if extract_codes:
            codes = sorted(pred.index.get_level_values("instrument").unique())
        else:
            codes = "all"  # TODO: We must ensure that 'all.txt' includes all the stocks

        dates = sorted(pred.index.get_level_values("datetime").unique())
        dates = np.append(dates, get_date_range(dates[-1], shift=shift))

        exchange = Exchange(
            trade_dates=dates,
            codes=codes,
            deal_price=deal_price,
            subscribe_fields=subscribe_fields,
            limit_threshold=limit_threshold,
            open_cost=open_cost,
            close_cost=close_cost,
            min_cost=min_cost,
            trade_unit=trade_unit,
        )
    return exchange
# This is the API for compatibility for legacy code
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **kwargs):
"""This function will help you set a reasonable Exchange and provide default value for strategy
@@ -249,30 +110,22 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
will we pass the codes extracted from the pred to the exchange.
.. note:: This will be faster with offline qlib.
- **executor related arguments**
executor : BaseExecutor()
executor used in backtest.
verbose : bool
whether to print log.
"""
# check strategy:
spec = inspect.getfullargspec(get_strategy)
str_args = {k: v for k, v in kwargs.items() if k in spec.args}
strategy = get_strategy(**str_args)
# init exchange:
spec = inspect.getfullargspec(get_exchange)
ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
trade_exchange = get_exchange(pred, **ex_args)
# run backtest
report_df, positions = backtest_func(
pred=pred,
strategy=strategy,
trade_exchange=trade_exchange,
shift=shift,
verbose=verbose,
account=account,
benchmark=benchmark,
warnings.warn(
"this function is deprecated, please use backtest function in qlib.contrib.backtest", DeprecationWarning
)
# for compatibility of the old API. return the dict positions
positions = {k: p.position for k, p in positions.items()}
return report_df, positions
report_dict = backtest_func(
pred=pred, account=account, shift=shift, benchmark=benchmark, verbose=verbose, return_order=False, **kwargs
)
return report_dict.get("report_df"), report_dict.get("positions")
def long_short_backtest(
@@ -340,7 +193,7 @@ def long_short_backtest(
_pred_dates = pred.index.get_level_values(level="datetime")
predict_dates = D.calendar(start_time=_pred_dates.min(), end_time=_pred_dates.max())
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
long_returns = {}
short_returns = {}

View File

@@ -56,7 +56,7 @@ class ALSTM(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
GPU=0,
seed=None,
**kwargs
):

View File

@@ -58,7 +58,7 @@ class ALSTM(Model):
loss="mse",
optimizer="adam",
n_jobs=10,
GPU="0",
GPU=0,
seed=None,
**kwargs
):
@@ -204,8 +204,8 @@ class ALSTM(Model):
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
@@ -260,7 +260,7 @@ class ALSTM(Model):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.ALSTM_model.eval()

View File

@@ -61,7 +61,7 @@ class GATs(Model):
with_pretrain=True,
model_path=None,
optimizer="adam",
GPU="0",
GPU=0,
seed=None,
**kwargs
):

View File

@@ -249,8 +249,8 @@ class GATs(Model):
save_path=None,
):
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
@@ -332,7 +332,7 @@ class GATs(Model):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
sampler_test = DailyBatchSampler(dl_test)
test_loader = DataLoader(dl_test, sampler=sampler_test, num_workers=self.n_jobs)

View File

@@ -56,7 +56,7 @@ class GRU(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
GPU=0,
seed=None,
**kwargs
):

View File

@@ -58,7 +58,7 @@ class GRU(Model):
loss="mse",
optimizer="adam",
n_jobs=10,
GPU="0",
GPU=0,
seed=None,
**kwargs
):
@@ -204,8 +204,8 @@ class GRU(Model):
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
@@ -260,7 +260,7 @@ class GRU(Model):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.GRU_model.eval()

View File

@@ -56,7 +56,7 @@ class LSTM(Model):
early_stop=20,
loss="mse",
optimizer="adam",
GPU="0",
GPU=0,
seed=None,
**kwargs
):

View File

@@ -58,7 +58,7 @@ class LSTM(Model):
loss="mse",
optimizer="adam",
n_jobs=10,
GPU="0",
GPU=0,
seed=None,
**kwargs
):
@@ -204,8 +204,8 @@ class LSTM(Model):
verbose=True,
save_path=None,
):
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
@@ -260,7 +260,7 @@ class LSTM(Model):
if not self._fitted:
raise ValueError("model is not fitted yet!")
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
dl_test.config(fillna_type="ffill+bfill")
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
self.LSTM_model.eval()

View File

@@ -60,7 +60,7 @@ class DNNModelPytorch(Model):
lr_decay_steps=100,
optimizer="gd",
loss="mse",
GPU="0",
GPU=0,
seed=None,
weight_decay=0.0,
**kwargs
@@ -259,7 +259,7 @@ class DNNModelPytorch(Model):
loss = torch.mul(sqr_loss, w).mean()
return loss
elif loss_type == "binary":
loss = nn.BCELoss()
loss = nn.BCELoss(weight=w)
return loss(pred, target)
else:
raise NotImplementedError("loss {} is not supported!".format(loss_type))
@@ -296,7 +296,7 @@ class DNNModelPytorch(Model):
self._fitted = True
class AverageMeter(object):
class AverageMeter:
"""Computes and stores the average and current value"""
def __init__(self):

View File

@@ -464,7 +464,7 @@ class SFM(Model):
return pd.Series(np.concatenate(preds), index=index)
class AverageMeter(object):
class AverageMeter:
"""Computes and stores the average and current value"""
def __init__(self):

View File

@@ -0,0 +1,642 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import copy
from sklearn.metrics import roc_auc_score, mean_squared_error
import logging
from ...utils import (
unpack_archive_with_buffer,
save_multiple_parts_file,
create_save_path,
drop_nan_by_y_index,
)
from ...log import get_module_logger, TimeInspector
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Function
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
class TabnetModel(Model):
def __init__(
    self,
    d_feat=158,
    out_dim=64,
    final_out_dim=1,
    batch_size=4096,
    n_d=64,
    n_a=64,
    n_shared=2,
    n_ind=2,
    n_steps=5,
    n_epochs=100,
    pretrain_n_epochs=50,
    relax=1.3,
    vbs=2048,
    seed=993,
    optimizer="adam",
    loss="mse",
    metric="",
    early_stop=20,
    GPU="1",
    pretrain_loss="custom",
    ps=0.3,
    lr=0.01,
    pretrain=True,
    pretrain_file="./pretrain/best.model",
):
    """
    TabNet model for Qlib

    Args
        d_feat: number of input features (encoder input width, decoder output width)
        out_dim: encoder output width
        final_out_dim: output width of the fine-tuning head added in `fit`
        batch_size: mini-batch size used by every epoch loop
        n_shared, n_ind, n_steps: forwarded to the decoder
        n_d, n_a: accepted but not used in this constructor
        n_epochs / pretrain_n_epochs: maximum epochs for fine-tuning / pre-training
        relax, vbs: forwarded to the encoder (vbs also to the decoder)
        seed: seed for numpy and torch
        optimizer: "adam" or "gd" (case-insensitive)
        loss, metric, pretrain_loss: stored for use by the loss/metric helpers
        early_stop: non-improving epochs tolerated before stopping
        GPU: CUDA device id, used only when CUDA is available
        ps: probability to generate the bernoulli mask
        lr: learning rate of both optimizers
        pretrain: whether `fit` runs pre-training first
        pretrain_file: where the pre-trained encoder weights are saved/loaded

    Raises
        NotImplementedError: if `optimizer` is neither "adam" nor "gd"
    """
    # set hyper-parameters.
    self.d_feat = d_feat
    self.out_dim = out_dim
    self.final_out_dim = final_out_dim
    self.lr = lr
    self.batch_size = batch_size
    self.optimizer = optimizer.lower()
    self.pretrain_loss = pretrain_loss
    self.seed = seed
    self.ps = ps
    self.n_epochs = n_epochs
    self.logger = get_module_logger("TabNet")
    self.pretrain_n_epochs = pretrain_n_epochs
    self.device = "cuda:%s" % (GPU) if torch.cuda.is_available() else "cpu"
    self.loss = loss
    self.metric = metric
    self.early_stop = early_stop
    self.pretrain = pretrain
    self.pretrain_file = pretrain_file
    # `predict` reads this flag. Without an initial value, calling `predict`
    # before `fit` raises AttributeError instead of "model is not fitted yet!".
    self._fitted = False
    self.logger.info(
        "TabNet:"
        "\nbatch_size : {}"
        "\nvirtual bs : {}"
        "\nGPU : {}"
        "\npretrain: {}".format(self.batch_size, vbs, GPU, pretrain)
    )
    np.random.seed(self.seed)
    torch.manual_seed(self.seed)

    self.tabnet_model = TabNet(
        inp_dim=self.d_feat, out_dim=self.out_dim, vbs=vbs, relax=relax, device=self.device
    ).to(self.device)
    self.tabnet_decoder = TabNet_Decoder(self.out_dim, self.d_feat, n_shared, n_ind, vbs, n_steps, self.device).to(
        self.device
    )

    # The pre-training optimizer updates encoder and decoder together;
    # the training optimizer only sees the encoder parameters that exist now.
    if self.optimizer == "adam":
        self.pretrain_optimizer = optim.Adam(
            list(self.tabnet_model.parameters()) + list(self.tabnet_decoder.parameters()), lr=self.lr
        )
        self.train_optimizer = optim.Adam(self.tabnet_model.parameters(), lr=self.lr)
    elif self.optimizer == "gd":
        self.pretrain_optimizer = optim.SGD(
            list(self.tabnet_model.parameters()) + list(self.tabnet_decoder.parameters()), lr=self.lr
        )
        self.train_optimizer = optim.SGD(self.tabnet_model.parameters(), lr=self.lr)
    else:
        raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
def pretrain_fn(self, dataset: "DatasetH", pretrain_file="./pretrain/best.model"):
    """Pre-train encoder and decoder, saving the best encoder weights to ``pretrain_file``.

    Parameters
    ----------
    dataset : DatasetH
        must provide the "pretrain" and "pretrain_validation" segments.
        (The original signature used the class itself as a default value; a call
        without a dataset could never work, so the argument is simply required.)
    pretrain_file : str
        destination of the encoder ``state_dict`` with the lowest validation loss.
    """
    # Make sure the directory that will hold `pretrain_file` exists.
    save_dir = os.path.dirname(pretrain_file)
    if save_dir and not os.path.exists(save_dir):
        self.logger.info("make folder to store model...")
        os.makedirs(save_dir, exist_ok=True)

    [df_train, df_valid] = dataset.prepare(
        ["pretrain", "pretrain_validation"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )

    df_train.fillna(df_train.mean(), inplace=True)
    df_valid.fillna(df_valid.mean(), inplace=True)

    x_train = df_train["feature"]
    x_valid = df_valid["feature"]

    # Early stop setup
    stop_steps = 0
    best_loss = np.inf

    for epoch_idx in range(self.pretrain_n_epochs):
        self.logger.info("epoch: %s" % (epoch_idx))
        self.logger.info("pre-training...")
        self.pretrain_epoch(x_train)
        self.logger.info("evaluating...")
        train_loss = self.pretrain_test_epoch(x_train)
        valid_loss = self.pretrain_test_epoch(x_valid)
        self.logger.info("train %.6f, valid %.6f" % (train_loss, valid_loss))

        if valid_loss < best_loss:
            self.logger.info("Save Model...")
            torch.save(self.tabnet_model.state_dict(), pretrain_file)
            best_loss = valid_loss
            # Count consecutive non-improving epochs, as `fit` does.
            stop_steps = 0
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break
def fit(
    self,
    dataset: "DatasetH",
    evals_result=None,
    verbose=True,
    save_path=None,
):
    """Optionally pre-train, then fine-tune the model on the "train"/"valid" segments.

    Parameters
    ----------
    dataset : DatasetH
        source of the "train" and "valid" segments (and the pre-training
        segments when ``self.pretrain`` is true).
    evals_result : dict, optional
        filled in place with per-epoch scores under "train" and "valid".
        A new dict is used when omitted.
    verbose, save_path
        accepted but not used by this method.
    """
    # A fresh dict per call: a `dict()` default would be shared between calls.
    if evals_result is None:
        evals_result = {}

    if self.pretrain:
        # there is a pretrained model, load the model
        self.logger.info("Pretrain...")
        self.pretrain_fn(dataset, self.pretrain_file)
        self.logger.info("Load Pretrain model")
        self.tabnet_model.load_state_dict(torch.load(self.pretrain_file))

    # adding one more linear layer to fit the final output dimension
    # NOTE(review): `train_optimizer` was built in `__init__` from the parameters
    # of the un-wrapped model; confirm that any parameters introduced by
    # FinetuneModel are meant to stay outside the optimizer.
    self.tabnet_model = FinetuneModel(self.out_dim, self.final_out_dim, self.tabnet_model).to(self.device)
    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    df_train.fillna(df_train.mean(), inplace=True)
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    stop_steps = 0
    best_score = np.inf
    best_epoch = 0
    evals_result["train"] = []
    evals_result["valid"] = []

    self.logger.info("training...")
    self._fitted = True

    for epoch_idx in range(self.n_epochs):
        self.logger.info("epoch: %s" % (epoch_idx))
        self.logger.info("training...")
        self.train_epoch(x_train, y_train)
        self.logger.info("evaluating...")
        # only the scores are used for logging and early stopping
        _, train_score = self.test_epoch(x_train, y_train)
        _, val_score = self.test_epoch(x_valid, y_valid)
        self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
        evals_result["train"].append(train_score)
        evals_result["valid"].append(val_score)

        # lower score is better
        if val_score < best_score:
            best_score = val_score
            stop_steps = 0
            best_epoch = epoch_idx
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break

    self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
def predict(self, dataset):
    """Predict on the "test" segment of ``dataset``.

    Returns
    -------
    pandas.Series
        the concatenated batch predictions, indexed like the test features.

    Raises
    ------
    ValueError
        if the model has not been fitted.
    """
    # getattr: `_fitted` does not exist at all if `fit` was never called
    if not getattr(self, "_fitted", False):
        raise ValueError("model is not fitted yet!")

    x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
    index = x_test.index
    self.tabnet_model.eval()
    x_values = torch.from_numpy(x_test.values)
    x_values[torch.isnan(x_values)] = 0  # NaN features are fed as 0
    sample_num = x_values.shape[0]
    preds = []

    for begin in range(0, sample_num, self.batch_size):
        # the last batch may be shorter than batch_size
        end = min(begin + self.batch_size, sample_num)

        x_batch = x_values[begin:end].float().to(self.device)
        # all-ones priors, one row per sample in this batch
        priors = torch.ones(end - begin, self.d_feat).to(self.device)

        with torch.no_grad():
            pred = self.tabnet_model(x_batch, priors).detach().cpu().numpy()

        preds.append(pred)

    return pd.Series(np.concatenate(preds), index=index)
def test_epoch(self, data_x, data_y):
    """Evaluate the model on ``data_x``/``data_y`` without updating it.

    Only full batches of ``self.batch_size`` rows are evaluated; a shorter
    trailing batch is skipped.

    Returns
    -------
    (float, float)
        mean of the per-batch losses and mean of the per-batch metric scores.
    """
    # prepare evaluation data; NaNs in features and labels are replaced by 0
    x_values = torch.from_numpy(data_x.values)
    y_values = torch.from_numpy(np.squeeze(data_y.values))
    x_values[torch.isnan(x_values)] = 0
    y_values[torch.isnan(y_values)] = 0
    self.tabnet_model.eval()

    scores = []
    losses = []

    indices = np.arange(len(x_values))

    # Evaluation needs no gradients; skipping autograd bookkeeping saves memory
    # and does not change the loss or score values.
    with torch.no_grad():
        for i in range(len(indices))[:: self.batch_size]:
            if len(indices) - i < self.batch_size:
                break
            feature = x_values[indices[i : i + self.batch_size]].float().to(self.device)
            label = y_values[indices[i : i + self.batch_size]].float().to(self.device)
            priors = torch.ones(self.batch_size, self.d_feat).to(self.device)
            pred = self.tabnet_model(feature, priors)
            loss = self.loss_fn(pred, label)
            losses.append(loss.item())

            score = self.metric_fn(pred, label)
            scores.append(score.item())

    return np.mean(losses), np.mean(scores)
def train_epoch(self, x_train, y_train):
    """Run one fine-tuning pass over shuffled full batches of ``x_train``/``y_train``."""
    features = torch.from_numpy(x_train.values)
    labels = torch.from_numpy(np.squeeze(y_train.values))
    # NaNs in features and labels are replaced by 0
    features[torch.isnan(features)] = 0
    labels[torch.isnan(labels)] = 0
    self.tabnet_model.train()

    order = np.arange(len(features))
    np.random.shuffle(order)
    n_samples = len(order)
    bs = self.batch_size

    # Stops before any trailing batch shorter than `bs`.
    for start in range(0, n_samples - bs + 1, bs):
        batch_idx = order[start : start + bs]

        feature = features[batch_idx].float().to(self.device)
        label = labels[batch_idx].float().to(self.device)
        priors = torch.ones(bs, self.d_feat).to(self.device)

        loss = self.loss_fn(self.tabnet_model(feature, priors), label)

        self.train_optimizer.zero_grad()
        loss.backward()
        # clamp every gradient element to [-3, 3] before stepping
        torch.nn.utils.clip_grad_value_(self.tabnet_model.parameters(), 3.0)
        self.train_optimizer.step()
def pretrain_epoch(self, x_train):
    """Run one shuffled self-supervised pre-training pass over ``x_train``.

    For every full batch a Bernoulli mask with probability ``self.ps`` is
    drawn. Masked-out entries are hidden from the encoder input and become the
    reconstruction target; the encoder plus decoder are then updated with
    ``self.pretrain_optimizer``. A trailing partial batch is dropped.

    Parameters
    ----------
    x_train : object exposing ``.values``
        Feature table; NaNs are replaced with 0.
    """
    train_set = torch.from_numpy(x_train.values)
    train_set[torch.isnan(train_set)] = 0

    order = np.arange(len(train_set))
    np.random.shuffle(order)

    self.tabnet_model.train()
    self.tabnet_decoder.train()

    for start in range(0, len(order) - self.batch_size + 1, self.batch_size):
        # 1 marks a feature that is hidden from the encoder and must be rebuilt.
        S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
        batch = train_set[order[start : start + self.batch_size]]
        visible = batch * (1 - S_mask)
        hidden = batch * (S_mask)

        S_mask = S_mask.to(self.device)
        feature = visible.float().to(self.device)
        label = hidden.float().to(self.device)
        priors = 1 - S_mask

        (vec, sparse_loss) = self.tabnet_model(feature, priors)
        reconstructed = self.tabnet_decoder(vec)
        loss = self.pretrain_loss_fn(label, reconstructed, S_mask)

        self.pretrain_optimizer.zero_grad()
        loss.backward()
        self.pretrain_optimizer.step()
def pretrain_test_epoch(self, x_train):
    """Compute the mean pre-training reconstruction loss without updating weights.

    Each full batch (in stored order) gets a fresh Bernoulli mask with
    probability ``self.ps``; masked-out entries are hidden from the encoder and
    used as the reconstruction target. A trailing partial batch is skipped.

    Parameters
    ----------
    x_train : object exposing ``.values``
        Feature table; NaNs are replaced with 0.

    Returns
    -------
    float
        ``np.mean`` of the per-batch losses (nan when there is no full batch).
    """
    train_set = torch.from_numpy(x_train.values)
    train_set[torch.isnan(train_set)] = 0
    indices = np.arange(len(train_set))

    self.tabnet_model.eval()
    self.tabnet_decoder.eval()

    losses = []

    # Pure evaluation: no backward() follows, so do not record autograd graphs.
    with torch.no_grad():
        for i in range(len(indices))[:: self.batch_size]:
            if len(indices) - i < self.batch_size:
                break
            S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
            x_train_values = train_set[indices[i : i + self.batch_size]] * (1 - S_mask)
            y_train_values = train_set[indices[i : i + self.batch_size]] * (S_mask)

            feature = x_train_values.float().to(self.device)
            label = y_train_values.float().to(self.device)
            S_mask = S_mask.to(self.device)
            priors = 1 - S_mask
            (vec, sparse_loss) = self.tabnet_model(feature, priors)
            f = self.tabnet_decoder(vec)

            loss = self.pretrain_loss_fn(label, f, S_mask)
            losses.append(loss.item())

    return np.mean(losses)
def pretrain_loss_fn(self, f_hat, f, S):
    """
    Masked, column-normalised reconstruction loss used for pre-training; see
    "Tabular self-supervised learning" in https://arxiv.org/pdf/1908.07442.pdf

    The masked difference ``(f_hat - f) * S`` is divided column-wise by the
    root of the summed squared deviation of ``f`` from its column mean, then
    squared and summed over all entries.

    NOTE(review): the pre-training loops in this file call this as
    ``pretrain_loss_fn(label, decoder_output, S_mask)``, so the normaliser is
    computed from the decoder output -- confirm that is intended.
    """
    centered = f - torch.mean(f, dim=0)
    column_norm = torch.sqrt(torch.sum(torch.square(centered), dim=0))
    masked_error = (f_hat - f) * S
    return torch.sum(torch.square(masked_error / column_norm))
def loss_fn(self, pred, label):
    """Return the configured loss over the entries whose label is not NaN.

    Raises
    ------
    ValueError
        If ``self.loss`` is anything other than ``"mse"``.
    """
    valid = ~torch.isnan(label)

    if self.loss != "mse":
        raise ValueError("unknown loss `%s`" % self.loss)
    return self.mse(pred[valid], label[valid])
def metric_fn(self, pred, label):
    """Return the evaluation score over the entries whose label is finite.

    For metric ``""`` or ``"loss"`` the score is the negated ``loss_fn`` value,
    so a larger score means a smaller loss.

    Raises
    ------
    ValueError
        If ``self.metric`` is not one of the supported names.
    """
    finite = torch.isfinite(label)

    if self.metric not in ("", "loss"):
        raise ValueError("unknown metric `%s`" % self.metric)
    return -self.loss_fn(pred[finite], label[finite])
def mse(self, pred, label):
    """Return the mean of the element-wise squared difference ``pred - label``."""
    return torch.mean((pred - label) ** 2)
class FinetuneModel(nn.Module):
    """
    FinetuneModel: wraps a trained model and adds one linear layer at the end.

    Args:
        input_dim: size of the vector returned by ``trained_model``
        output_dim: size of the final prediction per sample
        trained_model: callable ``(x, priors) -> (vec, ...)``; only the first
            element of its result is used
    """

    def __init__(self, input_dim, output_dim, trained_model):
        super().__init__()
        self.model = trained_model
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x, priors):
        vec = self.model(x, priors)[0]  # take the vec out
        # Drop only the trailing axis: a bare squeeze() would also remove the
        # batch axis when the batch holds a single sample, producing a 0-d
        # prediction that cannot be concatenated with the other batches.
        return self.fc(vec).squeeze(-1)
class DecoderStep(nn.Module):
    """One decoder step: a feature transformer followed by a linear layer."""

    def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
        super().__init__()
        self.fea_tran = FeatureTransformer(inp_dim, out_dim, shared, n_ind, vbs, device)
        self.fc = nn.Linear(out_dim, out_dim)

    def forward(self, x):
        return self.fc(self.fea_tran(x))
class TabNet_Decoder(nn.Module):
    def __init__(self, inp_dim, out_dim, n_shared, n_ind, vbs, n_steps, device):
        """
        TabNet decoder that is used in pre-training

        Args:
            inp_dim: size of the encoded vector fed to every decoder step
            out_dim: size of the reconstructed output
            n_shared: number of linear layers shared by all decoder steps
            n_ind: number of independent layers per step (passed to DecoderStep)
            vbs: virtual batch size (passed to DecoderStep)
            n_steps: number of decoder steps whose outputs are summed
            device: passed to DecoderStep
        """
        super().__init__()
        self.out_dim = out_dim
        if n_shared > 0:
            # preset the linear layers that every step will reuse
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(inp_dim, 2 * out_dim))
            self.shared.extend(nn.Linear(out_dim, 2 * out_dim) for _ in range(n_shared - 1))
        else:
            self.shared = None
        self.n_steps = n_steps
        self.steps = nn.ModuleList(
            DecoderStep(inp_dim, out_dim, self.shared, n_ind, vbs, device) for _ in range(n_steps)
        )

    def forward(self, x):
        # Sum the contribution of every decoder step.
        total = torch.zeros(x.size(0), self.out_dim).to(x.device)
        for decoder_step in self.steps:
            total += decoder_step(x)
        return total
class TabNet(nn.Module):
    def __init__(
        self, inp_dim=6, out_dim=6, n_d=64, n_a=64, n_shared=2, n_ind=2, n_steps=5, relax=1.2, vbs=1024, device="cpu"
    ):
        """
        TabNet AKA the original encoder

        Args:
            inp_dim: number of input features
            out_dim: size of the vector produced by the final linear layer
            n_d: dimension of the features used to calculate the final results
            n_a: dimension of the features input to the attention transformer of the next step
            n_shared: number of shared steps in the feature transformer (optional)
            n_ind: number of independent steps in the feature transformer
            n_steps: number of steps of the pass through TabNet
            relax: relax coefficient, passed to every decision step
            vbs: virtual batch size, passed to the feature transformers
            device: passed to the feature transformers
        """
        super().__init__()
        hidden = n_d + n_a
        # set the number of shared steps in the feature transformer
        if n_shared > 0:
            # preset the linear layers that all steps will reuse
            self.shared = nn.ModuleList()
            self.shared.append(nn.Linear(inp_dim, 2 * hidden))
            self.shared.extend(nn.Linear(hidden, 2 * hidden) for _ in range(n_shared - 1))
        else:
            self.shared = None

        self.first_step = FeatureTransformer(inp_dim, hidden, self.shared, n_ind, vbs, device)
        self.steps = nn.ModuleList(
            DecisionStep(inp_dim, n_d, n_a, self.shared, n_ind, relax, vbs, device) for _ in range(n_steps - 1)
        )
        self.fc = nn.Linear(n_d, out_dim)
        self.bn = nn.BatchNorm1d(inp_dim, momentum=0.01)
        self.n_d = n_d

    def forward(self, x, priors):
        """Return ``(fc(aggregated decision features), summed sparse loss)``."""
        assert not torch.isnan(x).any()
        x = self.bn(x)
        # Only the attention half (columns n_d onward) of the first step is used.
        attention_feat = self.first_step(x)[:, self.n_d :]
        sparse_loss = torch.zeros(1).to(x.device)
        out = torch.zeros(x.size(0), self.n_d).to(x.device)
        for step in self.steps:
            transformed, step_loss = step(x, attention_feat, priors)
            # split the feature transformer output: first n_d columns feed the
            # result, the remaining columns feed the next step's attention
            out += F.relu(transformed[:, : self.n_d])
            attention_feat = transformed[:, self.n_d :]
            sparse_loss += step_loss
        return self.fc(out), sparse_loss
class GBN(nn.Module):
    """
    Ghost Batch Normalization
    Applies one shared BatchNorm1d to consecutive chunks ("virtual batches")
    of the input instead of to the whole batch at once.

    Args:
        inp: number of features normalised
        vbs: virtual batch size
        momentum: momentum passed to BatchNorm1d
    """

    def __init__(self, inp, vbs=1024, momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp, momentum=momentum)
        self.vbs = vbs

    def forward(self, x):
        n_chunks = x.size(0) // self.vbs
        # A batch smaller than 2 * vbs cannot be split further. In particular
        # a batch smaller than vbs gives n_chunks == 0, which torch.chunk
        # rejects, so normalise it as a single virtual batch.
        if n_chunks <= 1:
            return self.bn(x)
        res = [self.bn(y) for y in torch.chunk(x, n_chunks, 0)]
        return torch.cat(res, 0)
class GLU(nn.Module):
    """
    Gated linear unit block: a linear layer producing ``2 * out_dim`` columns,
    ghost batch normalisation, then the first half multiplied by the sigmoid
    of the second half.

    Args:
        inp_dim: input size of the linear layer created when ``fc`` is not given
        out_dim: size of the gated output
        fc: optional pre-built (shared) linear layer to use instead
        vbs: virtual batch size
    """

    def __init__(self, inp_dim, out_dim, fc=None, vbs=1024):
        super().__init__()
        self.fc = fc if fc else nn.Linear(inp_dim, out_dim * 2)
        self.bn = GBN(out_dim * 2, vbs=vbs)
        self.od = out_dim

    def forward(self, x):
        normed = self.bn(self.fc(x))
        value = normed[:, : self.od]
        gate = normed[:, self.od :]
        return torch.mul(value, torch.sigmoid(gate))
class AttentionTransformer(nn.Module):
    """
    Produces the feature-selection mask for one decision step.

    Args:
        d_a: size of the attention features coming from the previous step
        inp_dim: number of input features, i.e. the width of the mask
        relax: relax coefficient. The greater it is, the more often
            the same feature can be used. When it is set to 1
            every feature can be used only once
        vbs: virtual batch size
    """

    def __init__(self, d_a, inp_dim, relax, vbs=1024):
        super().__init__()
        self.fc = nn.Linear(d_a, inp_dim)
        self.bn = GBN(inp_dim, vbs=vbs)
        self.r = relax

    # a: feature from previous decision step
    def forward(self, a, priors):
        scores = self.bn(self.fc(a))
        mask = SparsemaxFunction.apply(scores * priors)
        # NOTE(review): this only rebinds the local name `priors`; the updated
        # prior is neither returned nor written back, so callers never see it.
        # Confirm whether the update was meant to reach the next step.
        priors = priors * (self.r - mask)  # updating the prior
        return mask
class FeatureTransformer(nn.Module):
    """
    Stack of GLU blocks: optional blocks built on shared linear layers,
    followed by independent blocks.

    Args:
        inp_dim: number of input features
        out_dim: width of every GLU output
        shared: sequence of pre-built linear layers to share, or None/empty
        n_ind: number of independent GLU blocks
        vbs: virtual batch size
        device: device on which the residual scale constant is created
    """

    def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
        super().__init__()
        first = True
        self.shared = nn.ModuleList()
        if shared:
            self.shared.append(GLU(inp_dim, out_dim, shared[0], vbs=vbs))
            first = False
            for fc in shared[1:]:
                self.shared.append(GLU(out_dim, out_dim, fc, vbs=vbs))
        else:
            self.shared = None
        self.independ = nn.ModuleList()
        if first:
            # Without shared layers the first independent block has to map
            # inp_dim -> out_dim (the original referenced an undefined `inp`).
            self.independ.append(GLU(inp_dim, out_dim, vbs=vbs))
        for _ in range(int(first), n_ind):
            self.independ.append(GLU(out_dim, out_dim, vbs=vbs))
        # sqrt(0.5) rescaling applied after every residual addition
        self.scale = torch.sqrt(torch.tensor([0.5], device=device))

    def forward(self, x):
        if self.shared:
            # The first shared block changes the width, so it has no residual.
            x = self.shared[0](x)
            for glu in self.shared[1:]:
                x = torch.add(x, glu(x))
                x = x * self.scale
            residual_blocks = self.independ
        else:
            # Same rule when there are no shared layers: the first independent
            # block is the inp_dim -> out_dim projection and cannot be added
            # to its own input.
            x = self.independ[0](x)
            residual_blocks = self.independ[1:]
        for glu in residual_blocks:
            x = torch.add(x, glu(x))
            x = x * self.scale
        return x
class DecisionStep(nn.Module):
    """
    One step for the TabNet: compute an attention mask, apply it to the
    input features and run the masked features through a feature transformer.
    """

    def __init__(self, inp_dim, n_d, n_a, shared, n_ind, relax, vbs, device):
        super().__init__()
        self.atten_tran = AttentionTransformer(n_a, inp_dim, relax, vbs)
        self.fea_tran = FeatureTransformer(inp_dim, n_d + n_a, shared, n_ind, vbs, device)

    def forward(self, x, a, priors):
        """Return ``(transformed features, sparse loss)`` for this step."""
        mask = self.atten_tran(a, priors)
        # Entropy-style term of the mask; 1e-10 keeps log() finite where mask == 0.
        sparse_loss = (-mask * torch.log(mask + 1e-10)).mean()
        transformed = self.fea_tran(x * mask)
        return transformed, sparse_loss
def make_ix_like(input, dim=0):
    """Return the indices ``1..input.size(dim)`` laid out along axis ``dim``.

    The result has the same number of axes, dtype and device as ``input``;
    every axis has length 1 except ``dim``.
    """
    length = input.size(dim)
    shape = [-1] + [1] * (input.dim() - 1)
    positions = torch.arange(1, length + 1, device=input.device, dtype=input.dtype)
    return positions.view(shape).transpose(0, dim)
class SparsemaxFunction(Function):
    """
    Sparsemax as a custom autograd Function; it is used in this file to turn
    the attention scores into a feature mask that can contain exact zeros.
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        """Return ``clamp(shifted - tau, min=0)`` along ``dim``."""
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        # Same numerical stability trick as for softmax. Done out of place:
        # an in-place `input -= max_val` would silently overwrite the
        # caller's tensor without telling autograd (no ctx.mark_dirty).
        shifted = input - max_val
        tau, supp_size = SparsemaxFunction.threshold_and_support(shifted, dim=dim)
        output = torch.clamp(shifted - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input``; ``dim`` gets no gradient."""
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        grad_input[output == 0] = 0

        # squeeze only the reduced axis so the shape always matches
        # grad_input.sum(dim=dim), even when another axis has length 1
        v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze(dim)
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None

    @staticmethod
    def threshold_and_support(input, dim=-1):
        """Return ``(tau, support_size)`` computed from ``input`` sorted along ``dim``."""
        input_srt, _ = torch.sort(input, descending=True, dim=dim)
        input_cumsum = input_srt.cumsum(dim) - 1
        rhos = make_ix_like(input, dim)
        support = rhos * input_srt > input_cumsum

        support_size = support.sum(dim=dim).unsqueeze(dim)
        tau = input_cumsum.gather(dim, support_size - 1)
        tau /= support_size.to(input.dtype)
        return tau, support_size

View File

@@ -21,7 +21,7 @@ from .executor import SimulatorExecutor
from .executor import save_score_series, load_score_series
class Operator(object):
class Operator:
def __init__(self, client: str):
"""
Parameters

View File

@@ -38,7 +38,7 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
:param df:
:return:
"""
index_names = df.index.names
df.index = df.index.strftime("%Y-%m-%d")
report_df = pd.DataFrame()
@@ -58,6 +58,8 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
report_df["turnover"] = df["turnover"]
report_df.sort_index(ascending=True, inplace=True)
report_df.index.names = index_names
return report_df

View File

@@ -17,7 +17,7 @@ from plotly.figure_factory import create_distplot
from ...utils import get_module_by_module_path
class BaseGraph(object):
class BaseGraph:
""""""
_name = None
@@ -204,7 +204,7 @@ class HistogramGraph(BaseGraph):
return _data
class SubplotsGraph(object):
class SubplotsGraph:
"""Create subplots same as df.plot(subplots=True)
Simple package for `plotly.tools.subplots`

View File

@@ -30,7 +30,7 @@ class BaseStrategy:
Parameters
-----------
score_series : pd.Seires
score_series : pd.Series
stock_id , score.
current : Position()
current state of position.

View File

@@ -6,7 +6,7 @@ import copy
import os
class TunerConfigManager(object):
class TunerConfigManager:
def __init__(self, config_path):
if not config_path:
@@ -27,7 +27,7 @@ class TunerConfigManager(object):
self.qlib_client_config = config.get("qlib_client", dict())
class PipelineExperimentConfig(object):
class PipelineExperimentConfig:
def __init__(self, config, TUNER_CONFIG_MANAGER):
"""
:param config: The config dict for tuner experiment
@@ -53,7 +53,7 @@ class PipelineExperimentConfig(object):
yaml.dump(TUNER_CONFIG_MANAGER.config, fp)
class OptimizationConfig(object):
class OptimizationConfig:
def __init__(self, config, TUNER_CONFIG_MANAGER):
self.report_type = config.get("report_type", "pred_long")

View File

@@ -11,7 +11,7 @@ from ...log import get_module_logger, TimeInspector
from ...utils import get_module_by_module_path
class Pipeline(object):
class Pipeline:
GLOBAL_BEST_PARAMS_NAME = "global_best_params.json"

View File

@@ -19,7 +19,7 @@ from hyperopt import fmin, tpe
from hyperopt import STATUS_OK, STATUS_FAIL
class Tuner(object):
class Tuner:
def __init__(self, tuner_config, optim_config):
self.logger = get_module_logger("Tuner", sh_level=logging.INFO)

View File

@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
from libcpp.vector cimport vector
cdef class Expanding(object):
cdef class Expanding:
"""1-D array expanding"""
cdef vector[double] barv
cdef int na_count

View File

@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
from libcpp.deque cimport deque
cdef class Rolling(object):
cdef class Rolling:
"""1-D array rolling"""
cdef int window
cdef deque[double] barv

View File

@@ -157,7 +157,7 @@ class Expression(abc.ABC):
@abc.abstractmethod
def _load_internal(self, instrument, start_index, end_index, freq):
pass
raise NotImplementedError("This function must be implemented in your newly defined feature")
@abc.abstractmethod
def get_longest_back_rolling(self):

View File

@@ -13,6 +13,7 @@ import pickle
import traceback
import redis_lock
import contextlib
import abc
from pathlib import Path
import numpy as np
import pandas as pd
@@ -32,43 +33,107 @@ from ..utils import (
from ..log import get_module_logger
from .base import Feature
from .ops import *
from .ops import Operators
class QlibCacheException(RuntimeError):
pass
class MemCacheUnit(OrderedDict):
class MemCacheUnit(abc.ABC):
"""Memory Cache Unit."""
# TODO: use min_heap to replace ordereddict for better performance
def __init__(self, *args, **kwargs):
self.size_limit = kwargs.pop("size_limit", None)
# limit_type: check size_limit type, length(call fun: len) or size(call fun: sys.getsizeof)
self.limit_type = kwargs.pop("limit_type", "length")
super(MemCacheUnit, self).__init__(*args, **kwargs)
self._check_size_limit()
self.size_limit = kwargs.pop("size_limit", 0)
self._size = 0
self.od = OrderedDict()
def __setitem__(self, key, value):
super(MemCacheUnit, self).__setitem__(key, value)
self._check_size_limit()
# TODO: thread safe?__setitem__ failure might cause inconsistent size?
def __getitem__(self, key):
value = super(MemCacheUnit, self).__getitem__(key)
super(MemCacheUnit, self).__delitem__(key)
super(MemCacheUnit, self).__setitem__(key, value)
return value
# precalculate the size after od.__setitem__
self._adjust_size(key, value)
def _check_size_limit(self):
if self.size_limit is not None:
get_cur_size = lambda x: len(x) if self.limit_type == "length" else sum(map(sys.getsizeof, x.values()))
while get_cur_size(self) > self.size_limit:
self.od.__setitem__(key, value)
# move the key to end,make it latest
self.od.move_to_end(key)
if self.limited:
# pop the oldest items beyond size limit
while self._size > self.size_limit:
self.popitem(last=False)
def __getitem__(self, key):
v = self.od.__getitem__(key)
self.od.move_to_end(key)
return v
class MemCache(object):
def __contains__(self, key):
return key in self.od
def __len__(self):
return self.od.__len__()
def __repr__(self):
return f"{self.__class__.__name__}<size_limit:{self.size_limit if self.limited else 'no limit'} total_size:{self._size}>\n{self.od.__repr__()}"
def set_limit_size(self, limit):
self.size_limit = limit
@property
def limited(self):
"""whether memory cache is limited"""
return self.size_limit > 0
@property
def total_size(self):
return self._size
def clear(self):
self._size = 0
self.od.clear()
def popitem(self, last=True):
k, v = self.od.popitem(last=last)
self._size -= self._get_value_size(v)
return k, v
def pop(self, key):
v = self.od.pop(key)
self._size -= self._get_value_size(v)
return v
def _adjust_size(self, key, value):
if key in self.od:
self._size -= self._get_value_size(self.od[key])
self._size += self._get_value_size(value)
@abc.abstractmethod
def _get_value_size(self, value):
raise NotImplementedError
class MemCacheLengthUnit(MemCacheUnit):
def __init__(self, size_limit=0):
super().__init__(size_limit=size_limit)
def _get_value_size(self, value):
return 1
class MemCacheSizeofUnit(MemCacheUnit):
def __init__(self, size_limit=0):
super().__init__(size_limit=size_limit)
def _get_value_size(self, value):
return sys.getsizeof(value)
class MemCache:
"""Memory cache."""
def __init__(self, mem_cache_size_limit=None, limit_type="length"):
@@ -79,21 +144,19 @@ class MemCache(object):
mem_cache_size_limit: cache max size.
limit_type: length or sizeof; length(call fun: len), size(call fun: sys.getsizeof).
"""
if limit_type not in ["length", "sizeof"]:
size_limit = C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit
if limit_type == "length":
klass = MemCacheLengthUnit
elif limit_type == "sizeof":
klass = MemCacheSizeofUnit
else:
raise ValueError(f"limit_type must be length or sizeof, your limit_type is {limit_type}")
self.__calendar_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__instrument_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__feature_mem_cache = MemCacheUnit(
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
limit_type=limit_type,
)
self.__calendar_mem_cache = klass(size_limit)
self.__instrument_mem_cache = klass(size_limit)
self.__feature_mem_cache = klass(size_limit)
def __getitem__(self, key):
if key == "c":
@@ -140,7 +203,7 @@ class MemCacheExpire:
return value, expire
class CacheUtils(object):
class CacheUtils:
LOCK_ID = "QLIB"
@staticmethod
@@ -224,7 +287,7 @@ class CacheUtils(object):
current_cache_wlock.release()
class BaseProviderCache(object):
class BaseProviderCache:
"""Provider cache base class"""
def __init__(self, provider):
@@ -762,8 +825,8 @@ class DiskDatasetCache(DatasetCache):
.. note:: The start is closed. The end is open!!!!!
- Each line contains two element <timestamp, end_index>
- It indicates the `end_index` of the data for `timestamp`
- Each line contains two elements <start_index, end_index> with a timestamp as its index.
- It indicates the `start_index`(included) and `end_index`(excluded) of the data for `timestamp`
- meta data: cache/d41366901e25de3ec47297f12e2ba11d.meta

View File

@@ -12,7 +12,7 @@ from ..log import get_module_logger
import pickle
class Client(object):
class Client:
"""A client class
Provide the connection tool functions for ClientProvider.

View File

@@ -15,14 +15,13 @@ import importlib
import traceback
import numpy as np
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
from .cache import H
from ..config import C
from .ops import *
from .ops import Operators
from ..log import get_module_logger
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname
from .base import Feature
from .cache import DiskDatasetCache, DiskExpressionCache
from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path
@@ -118,7 +117,7 @@ class CalendarProvider(abc.ABC):
if flag in H["c"]:
_calendar, _calendar_index = H["c"][flag]
else:
_calendar = np.array(self._load_calendar(freq, future))
_calendar = np.array(self.load_calendar(freq, future))
_calendar_index = {x: i for i, x in enumerate(_calendar)} # for fast search
H["c"][flag] = _calendar, _calendar_index
return _calendar, _calendar_index
@@ -215,20 +214,6 @@ class InstrumentProvider(abc.ABC):
return cls.LIST
raise ValueError(f"Unknown instrument type {inst}")
def convert_instruments(self, instrument):
_instruments_map = getattr(self, "_instruments_map", None)
if _instruments_map is None:
_df_list = []
# FIXME: each process will read these files
for _path in Path(C.get_data_path()).joinpath("instruments").glob("*.txt"):
_df = pd.read_csv(_path, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
_df_list.append(_df.iloc[:, [0, -1]])
df = pd.concat(_df_list, sort=False).sort_values("save_inst")
df = df.drop_duplicates(subset=["save_inst"], keep="first").fillna(axis=1, method="ffill")
_instruments_map = df.set_index("inst").iloc[:, 0].to_dict()
setattr(self, "_instruments_map", _instruments_map)
return _instruments_map.get(instrument, instrument)
class FeatureProvider(abc.ABC):
"""Feature provider class
@@ -481,11 +466,10 @@ class DatasetProvider(abc.ABC):
"""
# FIXME: Windows OS or MacOS using spawn: https://docs.python.org/3.8/library/multiprocessing.html?highlight=spawn#contexts-and-start-methods
global C
C = g_config
# NOTE: This place is compatible with windows, windows multi-process is spawn
if getattr(ExpressionD, "_provider", None) is None:
register_all_wrappers()
if not C.registered:
C.set_conf_from_C(g_config)
C.register()
obj = dict()
for field in column_names:
@@ -520,7 +504,7 @@ class LocalCalendarProvider(CalendarProvider):
"""Calendar file uri."""
return os.path.join(C.get_data_path(), "calendars", "{}.txt")
def _load_calendar(self, freq, future):
def load_calendar(self, freq, future):
"""Load original calendar timestamp from file.
Parameters
@@ -587,10 +571,16 @@ class LocalInstrumentProvider(InstrumentProvider):
fname = self._uri_inst.format(market)
if not os.path.exists(fname):
raise ValueError("instruments not exists for market " + market)
_instruments = dict()
df = pd.read_csv(fname, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
df["start_datetime"] = pd.to_datetime(df["start_datetime"])
df["end_datetime"] = pd.to_datetime(df["end_datetime"])
df = pd.read_csv(
fname,
sep="\t",
usecols=[0, 1, 2],
names=["inst", "start_datetime", "end_datetime"],
dtype={"inst": str},
parse_dates=["start_datetime", "end_datetime"],
)
for row in df.itertuples(index=False):
_instruments.setdefault(row[0], []).append((row[1], row[2]))
return _instruments
@@ -647,7 +637,7 @@ class LocalFeatureProvider(FeatureProvider):
def feature(self, instrument, field, start_index, end_index, freq):
# validate
field = str(field).lower()[1:]
instrument = Inst.convert_instruments(instrument)
instrument = code_to_fname(instrument)
uri_data = self._uri_data.format(instrument.lower(), field, freq)
if not os.path.exists(uri_data):
get_module_logger("data").warning("WARN: data not found for %s.%s" % (instrument, field))
@@ -682,6 +672,8 @@ class LocalExpressionProvider(ExpressionProvider):
series = series.astype(np.float32)
except ValueError:
pass
except TypeError:
pass
if not series.empty:
series = series.loc[start_index:end_index]
return series
@@ -969,8 +961,7 @@ class BaseProvider:
is a provider class.
"""
disk_cache = C.default_disk_cache if disk_cache is None else disk_cache
if C.disable_disk_cache:
disk_cache = False
fields = list(fields) # In case of tuple.
try:
return DatasetD.dataset(instruments, fields, start_time, end_time, freq, disk_cache)
except TypeError:
@@ -1035,15 +1026,34 @@ class ClientProvider(BaseProvider):
DatasetD.set_conn(self.client)
Cal = Wrapper()
Inst = Wrapper()
FeatureD = Wrapper()
ExpressionD = Wrapper()
DatasetD = Wrapper()
D = Wrapper()
import sys
if sys.version_info >= (3, 9):
from typing import Annotated
CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
else:
CalendarProviderWrapper = CalendarProvider
InstrumentProviderWrapper = InstrumentProvider
FeatureProviderWrapper = FeatureProvider
ExpressionProviderWrapper = ExpressionProvider
DatasetProviderWrapper = DatasetProvider
BaseProviderWrapper = BaseProvider
Cal: CalendarProviderWrapper = Wrapper()
Inst: InstrumentProviderWrapper = Wrapper()
FeatureD: FeatureProviderWrapper = Wrapper()
ExpressionD: ExpressionProviderWrapper = Wrapper()
DatasetD: DatasetProviderWrapper = Wrapper()
D: BaseProviderWrapper = Wrapper()
def register_all_wrappers():
def register_all_wrappers(C):
"""register_all_wrappers"""
logger = get_module_logger("data")
module = get_module_by_module_path("qlib.data")
@@ -1052,7 +1062,7 @@ def register_all_wrappers():
if getattr(C, "calendar_cache", None) is not None:
_calendar_provider = init_instance_by_config(C.calendar_cache, module, provide=_calendar_provider)
register_wrapper(Cal, _calendar_provider, "qlib.data")
logger.debug(f"registering Cal {C.calendar_provider}-{C.calenar_cache}")
logger.debug(f"registering Cal {C.calendar_provider}-{C.calendar_cache}")
register_wrapper(Inst, C.instrument_provider, "qlib.data")
logger.debug(f"registering Inst {C.instrument_provider}")

View File

@@ -76,18 +76,22 @@ class DatasetH(Dataset):
- The processing is related to data split.
"""
def __init__(self, handler: Union[dict, DataHandler], segments: list):
def __init__(self, handler: Union[dict, DataHandler], segments: dict):
"""
Parameters
----------
handler : Union[dict, DataHandler]
handler will be passed into setup_data.
segments : list
segments : dict
segments will be passed into setup_data.
"""
super().__init__(handler, segments)
def setup_data(self, handler: Union[dict, DataHandler], segments: list):
def init(self, **kwargs):
"""Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
self.handler.init(**kwargs)
def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
"""
Setup the underlying data.
@@ -100,7 +104,7 @@ class DatasetH(Dataset):
- config of `DataHandler`. Please refer to `DataHandler`
segments : list
segments : dict
Describe the options to segment the data.
Here are some examples:
@@ -116,8 +120,8 @@ class DatasetH(Dataset):
'outsample': ("2017-01-01", "2020-08-01",),
}
"""
self._handler = init_instance_by_config(handler, accept_types=DataHandler)
self._segments = segments.copy()
self.handler = init_instance_by_config(handler, accept_types=DataHandler)
self.segments = segments.copy()
def _prepare_seg(self, slc: slice, **kwargs):
"""
@@ -127,7 +131,7 @@ class DatasetH(Dataset):
----------
slc : slice
"""
return self._handler.fetch(slc, **kwargs)
return self.handler.fetch(slc, **kwargs)
def prepare(
self,
@@ -150,7 +154,7 @@ class DatasetH(Dataset):
- ['train', 'valid']
col_set : str
The col_set will be passed to self._handler when fetching data.
The col_set will be passed to self.handler when fetching data.
data_key : str
The data to fetch: DK_*
Default is DK_I, which indicate fetching data for **inference**.
@@ -166,16 +170,16 @@ class DatasetH(Dataset):
logger = get_module_logger("DatasetH")
fetch_kwargs = {"col_set": col_set}
fetch_kwargs.update(kwargs)
if "data_key" in getfullargspec(self._handler.fetch).args:
if "data_key" in getfullargspec(self.handler.fetch).args:
fetch_kwargs["data_key"] = data_key
else:
logger.info(f"data_key[{data_key}] is ignored.")
# Handle all kinds of segments format
if isinstance(segments, (list, tuple)):
return [self._prepare_seg(slice(*self._segments[seg]), **fetch_kwargs) for seg in segments]
return [self._prepare_seg(slice(*self.segments[seg]), **fetch_kwargs) for seg in segments]
elif isinstance(segments, str):
return self._prepare_seg(slice(*self._segments[segments]), **fetch_kwargs)
return self._prepare_seg(slice(*self.segments[segments]), **fetch_kwargs)
elif isinstance(segments, slice):
return self._prepare_seg(segments, **fetch_kwargs)
else:
@@ -409,7 +413,7 @@ class TSDatasetH(DatasetH):
def setup_data(self, *args, **kwargs):
super().setup_data(*args, **kwargs)
cal = self._handler.fetch(col_set=self._handler.CS_RAW).index.get_level_values("datetime").unique()
cal = self.handler.fetch(col_set=self.handler.CS_RAW).index.get_level_values("datetime").unique()
cal = sorted(cal)
# Get the datetime index for building timestamp
self.cal = cal

View File

@@ -83,22 +83,42 @@ class DataHandler(Serializable):
# Setup data loader
assert data_loader is not None # to make start_time end_time could have None default value
# what data source to load data
self.data_loader = init_instance_by_config(
data_loader,
None if (isinstance(data_loader, dict) and "module_path" in data_loader) else data_loader_module,
accept_types=DataLoader,
)
# what data to be loaded from data source
# For IDE auto-completion.
self.instruments = instruments
self.start_time = start_time
self.end_time = end_time
self.fetch_orig = fetch_orig
if init_data:
with TimeInspector.logt("Init data"):
self.init()
super().__init__()
def init(self, enable_cache: bool = True):
def conf_data(self, **kwargs):
"""
configuration of data.
# what data to be loaded from data source
This method will be used when loading pickled handler from dataset.
The data will be initialized with different time range.
"""
attr_list = {"instruments", "start_time", "end_time"}
for k, v in kwargs.items():
if k in attr_list:
setattr(self, k, v)
else:
raise KeyError("Such config is not supported.")
def init(self, enable_cache: bool = False):
"""
initialize the data.
In case of running initialization multiple times, it will do nothing after the first time.
@@ -262,6 +282,7 @@ class DataHandlerLP(DataHandler):
infer_processors=[],
learn_processors=[],
process_type=PTYPE_A,
drop_raw=False,
**kwargs,
):
"""
@@ -303,6 +324,8 @@ class DataHandlerLP(DataHandler):
- self._learn will be processed by infer_processors + learn_processors
- (e.g. self._infer processed by learn_processors )
drop_raw: bool
Whether to drop the raw data
"""
# Setup preprocessor
@@ -319,6 +342,7 @@ class DataHandlerLP(DataHandler):
)
self.process_type = process_type
self.drop_raw = drop_raw
super().__init__(instruments, start_time, end_time, data_loader, **kwargs)
def get_all_processors(self):
@@ -348,7 +372,7 @@ class DataHandlerLP(DataHandler):
"""
# data for inference
_infer_df = self._data
if len(self.infer_processors) > 0: # avoid modifying the original data
if len(self.infer_processors) > 0 and not self.drop_raw: # avoid modifying the original data
_infer_df = _infer_df.copy()
for proc in self.infer_processors:
@@ -378,6 +402,9 @@ class DataHandlerLP(DataHandler):
_learn_df = proc(_learn_df)
self._learn = _learn_df
if self.drop_raw:
del self._data
# init type
IT_FIT_SEQ = "fit_seq" # the input of `fit` will be the output of the previous processor
IT_FIT_IND = "fit_ind" # the input of `fit` will be the original df
@@ -416,6 +443,10 @@ class DataHandlerLP(DataHandler):
# TODO: Be able to cache handler data. Save the memory for data processing
def _get_df_by_key(self, data_key: str = DK_I) -> pd.DataFrame:
if data_key == self.DK_R and self.drop_raw:
raise AttributeError(
"DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
)
df = getattr(self, {self.DK_R: "_data", self.DK_I: "_infer", self.DK_L: "_learn"}[data_key])
return df

View File

@@ -10,7 +10,9 @@ import pandas as pd
from typing import Tuple, Union
from qlib.data import D
from qlib.utils import load_dataset
from qlib.data import filter as filter_module
from qlib.data.filter import BaseDFilter
from qlib.utils import load_dataset, init_instance_by_config
class DataLoader(abc.ABC):
@@ -76,6 +78,7 @@ class DLWParser(DataLoader):
<config> := <fields_info>
<fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
# NOTE: list or tuple will be treated as the things when parsing
"""
self.is_group = isinstance(config, dict)
@@ -85,9 +88,15 @@ class DLWParser(DataLoader):
self.fields = self._parse_fields_info(config)
def _parse_fields_info(self, fields_info: Tuple[list, tuple]) -> Tuple[list, list]:
if isinstance(fields_info, list):
if len(fields_info) == 0:
raise ValueError("The size of fields must be greater than 0")
if not isinstance(fields_info, (list, tuple)):
raise TypeError("Unsupported type")
if isinstance(fields_info[0], str):
exprs = names = fields_info
elif isinstance(fields_info, tuple):
elif isinstance(fields_info[0], (list, tuple)):
exprs, names = fields_info
else:
raise NotImplementedError(f"This type of input is not supported")
@@ -132,7 +141,7 @@ class DLWParser(DataLoader):
class QlibDataLoader(DLWParser):
"""Same as QlibDataLoader. The fields can be define by config"""
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None):
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None, swap_level=True, freq="day"):
"""
Parameters
----------
@@ -140,8 +149,19 @@ class QlibDataLoader(DLWParser):
Please refer to the doc of DLWParser
filter_pipe :
Filter pipe for the instruments
swap_level :
Whether to swap level of MultiIndex
"""
if filter_pipe is not None:
assert isinstance(filter_pipe, list), "The type of `filter_pipe` must be list."
filter_pipe = [
init_instance_by_config(fp, None if "module_path" in fp else filter_module, accept_types=BaseDFilter)
for fp in filter_pipe
]
self.filter_pipe = filter_pipe
self.swap_level = swap_level
self.freq = freq
super().__init__(config)
def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
@@ -153,9 +173,10 @@ class QlibDataLoader(DLWParser):
elif self.filter_pipe is not None:
warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")
df = D.features(instruments, exprs, start_time, end_time)
df = D.features(instruments, exprs, start_time, end_time, self.freq)
df.columns = names
df = df.swaplevel().sort_index() # NOTE: always return <datetime, instrument>
if self.swap_level:
df = df.swaplevel().sort_index() # NOTE: if swaplevel, return <datetime, instrument>
return df

View File

@@ -6,6 +6,7 @@ from __future__ import division
from __future__ import print_function
import sys
import abc
import numpy as np
import pandas as pd
@@ -17,57 +18,12 @@ from ..log import get_module_logger
try:
from ._libs.rolling import rolling_slope, rolling_rsquare, rolling_resi
from ._libs.expanding import expanding_slope, expanding_rsquare, expanding_resi
except ImportError as err:
print("Do not import qlib package in the repository directory!")
except ImportError:
print(
"#### Do not import qlib package in the repository directory in case of importing qlib from . without compiling #####"
)
raise
__all__ = (
"Ref",
"Max",
"Min",
"Sum",
"Mean",
"Std",
"Var",
"Skew",
"Kurt",
"Med",
"Mad",
"Slope",
"Rsquare",
"Resi",
"Rank",
"Quantile",
"Count",
"EMA",
"WMA",
"Corr",
"Cov",
"Delta",
"Abs",
"Sign",
"Log",
"Power",
"Add",
"Sub",
"Mul",
"Div",
"Greater",
"Less",
"And",
"Or",
"Not",
"Gt",
"Ge",
"Lt",
"Le",
"Eq",
"Ne",
"Mask",
"IdxMax",
"IdxMin",
"If",
)
np.seterr(invalid="ignore")
@@ -77,12 +33,39 @@ np.seterr(invalid="ignore")
class ElemOperator(ExpressionOps):
    """Element-wise Operator

    Base class for operators that wrap exactly one feature.

    Parameters
    ----------
    feature : Expression
        feature instance

    Returns
    ----------
    Expression
        feature operation output
    """

    def __init__(self, feature):
        self.feature = feature

    def __str__(self):
        # Rendered as "<ClassName>(<feature>)"
        return f"{type(self).__name__}({self.feature})"

    def get_longest_back_rolling(self):
        # Delegated unchanged to the wrapped feature
        return self.feature.get_longest_back_rolling()

    def get_extended_window_size(self):
        # Delegated unchanged to the wrapped feature
        return self.feature.get_extended_window_size()
class NpElemOperator(ElemOperator):
"""Numpy Element-wise Operator
Parameters
----------
feature : Expression
feature instance
func : str
feature operation method
numpy feature operation method
Returns
----------
@@ -93,22 +76,14 @@ class ElemOperator(ExpressionOps):
def __init__(self, feature, func):
self.feature = feature
self.func = func
def __str__(self):
return "{}({})".format(type(self).__name__, self.feature)
super(NpElemOperator, self).__init__(feature)
def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return getattr(np, self.func)(series)
def get_longest_back_rolling(self):
return self.feature.get_longest_back_rolling()
def get_extended_window_size(self):
return self.feature.get_extended_window_size()
class Abs(ElemOperator):
class Abs(NpElemOperator):
"""Feature Absolute Value
Parameters
@@ -126,7 +101,7 @@ class Abs(ElemOperator):
super(Abs, self).__init__(feature, "abs")
class Sign(ElemOperator):
class Sign(NpElemOperator):
"""Feature Sign
Parameters
@@ -143,8 +118,17 @@ class Sign(ElemOperator):
def __init__(self, feature):
super(Sign, self).__init__(feature, "sign")
def _load_internal(self, instrument, start_index, end_index, freq):
    """
    Load the wrapped feature and apply the numpy function named by ``self.func``.

    The loaded data is cast to float32 first, to avoid the error raised by bool type input.
    """
    loaded = self.feature.load(instrument, start_index, end_index, freq)
    # TODO: More precision types should be configurable
    as_float = loaded.astype(np.float32)
    np_func = getattr(np, self.func)
    return np_func(as_float)
class Log(ElemOperator):
class Log(NpElemOperator):
"""Feature Log
Parameters
@@ -162,7 +146,7 @@ class Log(ElemOperator):
super(Log, self).__init__(feature, "log")
class Power(ElemOperator):
class Power(NpElemOperator):
"""Feature Power
Parameters
@@ -188,7 +172,7 @@ class Power(ElemOperator):
return getattr(np, self.func)(series, self.exponent)
class Mask(ElemOperator):
class Mask(NpElemOperator):
"""Feature Mask
Parameters
@@ -215,7 +199,7 @@ class Mask(ElemOperator):
return self.feature.load(self.instrument, start_index, end_index, freq)
class Not(ElemOperator):
class Not(NpElemOperator):
"""Not Operator
Parameters
@@ -254,28 +238,13 @@ class PairOperator(ExpressionOps):
two features' operation output
"""
def __init__(self, feature_left, feature_right, func):
def __init__(self, feature_left, feature_right):
self.feature_left = feature_left
self.feature_right = feature_right
self.func = func
def __str__(self):
return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)
def _load_internal(self, instrument, start_index, end_index, freq):
assert any(
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
), "at least one of two inputs is Expression instance"
if isinstance(self.feature_left, Expression):
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
else:
series_left = self.feature_left # numeric value
if isinstance(self.feature_right, Expression):
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
else:
series_right = self.feature_right
return getattr(np, self.func)(series_left, series_right)
def get_longest_back_rolling(self):
if isinstance(self.feature_left, Expression):
left_br = self.feature_left.get_longest_back_rolling()
@@ -301,7 +270,46 @@ class PairOperator(ExpressionOps):
return max(ll, rl), max(lr, rr)
class Add(PairOperator):
class NpPairOperator(PairOperator):
    """Numpy Pair-wise operator

    Parameters
    ----------
    feature_left : Expression
        feature instance or numeric value
    feature_right : Expression
        feature instance or numeric value
    func : str
        operator function; looked up by name on the ``numpy`` module

    Returns
    ----------
    Feature:
        two features' operation output
    """

    def __init__(self, feature_left, feature_right, func):
        self.func = func
        # PairOperator.__init__ stores feature_left / feature_right
        super(NpPairOperator, self).__init__(feature_left, feature_right)

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load both operands and combine them with ``getattr(np, self.func)``.

        Operands that are not ``Expression`` instances are passed to numpy as-is.

        Raises
        ------
        AssertionError
            if neither operand is an ``Expression`` instance.
        """
        # NOTE: both entries must be isinstance checks; the bare `Expression` class
        # object is always truthy and would make this assertion a no-op.
        assert any(
            [isinstance(self.feature_left, Expression), isinstance(self.feature_right, Expression)]
        ), "at least one of two inputs is Expression instance"
        if isinstance(self.feature_left, Expression):
            series_left = self.feature_left.load(instrument, start_index, end_index, freq)
        else:
            series_left = self.feature_left  # numeric value
        if isinstance(self.feature_right, Expression):
            series_right = self.feature_right.load(instrument, start_index, end_index, freq)
        else:
            series_right = self.feature_right  # numeric value
        return getattr(np, self.func)(series_left, series_right)
class Add(NpPairOperator):
"""Add Operator
Parameters
@@ -321,7 +329,7 @@ class Add(PairOperator):
super(Add, self).__init__(feature_left, feature_right, "add")
class Sub(PairOperator):
class Sub(NpPairOperator):
"""Subtract Operator
Parameters
@@ -341,7 +349,7 @@ class Sub(PairOperator):
super(Sub, self).__init__(feature_left, feature_right, "subtract")
class Mul(PairOperator):
class Mul(NpPairOperator):
"""Multiply Operator
Parameters
@@ -361,7 +369,7 @@ class Mul(PairOperator):
super(Mul, self).__init__(feature_left, feature_right, "multiply")
class Div(PairOperator):
class Div(NpPairOperator):
"""Division Operator
Parameters
@@ -381,7 +389,7 @@ class Div(PairOperator):
super(Div, self).__init__(feature_left, feature_right, "divide")
class Greater(PairOperator):
class Greater(NpPairOperator):
"""Greater Operator
Parameters
@@ -401,7 +409,7 @@ class Greater(PairOperator):
super(Greater, self).__init__(feature_left, feature_right, "maximum")
class Less(PairOperator):
class Less(NpPairOperator):
"""Less Operator
Parameters
@@ -421,7 +429,7 @@ class Less(PairOperator):
super(Less, self).__init__(feature_left, feature_right, "minimum")
class Gt(PairOperator):
class Gt(NpPairOperator):
"""Greater Than Operator
Parameters
@@ -441,7 +449,7 @@ class Gt(PairOperator):
super(Gt, self).__init__(feature_left, feature_right, "greater")
class Ge(PairOperator):
class Ge(NpPairOperator):
"""Greater Equal Than Operator
Parameters
@@ -461,7 +469,7 @@ class Ge(PairOperator):
super(Ge, self).__init__(feature_left, feature_right, "greater_equal")
class Lt(PairOperator):
class Lt(NpPairOperator):
"""Less Than Operator
Parameters
@@ -481,7 +489,7 @@ class Lt(PairOperator):
super(Lt, self).__init__(feature_left, feature_right, "less")
class Le(PairOperator):
class Le(NpPairOperator):
"""Less Equal Than Operator
Parameters
@@ -501,7 +509,7 @@ class Le(PairOperator):
super(Le, self).__init__(feature_left, feature_right, "less_equal")
class Eq(PairOperator):
class Eq(NpPairOperator):
"""Equal Operator
Parameters
@@ -521,7 +529,7 @@ class Eq(PairOperator):
super(Eq, self).__init__(feature_left, feature_right, "equal")
class Ne(PairOperator):
class Ne(NpPairOperator):
"""Not Equal Operator
Parameters
@@ -541,7 +549,7 @@ class Ne(PairOperator):
super(Ne, self).__init__(feature_left, feature_right, "not_equal")
class And(PairOperator):
class And(NpPairOperator):
"""And Operator
Parameters
@@ -561,7 +569,7 @@ class And(PairOperator):
super(And, self).__init__(feature_left, feature_right, "bitwise_and")
class Or(PairOperator):
class Or(NpPairOperator):
"""Or Operator
Parameters
@@ -1430,3 +1438,93 @@ class Cov(PairRolling):
def __init__(self, feature_left, feature_right, N):
super(Cov, self).__init__(feature_left, feature_right, N, "cov")
OpsList = [
Ref,
Max,
Min,
Sum,
Mean,
Std,
Var,
Skew,
Kurt,
Med,
Mad,
Slope,
Rsquare,
Resi,
Rank,
Quantile,
Count,
EMA,
WMA,
Corr,
Cov,
Delta,
Abs,
Sign,
Log,
Power,
Add,
Sub,
Mul,
Div,
Greater,
Less,
And,
Or,
Not,
Gt,
Ge,
Lt,
Le,
Eq,
Ne,
Mask,
IdxMax,
IdxMin,
If,
]
class OpsWrapper:
    """Ops Wrapper

    Registry that maps an operator class name to the operator class and exposes
    every registered operator as an attribute (``wrapper.<OperatorName>``).
    """

    def __init__(self):
        self._ops = {}

    def reset(self):
        """Drop every registered operator."""
        self._ops = {}

    def register(self, ops_list):
        """Register operator classes under their ``__name__``.

        Parameters
        ----------
        ops_list : iterable of type
            operator classes; each one must be a subclass of ExpressionOps.

        Raises
        ------
        TypeError
            if an entry is not a subclass of ExpressionOps.
        """
        for operator in ops_list:
            if not issubclass(operator, ExpressionOps):
                raise TypeError("operator must be subclass of ExpressionOps, not {}".format(operator))

            if operator.__name__ in self._ops:
                get_module_logger(self.__class__.__name__).warning(
                    "The custom operator [{}] will override the qlib default definition".format(operator.__name__)
                )
            self._ops[operator.__name__] = operator

    def __getattr__(self, key):
        """Return the operator registered as ``key``.

        Raises
        ------
        AttributeError
            if no operator with that name is registered.
        """
        # Read `_ops` through __dict__: __getattr__ is also invoked before __init__
        # has run (e.g. copy / unpickle probing `__setstate__`), and `self._ops`
        # would then re-enter __getattr__ and recurse until RecursionError.
        ops = self.__dict__.get("_ops", {})
        if key not in ops:
            raise AttributeError("The operator [{0}] is not registered".format(key))
        return ops[key]
Operators = OpsWrapper()
def register_all_ops(C):
    """Reset the global ``Operators`` registry and register all operators.

    The operators in ``OpsList`` are registered first; when ``C`` has a
    non-None ``custom_ops`` attribute, those operators are registered afterwards.
    """
    ops_logger = get_module_logger("ops")

    Operators.reset()
    Operators.register(OpsList)

    if getattr(C, "custom_ops", None) is None:
        return
    Operators.register(C.custom_ops)
    ops_logger.debug("register custom operator {}".format(C.custom_ops))

View File

@@ -36,7 +36,7 @@ def get_module_logger(module_name, level=None):
return module_logger
class TimeInspector(object):
class TimeInspector:
timer_logger = get_module_logger("timer", level=logging.WARNING)

View File

@@ -30,11 +30,6 @@ class Model(BaseModel):
The attribute names of learned model should `not` start with '_'. So that the model could be
dumped to disk.
Parameters
----------
dataset : Dataset
dataset will generate the processed data from model training.
The following code example shows how to retrieve `x_train`, `y_train` and `w_train` from the `dataset`:
.. code-block:: Python
@@ -53,6 +48,12 @@ class Model(BaseModel):
except KeyError as e:
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)
w_valid = pd.DataFrame(np.ones_like(y_valid.values), index=y_valid.index)
Parameters
----------
dataset : Dataset
dataset will generate the processed data from model training.
"""
raise NotImplementedError()

View File

@@ -9,7 +9,7 @@ import scipy.optimize as so
from typing import Optional, Union, Callable, List
class PortfolioOptimizer(object):
class PortfolioOptimizer:
"""Portfolio Optimizer
The following optimization algorithms are supported:

View File

@@ -7,6 +7,9 @@ from ..config import REG_CN
class TestAutoData(unittest.TestCase):
_setup_kwargs = {}
@classmethod
def setUpClass(cls) -> None:
# use default data
@@ -15,6 +18,10 @@ class TestAutoData(unittest.TestCase):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(
name="qlib_data_simple", region="cn", version="latest", interval="1d", target_dir=provider_uri
name="qlib_data_simple",
region="cn",
interval="1d",
target_dir=provider_uri,
delete_old=False,
)
init(provider_uri=provider_uri, region=REG_CN)
init(provider_uri=provider_uri, region=REG_CN, **cls._setup_kwargs)

View File

@@ -1,14 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import re
import qlib
import shutil
import zipfile
import requests
import datetime
from tqdm import tqdm
from pathlib import Path
from loguru import logger
class GetData:
DATASET_VERSION = "v1"
REMOTE_URL = "http://fintech.msra.cn/stock_data/downloads"
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
def __init__(self, delete_zip_file=False):
"""
@@ -20,41 +27,92 @@ class GetData:
"""
self.delete_zip_file = delete_zip_file
def _download_data(self, file_name: str, target_dir: [Path, str]):
def normalize_dataset_version(self, dataset_version: str = None):
    """Return ``dataset_version``, falling back to ``self.DATASET_VERSION`` when it is None."""
    return self.DATASET_VERSION if dataset_version is None else dataset_version
def merge_remote_url(self, file_name: str, dataset_version: str = None):
    """Build the download URL ``<REMOTE_URL>/<dataset version>/<file_name>``.

    ``dataset_version`` is passed through ``self.normalize_dataset_version`` first.
    """
    base_url = self.REMOTE_URL
    version = self.normalize_dataset_version(dataset_version)
    return "{}/{}/{}".format(base_url, version, file_name)
def _download_data(
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
):
target_dir = Path(target_dir).expanduser()
target_dir.mkdir(exist_ok=True, parents=True)
# saved file name
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
target_path = target_dir.joinpath(_target_file_name)
url = f"{self.REMOTE_URL}/{file_name}"
target_path = target_dir.joinpath(file_name)
url = self.merge_remote_url(file_name, dataset_version)
resp = requests.get(url, stream=True)
if resp.status_code != 200:
raise requests.exceptions.HTTPError()
chuck_size = 1024
chunk_size = 1024
logger.warning(
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
)
logger.info(f"{file_name} downloading......")
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
with target_path.open("wb") as fp:
for chuck in resp.iter_content(chunk_size=chuck_size):
fp.write(chuck)
p_bar.update(chuck_size)
for chunk in resp.iter_content(chunk_size=chunk_size):
fp.write(chunk)
p_bar.update(chunk_size)
self._unzip(target_path, target_dir)
self._unzip(target_path, target_dir, delete_old)
if self.delete_zip_file:
target_path.unlike()
target_path.unlink()
def check_dataset(self, file_name: str, dataset_version: str = None):
    """Check whether ``file_name`` is available on the remote server.

    Parameters
    ----------
    file_name : str
        name of the remote file
    dataset_version : str
        dataset version; None is resolved by ``merge_remote_url``

    Returns
    -------
    bool
        False only when the server answers 404; any other status code yields True.
    """
    url = self.merge_remote_url(file_name, dataset_version)
    # stream=True defers downloading the body; the context manager closes the
    # response so the connection is released instead of being leaked.
    with requests.get(url, stream=True) as resp:
        return resp.status_code != 404
@staticmethod
def _unzip(file_path: Path, target_dir: Path):
def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True):
if delete_old:
logger.warning(
f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}"
)
GetData._delete_qlib_data(target_dir)
logger.info(f"{file_path} unzipping......")
with zipfile.ZipFile(str(file_path.resolve()), "r") as zp:
for _file in tqdm(zp.namelist()):
zp.extract(_file, str(target_dir.resolve()))
@staticmethod
def _delete_qlib_data(file_dir: Path):
    """Delete the qlib data sub-directories found under ``file_dir``.

    Only the existing ones among features, calendars, instruments, features_cache
    and dataset_cache are removed. The user is asked for confirmation on stdin;
    any answer other than ``Y``/``y`` terminates the process via ``exit()``.
    Nothing is asked or removed when none of the sub-directories exists.
    """
    logger.info(f"delete {file_dir}")
    sub_dirs = ("features", "calendars", "instruments", "features_cache", "dataset_cache")
    candidates = (file_dir.joinpath(sub_dir) for sub_dir in sub_dirs)
    targets = [str(path.resolve()) for path in candidates if path.exists()]
    if not targets:
        return

    answer = input(
        f"Will be deleted: "
        f"\n\t{targets}"
        f"\nIf you do not need to delete {file_dir}, please change the <--target_dir>"
        f"\nAre you sure you want to delete, yes(Y/y), no (N/n):"
    )
    if str(answer) not in ("Y", "y"):
        exit()
    for target in targets:
        logger.warning(f"delete: {target}")
        shutil.rmtree(target)
def qlib_data(
self, name="qlib_data", target_dir="~/.qlib/qlib_data/cn_data", version="latest", interval="1d", region="cn"
self,
name="qlib_data",
target_dir="~/.qlib/qlib_data/cn_data",
version=None,
interval="1d",
region="cn",
delete_old=True,
):
"""download cn qlib data from remote
@@ -65,20 +123,31 @@ class GetData:
name: str
dataset name, value from [qlib_data, qlib_data_simple], by default qlib_data
version: str
data version, value from [v0, v1, ..., latest], by default latest
data version, value from [v1, ...], by default None(use script to specify version)
interval: str
data freq, value from [1d], by default 1d
region: str
data region, value from [cn, us], by default cn
delete_old: bool
delete an existing directory, by default True
Examples
---------
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --version latest --interval 1d --region cn
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
-------
"""
file_name = f"{name}_{region.lower()}_{interval.lower()}_{version}.zip"
self._download_data(file_name.lower(), target_dir)
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
def _get_file_name(v):
return self.QLIB_DATA_NAME.format(
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
)
file_name = _get_file_name(qlib_version)
if not self.check_dataset(file_name, version):
file_name = _get_file_name("latest")
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
"""download cn csv data from remote

View File

@@ -27,7 +27,7 @@ from pathlib import Path
from typing import Union, Tuple
from ..config import C
from ..log import get_module_logger
from ..log import get_module_logger, set_log_with_config
log = get_module_logger("utils")
@@ -162,7 +162,7 @@ def parse_field(field):
# - $open+$close -> Feature("open")+Feature("close")
if not isinstance(field, str):
field = str(field)
return re.sub(r"\$(\w+)", r'Feature("\1")', field)
return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field))
def get_module_by_module_path(module_path):
@@ -279,8 +279,10 @@ def compare_dict_value(src_data: dict, dst_data: dict):
def create_save_path(save_path=None):
"""Create save path
:param save_path:
:return:
Parameters
----------
save_path: str
"""
if save_path:
if not os.path.exists(save_path):
@@ -471,30 +473,28 @@ def is_tradable_date(cur_date):
return str(cur_date.date()) == str(D.calendar(start_time=cur_date, future=True)[0].date())
def get_date_range(trading_date, shift, future=False):
def get_date_range(trading_date, left_shift=0, right_shift=0, future=False):
"""get trading date range by shift
:param trading_date:
:param shift: int
:param future: bool
:return:
Parameters
----------
trading_date: pd.Timestamp
left_shift: int
right_shift: int
future: bool
"""
from ..data import D
calendar = D.calendar(future=future)
if pd.to_datetime(trading_date) not in list(calendar):
raise ValueError("{} is not trading day!".format(str(trading_date)))
day_index = bisect.bisect_left(calendar, trading_date)
if 0 <= (day_index + shift) < len(calendar):
if shift > 0:
return calendar[day_index + 1 : day_index + 1 + shift]
else:
return calendar[day_index + shift : day_index]
else:
return calendar
start = get_date_by_shift(trading_date, left_shift, future=future)
end = get_date_by_shift(trading_date, right_shift, future=future)
calendar = D.calendar(start, end, future=future)
return calendar
def get_date_by_shift(trading_date, shift, future=False):
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
"""get trading date with shift bias wil cur_date
e.g. : shift == 1, return next trading date
shift == -1, return previous trading date
@@ -502,8 +502,22 @@ def get_date_by_shift(trading_date, shift, future=False):
trading_date : pandas.Timestamp
current date
shift : int
clip_shift: bool
"""
return get_date_range(trading_date, shift, future)[0 if shift < 0 else -1] if shift != 0 else trading_date
from qlib.data import D
cal = D.calendar(future=future)
if pd.to_datetime(trading_date) not in list(cal):
raise ValueError("{} is not trading day!".format(str(trading_date)))
_index = bisect.bisect_left(cal, trading_date)
shift_index = _index + shift
if shift_index < 0 or shift_index >= len(cal):
if clip_shift:
shift_index = np.clip(shift_index, 0, len(cal) - 1)
else:
raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
return cal[shift_index]
def get_next_trading_date(trading_date, future=False):
@@ -629,15 +643,28 @@ def exists_qlib_data(qlib_dir):
# check instruments
code_names = set(map(lambda x: x.name.lower(), features_dir.iterdir()))
_instrument = instruments_dir.joinpath("all.txt")
df = pd.read_csv(_instrument, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
df = df.iloc[:, [0, -1]].fillna(axis=1, method="ffill")
miss_code = set(df.iloc[:, -1].apply(str.lower)) - set(code_names)
miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names)
if miss_code and any(map(lambda x: "sht" not in x, miss_code)):
return False
return True
def check_qlib_data(qlib_config):
    """Assert that every instruments file of the qlib data has exactly 3 columns.

    Parameters
    ----------
    qlib_config : dict
        must provide ``provider_uri``; the ``*.txt`` files in its ``instruments``
        sub-directory are checked.

    Raises
    ------
    AssertionError
        if a file's first row does not have exactly 3 tab-separated columns.
    """
    instruments_dir = Path(qlib_config["provider_uri"]).joinpath("instruments")
    for inst_file in instruments_dir.glob("*.txt"):
        # nrows=0: only the column layout is needed, no data rows are parsed
        assert len(pd.read_csv(inst_file, sep="\t", nrows=0, header=None).columns) == 3, (
            f"\nThe {str(inst_file.resolve())} of qlib data is not equal to 3 columns:"
            f"\n\tIf you are using the data provided by qlib: "
            f"https://qlib.readthedocs.io/en/latest/component/data.html#qlib-format-dataset"
            f"\n\tIf you are using your own data, please dump the data again: "
            f"https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format"
        )
def lazy_sort_index(df: pd.DataFrame, axis=0) -> pd.DataFrame:
"""
make the df index sorted
@@ -686,7 +713,7 @@ def flatten_dict(d, parent_key="", sep="."):
#################### Wrapper #####################
class Wrapper(object):
class Wrapper:
"""Wrapper class for anything that needs to set up during qlib.init"""
def __init__(self):
@@ -728,3 +755,36 @@ def load_dataset(path_or_obj):
elif extension == ".csv":
return pd.read_csv(path_or_obj, parse_dates=True, index_col=[0, 1])
raise ValueError(f"unsupported file type `{extension}`")
def code_to_fname(code: str):
    """stock code to file name

    Codes matching (case-insensitively) one of the names below are returned with
    the ``_qlib_`` prefix; any other code is returned unchanged.

    Parameters
    ----------
    code: str
    """
    # NOTE: In windows, the following name is I/O device, and the file with the corresponding name cannot be created
    # reference: https://superuser.com/questions/86999/why-cant-i-name-a-folder-or-file-con-in-windows
    reserved = {"CON", "PRN", "AUX", "NUL"}
    reserved.update(f"{device}{index}" for device in ("COM", "LPT") for index in range(10))

    text = str(code)
    return "_qlib_" + text if text.upper() in reserved else code
def fname_to_code(fname: str):
    """file name to stock code

    Removes one leading ``_qlib_`` prefix when present; any other name is
    returned unchanged.

    Parameters
    ----------
    fname: str
    """
    prefix = "_qlib_"
    if fname.startswith(prefix):
        # NOTE: slice instead of `lstrip(prefix)`: lstrip treats its argument as a
        # set of characters and would also eat leading characters of the code itself
        # (e.g. "_qlib_lpt1" -> "pt1").
        fname = fname[len(prefix) :]
    return fname

View File

@@ -27,11 +27,6 @@ class Serializable:
def dump_all(self):
"""
will the object dump all object
Parameters
----------
self : [TODO:type]
[TODO:description]
"""
return getattr(self, "_dump_all", False)
@@ -39,11 +34,6 @@ class Serializable:
def exclude(self):
"""
What attribute will be dumped
Parameters
----------
self : [TODO:type]
[TODO:description]
"""
return getattr(self, "_exclude", [])

View File

@@ -3,6 +3,7 @@
from contextlib import contextmanager
from .expm import MLflowExpManager
from .exp import Experiment
from .recorder import Recorder
from ..utils import Wrapper
@@ -165,7 +166,7 @@ class QlibRecorder:
"""
return self.get_exp(experiment_id, experiment_name).list_recorders()
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True):
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True) -> Experiment:
"""
Method for retrieving an experiment with given id or name. Once the `create` argument is set to
True, if no valid experiment is found, this method will create one for you. Otherwise, it will
@@ -461,5 +462,14 @@ class QlibRecorder:
self.get_exp().get_recorder().set_tags(**kwargs)
import sys
if sys.version_info >= (3, 9):
from typing import Annotated
QlibRecorderWrapper = Annotated[QlibRecorder, Wrapper]
else:
QlibRecorderWrapper = QlibRecorder
# global record
R = Wrapper()
R: QlibRecorderWrapper = Wrapper()

View File

@@ -44,7 +44,7 @@ def sys_config(config, config_path):
# worflow handler function
def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
with open(config_path) as fp:
config = yaml.load(fp, Loader=yaml.Loader)
config = yaml.load(fp, Loader=yaml.SafeLoader)
# config the `sys` section
sys_config(config, config_path)

View File

@@ -65,13 +65,13 @@ class Experiment:
"""
raise NotImplementedError(f"Please implement the `end` method.")
def create_recorder(self, name=None):
def create_recorder(self, recorder_name=None):
"""
Create a recorder for each experiment.
Parameters
----------
name : str
recorder_name : str
the name of the recorder to be created.
Returns

View File

@@ -5,10 +5,9 @@ import re
import pandas as pd
from pathlib import Path
from pprint import pprint
from ..contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from ..contrib.evaluate import risk_analysis
from ..contrib.backtest import backtest as normal_backtest
from ..data.dataset import DatasetH
from ..data.dataset.handler import DataHandlerLP
from ..utils import init_instance_by_config, get_module_by_module_path
@@ -213,6 +212,11 @@ class SigAnaRecord(SignalRecord):
class PortAnaRecord(SignalRecord):
"""
This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class.
The following files will be stored in recorder
- report_normal.pkl & positions_normal.pkl:
- The return report and detailed positions of the backtest, returned by `qlib/contrib/evaluate.py:backtest`
- port_analysis.pkl : The risk analysis of your portfolio, returned by `qlib/contrib/evaluate.py:risk_analysis`
"""
artifact_path = "portfolio_analysis"
@@ -236,9 +240,14 @@ class PortAnaRecord(SignalRecord):
# custom strategy and get backtest
pred_score = super().load()
report_normal, positions_normal = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
report_dict = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
report_normal = report_dict.get("report_df")
positions_normal = report_dict.get("positions")
self.recorder.save_objects(**{"report_normal.pkl": report_normal}, artifact_path=PortAnaRecord.get_path())
self.recorder.save_objects(**{"positions_normal.pkl": positions_normal}, artifact_path=PortAnaRecord.get_path())
order_normal = report_dict.get("order_list")
if order_normal:
self.recorder.save_objects(**{"order_normal.pkl": order_normal}, artifact_path=PortAnaRecord.get_path())
# analysis
analysis = dict()

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
import mlflow
import shutil, os, pickle, tempfile, codecs
import shutil, os, pickle, tempfile, codecs, pickle
from pathlib import Path
from datetime import datetime
from ..utils.objm import FileManager
@@ -202,9 +202,6 @@ class MLflowRecorder(Recorder):
super(MLflowRecorder, self).__init__(experiment_id, name)
self._uri = uri
self.artifact_uri = None
# set up file manager for saving objects
self.temp_dir = tempfile.mkdtemp()
self.fm = FileManager(Path(self.temp_dir).absolute())
self.client = mlflow.tracking.MlflowClient(tracking_uri=self._uri)
# construct from mlflow run
if mlflow_run is not None:
@@ -248,16 +245,18 @@ class MLflowRecorder(Recorder):
self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if self.status != Recorder.STATUS_S:
self.status = status
shutil.rmtree(self.temp_dir)
def save_objects(self, local_path=None, artifact_path=None, **kwargs):
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."
if local_path is not None:
self.client.log_artifacts(self.id, local_path, artifact_path)
else:
temp_dir = Path(tempfile.mkdtemp()).resolve()
for name, data in kwargs.items():
self.fm.save_obj(data, name)
self.client.log_artifact(self.id, self.fm.path / name, artifact_path)
with (temp_dir / name).open("wb") as f:
pickle.dump(data, f)
self.client.log_artifact(self.id, temp_dir / name, artifact_path)
shutil.rmtree(temp_dir)
def load_object(self, name):
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."

View File

@@ -43,7 +43,7 @@ python get_data.py qlib_data --help
### US data
> Need to download data first: [Downlaod US Data](#Downlaod-US-Data)
> Need to download data first: [Download US Data](#Download-US-Data)
```python
import qlib

View File

@@ -1,28 +1,71 @@
import sys, platform
import sys
import platform
import qlib
import fire
import pkg_resources
from pathlib import Path
QLIB_PATH = Path(__file__).absolute().resolve().parent.parent
def linux_distribution():
    """Return ``platform.linux_distribution()``, or ``"N/A"`` when the call fails.

    Only ``Exception`` subclasses are swallowed, so KeyboardInterrupt / SystemExit
    still propagate.
    """
    try:
        return platform.linux_distribution()
    except Exception:
        return "N/A"
class InfoCollector:
    """
    User could collect system info by following commands
    `cd scripts && python collect_info.py all`
    - NOTE: please avoid running this script in the project folder which contains `qlib`
    """

    def sys(self):
        """collect system related info"""
        for method in ["system", "machine", "platform", "version"]:
            print(getattr(platform, method)())

    def py(self):
        """collect Python related info"""
        print("Python version: {}".format(sys.version.replace("\n", " ")))

    def qlib(self):
        """collect qlib related info

        Prints the qlib version followed by one ``<package>==<version>`` line per
        package listed below; a package that is not installed is reported as such
        instead of aborting the whole report.
        """
        print("Qlib version: {}".format(qlib.__version__))
        REQUIRED = [
            "numpy",
            "pandas",
            "scipy",
            "requests",
            "sacred",
            "python-socketio",
            "redis",
            "python-redis-lock",
            "schedule",
            "cvxpy",
            "hyperopt",
            "fire",
            "statsmodels",
            "xlrd",
            "plotly",
            "matplotlib",
            "tables",
            "pyyaml",
            "mlflow",
            "tqdm",
            "loguru",
            "lightgbm",
            "tornado",
            "joblib",
            "ruamel.yaml",
        ]

        for package in REQUIRED:
            try:
                version = pkg_resources.get_distribution(package).version
            except pkg_resources.DistributionNotFound:
                # keep collecting: the remaining lines are still useful in a report
                print(f"{package}: not installed")
                continue
            print(f"{package}=={version}")

    def all(self):
        """collect all info"""
        for method in ["sys", "py", "qlib"]:
            getattr(self, method)()
            print()
print()
print("Qlib version: {} \n".format(qlib.__version__))
print(
"""Python version: {} \n
linux_distribution: {}
system: {}
machine: {}
platform: {}
version: {}
""".format(
sys.version.split("\n"),
linux_distribution(),
platform.system(),
platform.machine(),
platform.platform(),
platform.version(),
)
)
if __name__ == "__main__":
fire.Fire(InfoCollector)

View File

@@ -5,6 +5,7 @@ import re
import time
import bisect
import pickle
import random
import requests
import functools
from pathlib import Path
@@ -17,6 +18,7 @@ from yahooquery import Ticker
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
CALENDAR_BENCH_URL_MAP = {
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
@@ -63,7 +65,29 @@ def get_calendar_list(bench_code="CSI300") -> list:
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
else:
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
if bench_code.upper() == "ALL":
@deco_retry
def _get_calendar(month):
    """Fetch the calendar entries of one month from ``SZSE_CALENDAR_URL``.

    Parameters
    ----------
    month : str
        value for the ``month`` query parameter (the caller passes ``%Y-%m`` strings)

    Returns
    -------
    list
        ``pd.Timestamp`` of the ``jyrq`` field for every entry of ``resp["data"]``
        whose ``jybz`` field is a non-zero integer (presumably the trading-day flag)

    Raises
    ------
    ValueError
        when the request fails or the response cannot be parsed; the original error is chained
    """
    _cal = []
    try:
        # random.random must be *called*: formatting the function object itself would put
        # its repr (not a number) into the ``random`` query parameter.
        resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
        for _r in resp["data"]:
            if int(_r["jybz"]):
                _cal.append(pd.Timestamp(_r["jyrq"]))
    except Exception as e:
        raise ValueError(f"{month}-->{e}") from e
    return _cal
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
calendar = []
for _m in month_range:
cal = _get_calendar(_m.strftime("%Y-%m"))
if cal:
calendar += cal
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
else:
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
_CALENDAR_MAP[bench_code] = calendar
logger.info(f"end of get calendar list: {bench_code}.")
return calendar

View File

@@ -18,23 +18,81 @@ pip install -r requirements.txt
## Collector Data
### Download data and Normalize data
```bash
python collector.py collector_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
```
### Download Data
### CN Data
#### 1d
```bash
python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1d", region="CN")
df = D.features(D.instruments("all"), ["$close"], freq="day")
```
### Normalize Data
#### 1min
```bash
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1min", region="CN")
df = D.features(D.instruments("all"), ["$close"], freq="1min")
```
### US Data
#### 1d
```bash
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_us_1d", region="US")
df = D.features(D.instruments("all"), ["$close"], freq="day")
```
### Help
```bash
python collector.py collector_data --help
@@ -42,5 +100,5 @@ pythono collector.py collector_data --help
## Parameters
- interval: 1m or 1d
- interval: 1min or 1d
- region: CN or US

File diff suppressed because it is too large Load Diff

View File

@@ -14,6 +14,7 @@ import numpy as np
import pandas as pd
from tqdm import tqdm
from loguru import logger
from qlib.utils import fname_to_code, code_to_fname
class DumpDataBase:
@@ -27,7 +28,6 @@ class DumpDataBase:
HIGH_FREQ_FORMAT = "%Y-%m-%d %H:%M:%S"
INSTRUMENTS_SEP = "\t"
INSTRUMENTS_FILE_NAME = "all.txt"
SAVE_INST_FIELD = "save_inst"
UPDATE_MODE = "update"
ALL_MODE = "all"
@@ -45,7 +45,6 @@ class DumpDataBase:
exclude_fields: str = "",
include_fields: str = "",
limit_nums: int = None,
inst_prefix: str = "",
):
"""
@@ -73,9 +72,6 @@ class DumpDataBase:
fields not dumped
limit_nums: int
Use when debugging, default None
inst_prefix: str
add a column to the instruments file and record the saved instrument name,
the US stock code contains "PRN", and the directory cannot be created on Windows system, use the "_" prefix.
"""
csv_path = Path(csv_path).expanduser()
if isinstance(exclude_fields, str):
@@ -84,7 +80,6 @@ class DumpDataBase:
include_fields = include_fields.split(",")
self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields)))
self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
self._inst_prefix = inst_prefix.strip()
self.file_suffix = file_suffix
self.symbol_field_name = symbol_field_name
self.csv_files = sorted(csv_path.glob(f"*{self.file_suffix}") if csv_path.is_dir() else [csv_path])
@@ -145,7 +140,7 @@ class DumpDataBase:
return df
def get_symbol_from_file(self, file_path: Path) -> str:
return file_path.name[: -len(self.file_suffix)].strip().lower()
return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower())
def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]:
return (
@@ -173,7 +168,6 @@ class DumpDataBase:
self.symbol_field_name,
self.INSTRUMENTS_START_FIELD,
self.INSTRUMENTS_END_FIELD,
self.SAVE_INST_FIELD,
],
)
@@ -190,13 +184,11 @@ class DumpDataBase:
instruments_path = str(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME).resolve())
if isinstance(instruments_data, pd.DataFrame):
_df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
if self._inst_prefix:
_df_fields.append(self.SAVE_INST_FIELD)
instruments_data[self.SAVE_INST_FIELD] = instruments_data[self.symbol_field_name].apply(
lambda x: f"{self._inst_prefix}{x}"
)
instruments_data = instruments_data.loc[:, _df_fields]
instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP)
instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
lambda x: fname_to_code(x.lower()).upper()
)
instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
else:
np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
@@ -223,26 +215,26 @@ class DumpDataBase:
logger.warning(f"{features_dir.name} data is None or empty")
return
# align index
_df = self.data_merge_calendar(df, self._calendars_list)
_df = self.data_merge_calendar(df, calendar_list)
# used when creating a bin file
date_index = self.get_datetime_index(_df, calendar_list)
for field in self.get_dump_fields(_df.columns):
bin_path = features_dir.joinpath(f"{field}.{self.freq}{self.DUMP_FILE_SUFFIX}")
if field not in _df.columns:
continue
if self._mode == self.UPDATE_MODE:
if bin_path.exists() and self._mode == self.UPDATE_MODE:
# update
with bin_path.open("ab") as fp:
np.array(_df[field]).astype("<f").tofile(fp)
elif self._mode == self.ALL_MODE:
np.hstack([date_index, _df[field]]).astype("<f").tofile(str(bin_path.resolve()))
else:
raise ValueError(f"{self._mode} cannot support!")
# append; self._mode == self.ALL_MODE or not bin_path.exists()
np.hstack([date_index, _df[field]]).astype("<f").tofile(str(bin_path.resolve()))
def _dump_bin(self, file_or_data: [Path, pd.DataFrame], calendar_list: List[pd.Timestamp]):
if isinstance(file_or_data, pd.DataFrame):
if file_or_data.empty:
return
code = file_or_data.iloc[0][self.symbol_field_name].lower()
code = fname_to_code(file_or_data.iloc[0][self.symbol_field_name].lower())
df = file_or_data
elif isinstance(file_or_data, Path):
code = self.get_symbol_from_file(file_or_data)
@@ -253,8 +245,7 @@ class DumpDataBase:
logger.warning(f"{code} data is None or empty")
return
# features save dir
code = self._inst_prefix + code if self._inst_prefix else code
features_dir = self._features_dir.joinpath(code)
features_dir = self._features_dir.joinpath(code_to_fname(code).lower())
features_dir.mkdir(parents=True, exist_ok=True)
self._data_to_bin(df, calendar_list, features_dir)
@@ -283,8 +274,6 @@ class DumpDataAll(DumpDataBase):
_end_time = self._format_datetime(_end_time)
symbol = self.get_symbol_from_file(file_path)
_inst_fields = [symbol.upper(), _begin_time, _end_time]
if self._inst_prefix:
_inst_fields.append(self._inst_prefix + symbol.upper())
date_range_list.append(f"{self.INSTRUMENTS_SEP.join(_inst_fields)}")
p_bar.update()
self._kwargs["all_datetime_set"] = all_datetime
@@ -323,12 +312,18 @@ class DumpDataFix(DumpDataAll):
def _dump_instruments(self):
logger.info("start dump instruments......")
_fun = partial(self._get_date, is_begin_end=True)
new_stock_files = sorted(filter(lambda x: x.name not in self._old_instruments, self.csv_files))
new_stock_files = sorted(
filter(
lambda x: fname_to_code(x.name[: -len(self.file_suffix)].strip().lower()).upper()
not in self._old_instruments,
self.csv_files,
)
)
with tqdm(total=len(new_stock_files)) as p_bar:
with ProcessPoolExecutor(max_workers=self.works) as execute:
for file_path, (_begin_time, _end_time) in zip(new_stock_files, execute.map(_fun, new_stock_files)):
if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
symbol = self.get_symbol_from_file(file_path).upper()
symbol = fname_to_code(self.get_symbol_from_file(file_path).lower()).upper()
_dt_map = self._old_instruments.setdefault(symbol, dict())
_dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
_dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
@@ -406,10 +401,10 @@ class DumpDataUpdate(DumpDataBase):
)
self._mode = self.UPDATE_MODE
self._old_calendar_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt"))
self._update_instruments = self._read_instruments(
self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)
).to_dict(
orient="index"
self._update_instruments = (
self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME))
.set_index([self.symbol_field_name])
.to_dict(orient="index")
) # type: dict
# load all csv files
@@ -425,10 +420,7 @@ class DumpDataUpdate(DumpDataBase):
all_df = []
def _read_csv(file_path: Path):
if self._include_fields:
_df = pd.read_csv(file_path, usecols=self._include_fields)
else:
_df = pd.read_csv(file_path)
_df = pd.read_csv(file_path, parse_dates=[self.date_field_name])
if self.symbol_field_name not in _df.columns:
_df[self.symbol_field_name] = self.get_symbol_from_file(file_path)
return _df
@@ -436,7 +428,7 @@ class DumpDataUpdate(DumpDataBase):
with tqdm(total=len(self.csv_files)) as p_bar:
with ThreadPoolExecutor(max_workers=self.works) as executor:
for df in executor.map(_read_csv, self.csv_files):
if df:
if not df.empty:
all_df.append(df)
p_bar.update()
@@ -455,25 +447,27 @@ class DumpDataUpdate(DumpDataBase):
with ProcessPoolExecutor(max_workers=self.works) as executor:
futures = {}
for _code, _df in self._all_data.groupby(self.symbol_field_name):
_code = str(_code).upper()
_code = fname_to_code(str(_code).lower()).upper()
_start, _end = self._get_date(_df, is_begin_end=True)
if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
continue
if _code in self._update_instruments:
self._update_instruments[_code]["end_time"] = _end
self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
futures[executor.submit(self._dump_bin, _df, self._update_calendars)] = _code
else:
# new stock
_dt_range = self._update_instruments.setdefault(_code, dict())
_dt_range["start_time"] = _start
_dt_range["end_time"] = _end
_dt_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_start)
_dt_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
futures[executor.submit(self._dump_bin, _df, self._new_calendar_list)] = _code
for _future in tqdm(as_completed(futures)):
try:
_future.result()
except Exception:
error_code[futures[_future]] = traceback.format_exc()
with tqdm(total=len(futures)) as p_bar:
for _future in as_completed(futures):
try:
_future.result()
except Exception:
error_code[futures[_future]] = traceback.format_exc()
p_bar.update()
logger.info(f"dump bin errors {error_code}")
logger.info("end of features dump.\n")
@@ -481,7 +475,9 @@ class DumpDataUpdate(DumpDataBase):
def dump(self):
self.save_calendars(self._new_calendar_list)
self._dump_features()
self.save_instruments(pd.DataFrame.from_dict(self._update_instruments, orient="index"))
df = pd.DataFrame.from_dict(self._update_instruments, orient="index")
df.index.names = [self.symbol_field_name]
self.save_instruments(df.reset_index())
if __name__ == "__main__":

View File

@@ -1,5 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import fire
from qlib.tests.data import GetData

View File

@@ -11,7 +11,7 @@ NAME = "pyqlib"
DESCRIPTION = "A Quantitative-research Platform"
REQUIRES_PYTHON = ">=3.5.0"
VERSION = "0.6.1"
VERSION = "0.6.2"
# Detect Cython
try:
@@ -35,7 +35,6 @@ REQUIRED = [
"scipy>=1.0.0",
"requests>=2.18.0",
"sacred>=0.7.4",
"pymongo==3.7.2",
"python-socketio==3.1.2",
"redis>=3.0.1",
"python-redis-lock>=3.3.1",

View File

@@ -66,7 +66,7 @@ class TestDataset(TestAutoData):
# Check the data
# Get data from DataFrame Directly
data_from_df = (
tsdh._handler.fetch(data_key=DataHandlerLP.DK_L)
tsdh.handler.fetch(data_key=DataHandlerLP.DK_L)
.loc(axis=0)["2015-01-01":"2016-12-31", "SZ300315"]
.iloc[-30:]
.values

View File

@@ -37,7 +37,7 @@ class TestGetData(unittest.TestCase):
def test_0_qlib_data(self):
GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", version="latest")
GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False)
df = D.features(D.instruments("csi300"), self.FIELDS)
self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed")
self.assertFalse(df.dropna().empty, "get qlib data failed")

View File

@@ -0,0 +1,69 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import unittest
import numpy as np
import qlib
from qlib.data import D
from qlib.data.ops import ElemOperator, PairOperator
from qlib.config import REG_CN
from qlib.utils import exists_qlib_data
from qlib.tests import TestAutoData
from qlib.tests.data import GetData
class Diff(ElemOperator):
    """First difference of a feature

    Parameters
    ----------
    feature : Expression
        the feature instance to take the difference of

    Returns
    ----------
    Expression
        a feature instance holding the first difference
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        # load the wrapped feature, then difference it
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        differenced = loaded.diff()
        return differenced

    def get_extended_window_size(self):
        # widen the wrapped feature's window by one step on the left side
        left, right = self.feature.get_extended_window_size()
        return left + 1, right
class Distance(PairOperator):
    """Absolute distance between two features

    Parameters
    ----------
    feature_left : Expression
        feature instance on the left side
    feature_right : Expression
        feature instance on the right side

    Returns
    ----------
    Expression
        a feature instance holding ``abs(feature_left - feature_right)``
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        # both operands are loaded with the very same arguments
        load_args = (instrument, start_index, end_index, freq)
        lhs = self.feature_left.load(*load_args)
        rhs = self.feature_right.load(*load_args)
        gap = lhs - rhs
        return np.abs(gap)
class TestRegiterCustomOps(TestAutoData):
    """Load features that are written with the custom ``Diff`` and ``Distance`` operators"""

    @classmethod
    def setUpClass(cls) -> None:
        # put the custom operators into the setup kwargs before the parent class sets up
        cls._setup_kwargs.update(custom_ops=[Diff, Distance])
        super().setUpClass()

    def test_regiter_custom_ops(self):
        fields = ["Diff($close)", "Distance($close, Ref($close, 1))"]
        instruments = ["SH600000"]
        loaded = D.features(instruments, fields, start_time="2010-01-01", end_time="2017-12-31", freq="day")
        print(loaded)
# Run the test cases of this module when it is executed as a script.
if __name__ == "__main__":
unittest.main()