mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-29 09:01:18 +08:00
Compare commits
148 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
97d354fa73 | ||
|
|
a87fb5a68c | ||
|
|
835b47a7e7 | ||
|
|
802dac81c9 | ||
|
|
bdc70c192a | ||
|
|
213f809148 | ||
|
|
f3fd5e0773 | ||
|
|
decf74cbdf | ||
|
|
b4a92d55f8 | ||
|
|
ebc31b9bdb | ||
|
|
56ebe9bf36 | ||
|
|
ddd68fc761 | ||
|
|
f50463aca9 | ||
|
|
c0e7cbc983 | ||
|
|
828993b397 | ||
|
|
8ef89b4fa8 | ||
|
|
76cf9dad99 | ||
|
|
f3eb02a0bd | ||
|
|
ffa68fd010 | ||
|
|
f6dd006c35 | ||
|
|
8c29105bca | ||
|
|
948b829ff4 | ||
|
|
304a0c3d7a | ||
|
|
02dea2aeb6 | ||
|
|
6fc4f2b249 | ||
|
|
2a5f06ee9e | ||
|
|
7f9216dc90 | ||
|
|
263ccdfe6f | ||
|
|
1a8f1bfc57 | ||
|
|
9dc11a9e3c | ||
|
|
3bdd54308b | ||
|
|
1b569d371d | ||
|
|
36e5c601de | ||
|
|
ae45711e2b | ||
|
|
bcc47aa4cb | ||
|
|
ee94634b23 | ||
|
|
2016ebbbb2 | ||
|
|
1eaf09cce1 | ||
|
|
7579f4b4c0 | ||
|
|
1a1c45981c | ||
|
|
e4ecea55e4 | ||
|
|
58616fced9 | ||
|
|
8e9ca22b07 | ||
|
|
6a145df87c | ||
|
|
06dbd02b99 | ||
|
|
ffedb6382f | ||
|
|
3f9f295a87 | ||
|
|
84d77f4585 | ||
|
|
afdf58b4fa | ||
|
|
2b6d16feb1 | ||
|
|
0a86a6f392 | ||
|
|
5da5ad4b9f | ||
|
|
dd07810b66 | ||
|
|
a762248d98 | ||
|
|
80c9a47e51 | ||
|
|
784e73bceb | ||
|
|
5ad1b4cc33 | ||
|
|
e85646762c | ||
|
|
fc81a39317 | ||
|
|
d44c5bb2b2 | ||
|
|
c622d3f6f8 | ||
|
|
6daaa79519 | ||
|
|
3dda2cb379 | ||
|
|
4fcfde7cfb | ||
|
|
3403c00b6b | ||
|
|
ecdfe49fd1 | ||
|
|
cc214a3462 | ||
|
|
65d8af41e7 | ||
|
|
0e0970f06e | ||
|
|
917261dbf6 | ||
|
|
6a9105e065 | ||
|
|
570bb272eb | ||
|
|
0524a47cf4 | ||
|
|
9abc0b0d4f | ||
|
|
fe60e40927 | ||
|
|
740c297618 | ||
|
|
b4a088efe8 | ||
|
|
b34890772f | ||
|
|
054ffa29f6 | ||
|
|
74e08c9e37 | ||
|
|
ea96c9e22d | ||
|
|
86e7c44c6b | ||
|
|
64cf2e2df8 | ||
|
|
4361a4049a | ||
|
|
231f37376b | ||
|
|
328cdeda4a | ||
|
|
4dbc8e52ec | ||
|
|
ba447d3448 | ||
|
|
df556532d0 | ||
|
|
18e040f506 | ||
|
|
aefc98b1d7 | ||
|
|
46c8d791ac | ||
|
|
afcd91a2d0 | ||
|
|
4a30d9d1ec | ||
|
|
2da2e9bd9e | ||
|
|
3e6877ff0f | ||
|
|
a0f32036a6 | ||
|
|
d8f36df7f4 | ||
|
|
cb3b6c5bde | ||
|
|
b11712fa54 | ||
|
|
660edeb94f | ||
|
|
95de4088df | ||
|
|
e8d7a22651 | ||
|
|
4a62b929ad | ||
|
|
5efe82fb56 | ||
|
|
40bbafcaab | ||
|
|
4c4f0f3c5e | ||
|
|
ae0e0eca3d | ||
|
|
7e37fa710a | ||
|
|
e0c460c33c | ||
|
|
53f501ac19 | ||
|
|
132df027a5 | ||
|
|
7d97fd39ce | ||
|
|
995fa98fc6 | ||
|
|
824de921d1 | ||
|
|
66d9bd1a68 | ||
|
|
1c0bb2f827 | ||
|
|
ea018ed4dc | ||
|
|
f3f1867b14 | ||
|
|
8bbfd8810c | ||
|
|
3f84c3768a | ||
|
|
7372a3a598 | ||
|
|
4b4cd38ca6 | ||
|
|
7d40ba753a | ||
|
|
9b60214e0c | ||
|
|
f7e775f941 | ||
|
|
aefbf3b5f1 | ||
|
|
3f85af05e5 | ||
|
|
192c2dc5ef | ||
|
|
911edd7839 | ||
|
|
3d47dd78c8 | ||
|
|
8f6ab0af54 | ||
|
|
cb0b6fcdaa | ||
|
|
6b8824dd29 | ||
|
|
c217e7c479 | ||
|
|
ea4fe1577b | ||
|
|
1bab07e419 | ||
|
|
422d1d8c93 | ||
|
|
c8f9b1162d | ||
|
|
e2bdef7ffe | ||
|
|
c10955d026 | ||
|
|
d642c7b6ea | ||
|
|
0cdc5e125a | ||
|
|
2de812f262 | ||
|
|
16450c2876 | ||
|
|
729b57e4a7 | ||
|
|
87cc52cd05 | ||
|
|
0be57d51be |
5
.github/ISSUE_TEMPLATE/bug-report.md
vendored
5
.github/ISSUE_TEMPLATE/bug-report.md
vendored
@@ -28,7 +28,8 @@ Steps to reproduce the behavior:
|
||||
|
||||
## Environment
|
||||
|
||||
**Note**: One could run `python scripts/collect_info.py` under the `qlib` directory to get the following information.
|
||||
**Note**: User could run `cd scripts && python collect_info.py all` under project directory to get system information
|
||||
and paste them here directly.
|
||||
|
||||
- Qlib version:
|
||||
- Python version:
|
||||
@@ -37,4 +38,4 @@ Steps to reproduce the behavior:
|
||||
|
||||
## Additional Notes
|
||||
|
||||
<!-- Add any other information about the problem here. -->
|
||||
<!-- Add any other information about the problem here. -->
|
||||
|
||||
62
.github/stale.yml
vendored
Normal file
62
.github/stale.yml
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
# Configuration for probot-stale - https://github.com/probot/stale
|
||||
|
||||
# Number of days of inactivity before an Issue or Pull Request becomes stale
|
||||
daysUntilStale: 60
|
||||
|
||||
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
|
||||
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
|
||||
daysUntilClose: 7
|
||||
|
||||
# Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled)
|
||||
onlyLabels: []
|
||||
|
||||
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
|
||||
exemptLabels:
|
||||
- bug
|
||||
- pinned
|
||||
- security
|
||||
- "[Status] Maybe Later"
|
||||
|
||||
# Set to true to ignore issues in a project (defaults to false)
|
||||
exemptProjects: false
|
||||
|
||||
# Set to true to ignore issues in a milestone (defaults to false)
|
||||
exemptMilestones: false
|
||||
|
||||
# Set to true to ignore issues with an assignee (defaults to false)
|
||||
exemptAssignees: false
|
||||
|
||||
# Label to use when marking as stale
|
||||
staleLabel: wontfix
|
||||
|
||||
# Comment to post when marking as stale. Set to `false` to disable
|
||||
markComment: >
|
||||
This issue has been automatically marked as stale because it has not had
|
||||
recent activity. It will be closed if no further activity occurs. Thank you
|
||||
for your contributions.
|
||||
|
||||
# Comment to post when removing the stale label.
|
||||
# unmarkComment: >
|
||||
# Your comment here.
|
||||
|
||||
# Comment to post when closing a stale Issue or Pull Request.
|
||||
# closeComment: >
|
||||
# Your comment here.
|
||||
|
||||
# Limit the number of actions per hour, from 1-30. Default is 30
|
||||
limitPerRun: 30
|
||||
|
||||
# Limit to only `issues` or `pulls`
|
||||
# only: issues
|
||||
|
||||
# Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
|
||||
# pulls:
|
||||
# daysUntilStale: 30
|
||||
# markComment: >
|
||||
# This pull request has been automatically marked as stale because it has not had
|
||||
# recent activity. It will be closed if no further activity occurs. Thank you
|
||||
# for your contributions.
|
||||
|
||||
# issues:
|
||||
# exemptLabels:
|
||||
# - confirmed
|
||||
88
.github/workflows/test.yml
vendored
88
.github/workflows/test.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [windows-latest, ubuntu-16.04, ubuntu-18.04, ubuntu-20.04, macos-latest]
|
||||
python-version: [3.6, 3.7, 3.8]
|
||||
python-version: [3.6, 3.7, 3.8, 3.9]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
@@ -22,9 +22,58 @@ jobs:
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Lint with Black
|
||||
run: |
|
||||
cd ..
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe -m pip install black
|
||||
$CONDA\\python.exe -m black qlib -l 120 --check --diff
|
||||
else
|
||||
sudo $CONDA/bin/python -m pip install black
|
||||
$CONDA/bin/python -m black qlib -l 120 --check --diff
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
# Test Qlib installed with pip
|
||||
- name: Install Qlib with pip
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe -m pip install pyqlib --ignore-installed ruamel.yaml --user
|
||||
else
|
||||
sudo $CONDA/bin/python -m pip install pyqlib --ignore-installed ruamel.yaml
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
- name: Install Lightgbm for MacOS
|
||||
if: runner.os == 'macOS'
|
||||
run: |
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
|
||||
HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
|
||||
|
||||
- name: Test data downloads
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
else
|
||||
$CONDA/bin/python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Test workflow by config (install from pip)
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
|
||||
$CONDA\\python.exe -m pip uninstall -y pyqlib
|
||||
else
|
||||
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
sudo $CONDA/bin/python -m pip uninstall -y pyqlib
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
# Test Qlib installed from source
|
||||
- name: Install Qlib from source
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe -m pip install --upgrade cython
|
||||
$CONDA\\python.exe -m pip install numpy jupyter jupyter_contrib_nbextensions
|
||||
@@ -36,13 +85,7 @@ jobs:
|
||||
sudo $CONDA/bin/python -m pip install -U scipy scikit-learn # installing without this line will cause errors on GitHub Actions, while installing locally won't
|
||||
sudo $CONDA/bin/python setup.py install
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Install Lightgbm for MacOS
|
||||
if: runner.os == 'macOS'
|
||||
run: |
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
|
||||
HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
|
||||
shell: bash
|
||||
|
||||
- name: Install test dependencies
|
||||
run: |
|
||||
@@ -54,16 +97,6 @@ jobs:
|
||||
sudo $CONDA/bin/python -m pip install black pytest
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Lint with Black
|
||||
run: |
|
||||
cd ..
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe -m black qlib -l 120 --check --diff
|
||||
else
|
||||
$CONDA/bin/python -m black qlib -l 120 --check --diff
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Unit tests with Pytest
|
||||
run: |
|
||||
@@ -73,22 +106,13 @@ jobs:
|
||||
else
|
||||
$CONDA/bin/python -m pytest . --durations=0
|
||||
fi
|
||||
shell: bash
|
||||
shell: bash
|
||||
|
||||
- name: Test data downloads
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
else
|
||||
$CONDA/bin/python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Test workflow by config
|
||||
- name: Test workflow by config (install from source)
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Windows" ]; then
|
||||
$CONDA\\python.exe qlib\\workflow\\cli.py examples\\benchmarks\\LightGBM\\workflow_config_lightgbm_Alpha158.yaml
|
||||
else
|
||||
$CONDA/bin/python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
fi
|
||||
shell: bash
|
||||
shell: bash
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,6 +2,7 @@
|
||||
__pycache__/
|
||||
|
||||
*.pyc
|
||||
*.pyd
|
||||
*.so
|
||||
*.ipynb
|
||||
.ipynb_checkpoints
|
||||
|
||||
62
README.md
62
README.md
@@ -34,6 +34,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
|
||||
- [More About Qlib](#more-about-qlib)
|
||||
- [Offline Mode and Online Mode](#offline-mode-and-online-mode)
|
||||
- [Performance of Qlib Data Server](#performance-of-qlib-data-server)
|
||||
- [Related Reports](#related-reports)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
|
||||
@@ -61,11 +62,27 @@ At the module level, Qlib is a platform that consists of the above components. T
|
||||
|
||||
This quick start guide tries to demonstrate
|
||||
1. It's very easy to build a complete Quant research workflow and try your ideas with _Qlib_.
|
||||
1. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
|
||||
2. Though with *public data* and *simple models*, machine learning technologies **work very well** in practical Quant investment.
|
||||
|
||||
Here is a quick **[demo](https://terminalizer.com/view/3f24561a4470)** that shows how to install ``Qlib``, and run LightGBM with ``qrun``. **But**, please make sure you have already prepared the data following the [instruction](#data-preparation).
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
Users can easily install ``Qlib`` by pip according to the following command
|
||||
This table demonstrates the supported Python version of `Qlib`:
|
||||
| | install with pip | install from source | plot |
|
||||
| ------------- |:---------------------:|:--------------------:|:----:|
|
||||
| Python 3.6 | :heavy_check_mark: | :heavy_check_mark: (only with `Anaconda`) | :heavy_check_mark: |
|
||||
| Python 3.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Python 3.8 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
|
||||
| Python 3.9 | :x: | :heavy_check_mark: | :x: |
|
||||
|
||||
**Note**:
|
||||
1. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
|
||||
2. For Python 3.9, `Qlib` supports running workflows such as training models, doing backtest and plot most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting for the *model performance* is not supported for now and we will fix this when the dependent packages are upgraded in the future.
|
||||
|
||||
### Install with pip
|
||||
Users can easily install ``Qlib`` by pip according to the following command.
|
||||
|
||||
```bash
|
||||
pip install pyqlib
|
||||
@@ -73,6 +90,7 @@ Users can easily install ``Qlib`` by pip according to the following command
|
||||
|
||||
**Note**: pip will install the latest stable qlib. However, the main branch of qlib is in active development. If you want to test the latest scripts or functions in the main branch, please install qlib with the methods below.
|
||||
|
||||
### Install from source
|
||||
Also, users can install the latest dev version ``Qlib`` by the source code according to the following steps:
|
||||
|
||||
* Before installing ``Qlib`` from source, users need to install some dependencies:
|
||||
@@ -81,7 +99,6 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
|
||||
pip install numpy
|
||||
pip install --upgrade cython
|
||||
```
|
||||
**Note**: Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
|
||||
|
||||
* Clone the repository and install ``Qlib`` as follows.
|
||||
* If you haven't installed qlib by the command ``pip install pyqlib`` before:
|
||||
@@ -94,7 +111,9 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
|
||||
git clone https://github.com/microsoft/qlib.git && cd qlib
|
||||
pip install .
|
||||
```
|
||||
**Note**: **Only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, which the command ``python setup.py install`` **can't**.
|
||||
**Note**: **Only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**.
|
||||
|
||||
**Tips**: If you fail to install `Qlib` or run the examples in your environment, comparing your steps and the [CI workflow](.github/workflows/test.yml) may help you find the problem.
|
||||
|
||||
## Data Preparation
|
||||
Load and prepare data by running the following code:
|
||||
@@ -138,12 +157,16 @@ Users could create the same dataset with it.
|
||||
## Auto Quant Research Workflow
|
||||
Qlib provides a tool named `qrun` to run the whole workflow automatically (including building dataset, training models, backtest and evaluation). You can start an auto quant research workflow and have a graphical reports analysis according to the following steps:
|
||||
|
||||
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm.yaml)) as following.
|
||||
1. Quant Research Workflow: Run `qrun` with lightgbm workflow config ([workflow_config_lightgbm_Alpha158.yaml](examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml)) as follows.
|
||||
```bash
|
||||
cd examples # Avoid running program under the directory contains `qlib`
|
||||
qrun benchmarks/LightGBM/workflow_config_lightgbm.yaml
|
||||
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
```
|
||||
The result of `qrun` is as follows, please refer to please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
|
||||
If users want to use `qrun` under debug mode, please use the following command:
|
||||
```bash
|
||||
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
```
|
||||
The result of `qrun` is as follows, please refer to [Intraday Trading](https://qlib.readthedocs.io/en/latest/component/backtest.html) for more details about the result.
|
||||
|
||||
```bash
|
||||
|
||||
@@ -198,16 +221,17 @@ The automatic workflow may not suite the research workflow of all Quant research
|
||||
# [Quant Model Zoo](examples/benchmarks)
|
||||
|
||||
Here is a list of models built on `Qlib`.
|
||||
- [GBDT based on LightGBM (Guolin Ke, et al.)](qlib/contrib/model/gbdt.py)
|
||||
- [GBDT based on Catboost (Liudmila Prokhorenkova, et al.)](qlib/contrib/model/catboost_model.py)
|
||||
- [GBDT based on XGBoost (Tianqi Chen, et al.)](qlib/contrib/model/xgboost.py)
|
||||
- [GBDT based on XGBoost (Tianqi Chen, et al. 2016)](qlib/contrib/model/xgboost.py)
|
||||
- [GBDT based on LightGBM (Guolin Ke, et al. 2017)](qlib/contrib/model/gbdt.py)
|
||||
- [GBDT based on Catboost (Liudmila Prokhorenkova, et al. 2017)](qlib/contrib/model/catboost_model.py)
|
||||
- [MLP based on pytorch](qlib/contrib/model/pytorch_nn.py)
|
||||
- [GRU based on pytorch (Kyunghyun Cho, et al.)](qlib/contrib/model/pytorch_gru.py)
|
||||
- [LSTM based on pytorcn (Sepp Hochreiter, et al.)](qlib/contrib/model/pytorch_lstm.py)
|
||||
- [ALSTM based on pytorcn (Yao Qin, et al.)](qlib/contrib/model/pytorch_alstm.py)
|
||||
- [GATs based on pytorch (Petar Velickovic, et al.)](qlib/contrib/model/pytorch_gats.py)
|
||||
- [SFM based on pytorch (Liheng Zhang, et al.)](qlib/contrib/model/pytorch_sfm.py)
|
||||
- [TFT based on tensorflow (Bryan Lim, et al.)](examples/benchmarks/TFT/tft.py)
|
||||
- [LSTM based on pytorch (Sepp Hochreiter, et al. 1997)](qlib/contrib/model/pytorch_lstm.py)
|
||||
- [GRU based on pytorch (Kyunghyun Cho, et al. 2014)](qlib/contrib/model/pytorch_gru.py)
|
||||
- [ALSTM based on pytorch (Yao Qin, et al. 2017)](qlib/contrib/model/pytorch_alstm.py)
|
||||
- [GATs based on pytorch (Petar Velickovic, et al. 2017)](qlib/contrib/model/pytorch_gats.py)
|
||||
- [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
|
||||
- [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
|
||||
- [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
|
||||
|
||||
Your PR of new Quant models is highly welcomed.
|
||||
|
||||
@@ -288,7 +312,11 @@ Such overheads greatly slow down the data loading process.
|
||||
Qlib data are stored in a compact format, which is efficient to be combined into arrays for scientific computation.
|
||||
|
||||
|
||||
|
||||
# Related Reports
|
||||
- [Guide To Qlib: Microsoft’s AI Investment Platform](https://analyticsindiamag.com/qlib/)
|
||||
- [【华泰金工林晓明团队】微软AI量化投资平台Qlib体验——华泰人工智能系列之四十](https://mp.weixin.qq.com/s/Brcd7im4NibJOJzZfMn6tQ)
|
||||
- [微软也搞AI量化平台?还是开源的!](https://mp.weixin.qq.com/s/47bP5YwxfTp2uTHjUBzJQQ)
|
||||
- [微矿Qlib:业内首个AI量化投资开源平台](https://mp.weixin.qq.com/s/vsJv7lsgjEi-ALYUz4CvtQ)
|
||||
|
||||
|
||||
# Contributing
|
||||
|
||||
12
docs/_static/demo.sh
vendored
Normal file
12
docs/_static/demo.sh
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
#!/bin/sh
|
||||
git clone https://github.com/microsoft/qlib.git
|
||||
cd qlib
|
||||
ls
|
||||
pip install pyqlib
|
||||
# or
|
||||
# pip install numpy
|
||||
# pip install --upgrade cython
|
||||
# python setup.py install
|
||||
cd examples
|
||||
ls
|
||||
qrun benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
@@ -50,57 +50,37 @@ Users can use ``Data Handler`` to build formulaic alphas `MACD` in qlib:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
>> from qlib.data.dataset.handler import QLibDataHandler
|
||||
>> from qlib.data.dataset.loader import QlibDataLoader
|
||||
>> MACD_EXP = '(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close'
|
||||
>> fields = [MACD_EXP] # MACD
|
||||
>> names = ['MACD']
|
||||
>> labels = ['$close'] # label
|
||||
>> labels = ['Ref($close, -2)/Ref($close, -1) - 1'] # label
|
||||
>> label_names = ['LABEL']
|
||||
>> data_handler = QLibDataHandler(start_date='2010-01-01', end_date='2017-12-31', fields=fields, names=names, labels=labels, label_names=label_names)
|
||||
>> TRAINER_CONFIG = {
|
||||
.. "train_start_date": "2007-01-01",
|
||||
.. "train_end_date": "2014-12-31",
|
||||
.. "validate_start_date": "2015-01-01",
|
||||
.. "validate_end_date": "2016-12-31",
|
||||
.. "test_start_date": "2017-01-01",
|
||||
.. "test_end_date": "2020-08-01",
|
||||
>> data_loader_config = {
|
||||
.. "feature": (fields, names),
|
||||
.. "label": (labels, label_names)
|
||||
.. }
|
||||
>> feature_train, label_train, feature_validate, label_validate, feature_test, label_test = data_handler.get_split_data(**TRAINER_CONFIG)
|
||||
>> print(feature_train, label_train)
|
||||
MACD
|
||||
instrument datetime
|
||||
SH600000 2010-01-04 -0.008625
|
||||
2010-01-05 -0.007234
|
||||
2010-01-06 -0.007693
|
||||
2010-01-07 -0.009633
|
||||
2010-01-08 -0.009891
|
||||
... ...
|
||||
SZ300251 2014-12-25 0.043072
|
||||
2014-12-26 0.041345
|
||||
2014-12-29 0.042733
|
||||
2014-12-30 0.042066
|
||||
2014-12-31 0.036299
|
||||
|
||||
[322025 rows x 1 columns]
|
||||
LABEL
|
||||
instrument datetime
|
||||
SH600000 2010-01-04 4.260015
|
||||
2010-01-05 4.292182
|
||||
2010-01-06 4.207747
|
||||
2010-01-07 4.113258
|
||||
2010-01-08 4.159496
|
||||
... ...
|
||||
SZ300251 2014-12-25 4.343212
|
||||
2014-12-26 4.470587
|
||||
2014-12-29 4.762474
|
||||
2014-12-30 4.369748
|
||||
2014-12-31 4.182222
|
||||
|
||||
[322025 rows x 1 columns]
|
||||
>> data_loader = QlibDataLoader(config=data_loader_config)
|
||||
>> df = data_loader.load(instruments='csi300', start_time='2010-01-01', end_time='2017-12-31')
|
||||
>> print(df)
|
||||
feature label
|
||||
MACD LABEL
|
||||
datetime instrument
|
||||
2010-01-04 SH600000 -0.011547 -0.019672
|
||||
SH600004 0.002745 -0.014721
|
||||
SH600006 0.010133 0.002911
|
||||
SH600008 -0.001113 0.009818
|
||||
SH600009 0.025878 -0.017758
|
||||
... ... ...
|
||||
2017-12-29 SZ300124 0.007306 -0.005074
|
||||
SZ300136 -0.013492 0.056352
|
||||
SZ300144 -0.000966 0.011853
|
||||
SZ300251 0.004383 0.021739
|
||||
SZ300315 -0.030557 0.012455
|
||||
|
||||
Reference
|
||||
===========
|
||||
|
||||
To learn more about ``Data Handler``, please refer to `Data Handler <../component/data.html>`_
|
||||
To learn more about ``Data Loader``, please refer to `Data Loader <../component/data.html#data-loader>`_
|
||||
|
||||
To learn more about ``Data API``, please refer to `Data API <../component/data.html>`_
|
||||
|
||||
@@ -126,17 +126,17 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
|
||||
The arguments of `--include_fields` should correspond with the column names of CSV files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume and factor.
|
||||
|
||||
- `open`
|
||||
The opening price
|
||||
The adjusted opening price
|
||||
- `close`
|
||||
The closing price
|
||||
The adjusted closing price
|
||||
- `high`
|
||||
The highest price
|
||||
The adjusted highest price
|
||||
- `low`
|
||||
The lowest price
|
||||
The adjusted lowest price
|
||||
- `volume`
|
||||
The trading volume
|
||||
The adjusted trading volume
|
||||
- `factor`
|
||||
The Restoration factor
|
||||
The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
|
||||
|
||||
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
|
||||
|
||||
@@ -195,6 +195,7 @@ Feature
|
||||
- `ExpressionOps`
|
||||
`ExpressionOps` will use operator for feature construction.
|
||||
To know more about ``Operator``, please refer to `Operator API <../reference/api.html#module-qlib.data.ops>`_.
|
||||
Also, ``Qlib`` supports users to define their own custom ``Operator``, an example has been given in ``tests/test_register_ops.py``.
|
||||
|
||||
To know more about ``Feature``, please refer to `Feature API <../reference/api.html#module-qlib.data.base>`_.
|
||||
|
||||
@@ -295,6 +296,7 @@ The ``Processor`` module in ``Qlib`` is designed to be learnable and it is respo
|
||||
- ``RobustZScoreNorm``: `processor` that applies robust z-score normalization.
|
||||
- ``CSZScoreNorm``: `processor` that applies cross sectional z-score normalization.
|
||||
- ``CSRankNorm``: `processor` that applies cross sectional rank normalization.
|
||||
- ``CSZFillna``: `processor` that fills N/A values in a cross sectional way by the mean of the column.
|
||||
|
||||
Users can also create their own `processor` by inheriting the base class of ``Processor``. Please refer to the implementation of all the processors for more information (`Processor Link <https://github.com/microsoft/qlib/blob/main/qlib/data/dataset/processor.py>`_).
|
||||
|
||||
|
||||
@@ -34,8 +34,9 @@ Here is a general view of the structure of the system:
|
||||
- Recorder 2
|
||||
- ...
|
||||
- ...
|
||||
This experiment management system defines a set of interface and provided a concrete implementation based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
|
||||
This experiment management system defines a set of interfaces and provides a concrete implementation ``MLflowExpManager``, which is based on the machine learning platform: ``MLFlow`` (`link <https://mlflow.org/>`_).
|
||||
|
||||
If users set the implementation of ``ExpManager`` to be ``MLflowExpManager``, they can use the command `mlflow ui` to visualize and check the experiment results. For more information, please refer to the related documents `here <https://www.mlflow.org/docs/latest/cli.html#mlflow-ui>`_.
|
||||
|
||||
Qlib Recorder
|
||||
===================
|
||||
@@ -91,7 +92,7 @@ Record Template
|
||||
|
||||
The ``RecordTemp`` class is a class that enables generating experiment results such as IC and backtest in a certain format. We have provided three different `Record Template` classes:
|
||||
|
||||
- ``SignalRecord``: This class generates the `preidction` results of the model.
|
||||
- ``SignalRecord``: This class generates the `prediction` results of the model.
|
||||
- ``SigAnaRecord``: This class generates the `IC`, `ICIR`, `Rank IC` and `Rank ICIR` of the model.
|
||||
- ``PortAnaRecord``: This class generates the results of `backtest`. The detailed information about `backtest` as well as the available `strategy`, users can refer to `Strategy <../component/strategy.html>`_ and `Backtest <../component/backtest.html>`_.
|
||||
|
||||
|
||||
@@ -103,6 +103,12 @@ After saving the config into `configuration.yaml`, users could start the workflo
|
||||
|
||||
qrun configuration.yaml
|
||||
|
||||
If users want to use ``qrun`` under debug mode, please use the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
|
||||
|
||||
.. note::
|
||||
|
||||
`qrun` will be placed in your $PATH directory when installing ``Qlib``.
|
||||
|
||||
@@ -226,3 +226,8 @@ epub_exclude_files = ["search.html"]
|
||||
|
||||
autodoc_member_order = "bysource"
|
||||
autodoc_default_flags = ["members"]
|
||||
autodoc_default_options = {
|
||||
"members": True,
|
||||
"member-order": "bysource",
|
||||
"special-members": "__init__",
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
Cython
|
||||
cmake
|
||||
numpy
|
||||
scipy
|
||||
scikit-learn
|
||||
scikit-learn
|
||||
|
||||
@@ -63,6 +63,7 @@ Besides `provider_uri` and `region`, `qlib.init` has other parameters. The follo
|
||||
If Qlib fails to connect redis via `redis_host` and `redis_port`, cache mechanism will not be used! Please refer to `Cache <../component/data.html#cache>`_ for details.
|
||||
- `exp_manager`
|
||||
Type: dict, optional parameter, the setting of `experiment manager` to be used in qlib. Users can specify an experiment manager class, as well as the tracking URI for all the experiments. However, please be aware that we only support input of a dictionary in the following style for `exp_manager`. For more information about `exp_manager`, users can refer to `Recorder: Experiment Management <../component/recorder.html>`_.
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
# For example, if you want to set your tracking_uri to a <specific folder>, you can initialize qlib below
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Requirements
|
||||
|
||||
Here is the minimal hardware requirements to run the example.
|
||||
Here are the minimal hardware requirements to run the `workflow_by_code` example.
|
||||
- Memory: 16G
|
||||
- Free Disk: 5G
|
||||
|
||||
|
||||
@@ -64,7 +64,6 @@ task:
|
||||
loss: mse
|
||||
n_jobs: 20
|
||||
GPU: 0
|
||||
rnn_type: GRU
|
||||
dataset:
|
||||
class: TSDatasetH
|
||||
module_path: qlib.data.dataset
|
||||
|
||||
@@ -64,7 +64,6 @@ task:
|
||||
loss: mse
|
||||
n_jobs: 20
|
||||
GPU: 0
|
||||
rnn_type: GRU
|
||||
dataset:
|
||||
class: TSDatasetH
|
||||
module_path: qlib.data.dataset
|
||||
|
||||
@@ -1,32 +1,35 @@
|
||||
# Benchmarks Performance
|
||||
|
||||
Here are the results of each benchmark model running on Qlib's `Alpha360` and `Alpha158` dataset with China's A shared-stock & CSI300 data respectively. The values of each metric are the mean and std calculated based on 10 runs.
|
||||
Here are the results of each benchmark model running on Qlib's `Alpha360` and `Alpha158` dataset with China's A shared-stock & CSI300 data respectively. The values of each metric are the mean and std calculated based on 20 runs.
|
||||
|
||||
The numbers shown below demonstrate the performance of the entire `workflow` of each model. We will update the `workflow` as well as models in the near future for better results.
|
||||
|
||||
## Alpha360 dataset
|
||||
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|
||||
|---|---|---|---|---|---|---|---|---|
|
||||
| Linear | Alpha360 | 0.0150±0.00 | 0.1049±0.00| 0.0284±0.00 | 0.1970±0.00 | -0.0655±0.00 | -0.6985±0.00| -0.2961±0.00 |
|
||||
| Linear | Alpha360 | 0.0150±0.00 | 0.1049±0.00| 0.0284±0.00 | 0.1970±0.00 | -0.0659±0.00 | -0.7072±0.00| -0.2955±0.00 |
|
||||
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha360 | 0.0397±0.00 | 0.2878±0.00| 0.0470±0.00 | 0.3703±0.00 | 0.0342±0.00 | 0.4092±0.00| -0.1057±0.00 |
|
||||
| XGBoost (Tianqi Chen, et al.) | Alpha360 | 0.0400±0.00 | 0.3031±0.00| 0.0461±0.00 | 0.3862±0.00 | 0.0528±0.00 | 0.6307±0.00| -0.1113±0.00 |
|
||||
| LightGBM (Guolin Ke, et al.) | Alpha360 | 0.0399±0.00 | 0.3075±0.00| 0.0492±0.00 | 0.4019±0.00 | 0.0323±0.00 | 0.4370±0.00| -0.0917±0.00 |
|
||||
| MLP | Alpha360 | 0.0253±0.01 | 0.1954±0.05| 0.0329±0.00 | 0.2687±0.04 | 0.0161±0.01 | 0.1989±0.19| -0.1275±0.03 |
|
||||
| GRU (Kyunghyun Cho, et al.) | Alpha360 | 0.0503±0.01 | 0.3946±0.06| 0.0588±0.00 | 0.4737±0.05 | 0.0799±0.02 | 1.0940±0.26| -0.0810±0.03 |
|
||||
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0466±0.01 | 0.3644±0.06| 0.0555±0.00 | 0.4451±0.04 | 0.0783±0.05 | 1.0539±0.65| -0.0844±0.03 |
|
||||
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0472±0.00 | 0.3558±0.04| 0.0577±0.00 | 0.4522±0.04 | 0.0522±0.02 | 0.7090±0.32| -0.1059±0.03 |
|
||||
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0480±0.00 | 0.3555±0.02| 0.0598±0.00 | 0.4616±0.01 | 0.0857±0.03 | 1.1317±0.42| -0.0917±0.01 |
|
||||
| MLP | Alpha360 | 0.0285±0.00 | 0.1981±0.02| 0.0402±0.00 | 0.2993±0.02 | 0.0073±0.02 | 0.0880±0.22| -0.1446±0.03 |
|
||||
| GRU (Kyunghyun Cho, et al.) | Alpha360 | 0.0490±0.01 | 0.3787±0.05| 0.0581±0.00 | 0.4664±0.04 | 0.0726±0.02 | 0.9817±0.34| -0.0902±0.03 |
|
||||
| LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
|
||||
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
|
||||
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
|
||||
|
||||
## Alpha158 dataset
|
||||
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|
||||
|---|---|---|---|---|---|---|---|---|
|
||||
| Linear | Alpha158 | 0.0393±0.00 | 0.2980±0.00| 0.0475±0.00 | 0.3546±0.00 | 0.0795±0.00 | 1.0712±0.00| -0.1449±0.00 |
|
||||
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0503±0.00 | 0.3586±0.00| 0.0483±0.00 | 0.3667±0.00 | 0.1080±0.00 | 1.1567±0.00| -0.0787±0.00 |
|
||||
| CatBoost (Liudmila Prokhorenkova, et al.) | Alpha158 | 0.0503±0.00 | 0.3586±0.00| 0.0483±0.00 | 0.3667±0.00 | 0.1080±0.00 | 1.1561±0.00| -0.0787±0.00 |
|
||||
| XGBoost (Tianqi Chen, et al.) | Alpha158 | 0.0481±0.00 | 0.3659±0.00| 0.0495±0.00 | 0.4033±0.00 | 0.1111±0.00 | 1.2915±0.00| -0.0893±0.00 |
|
||||
| LightGBM (Guolin Ke, et al.) | Alpha158 | 0.0475±0.00 | 0.3979±0.00| 0.0485±0.00 | 0.4123±0.00 | 0.1143±0.00 | 1.2744±0.00| -0.0800±0.00 |
|
||||
| MLP | Alpha158 | 0.0363±0.00 | 0.2770±0.02| 0.0421±0.00 | 0.3167±0.01 | 0.0856±0.01 | 1.0397±0.12| -0.1134±0.01 |
|
||||
| TFT (Bryan Lim, et al.) | Alpha158 (with selected 20 features) | 0.0344±0.00 | 0.2071±0.02| 0.0103±0.00 | 0.0632±0.01 | 0.0638±0.00 | 0.5845±0.08| -0.1754±0.02 |
|
||||
| GRU (Kyunghyun Cho, et al.) | Alpha158 (with selected 20 features) | 0.0302±0.00 | 0.2353±0.03| 0.0411±0.00 | 0.3309±0.03 | 0.0302±0.02 | 0.4353±0.28| -0.1140±0.02 |
|
||||
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0359±0.01 | 0.2774±0.06| 0.0448±0.01 | 0.3597±0.05 | 0.0402±0.03 | 0.5743±0.41| -0.1152±0.03 |
|
||||
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0329±0.01 | 0.2465±0.07| 0.0450±0.01 | 0.3485±0.06 | 0.0288±0.04 | 0.4163±0.50| -0.1269±0.04 |
|
||||
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2526±0.01| 0.0454±0.00 | 0.3531±0.01 | 0.0561±0.01 | 0.7992±0.19| -0.0751±0.02 |
|
||||
| MLP | Alpha158 | 0.0358±0.00 | 0.2738±0.03| 0.0425±0.00 | 0.3221±0.01 | 0.0836±0.02 | 1.0323±0.25| -0.1127±0.02 |
|
||||
| TabNet with pretrain (Sercan O. Arik, et al.) | Alpha158 | 0.0344±0.00|0.205±0.11|0.0398±0.00 |0.3479±0.01|0.0827±0.02|1.1141±0.32 |-0.0925±0.02 |
|
||||
| TFT (Bryan Lim, et al.) | Alpha158 (with selected 20 features) | 0.0343±0.00 | 0.2071±0.02| 0.0107±0.00 | 0.0660±0.02 | 0.0623±0.02 | 0.5818±0.20| -0.1762±0.01 |
|
||||
| GRU (Kyunghyun Cho, et al.) | Alpha158 (with selected 20 features) | 0.0311±0.00 | 0.2418±0.04| 0.0425±0.00 | 0.3434±0.02 | 0.0330±0.02 | 0.4805±0.30| -0.1021±0.02 |
|
||||
| LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 |
|
||||
| ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 |
|
||||
| GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 |
|
||||
|
||||
- The selected 20 features are based on the feature importance of a lightgbm-based model.
|
||||
|
||||
@@ -25,7 +25,7 @@ import os
|
||||
import data_formatters.qlib_Alpha158
|
||||
|
||||
|
||||
class ExperimentConfig(object):
|
||||
class ExperimentConfig:
|
||||
"""Defines experiment configs and paths to outputs.
|
||||
|
||||
Attributes:
|
||||
|
||||
@@ -320,7 +320,7 @@ class InterpretableMultiHeadAttention:
|
||||
return outputs, attn
|
||||
|
||||
|
||||
class TFTDataCache(object):
|
||||
class TFTDataCache:
|
||||
"""Caches data for the TFT."""
|
||||
|
||||
_data_cache = {}
|
||||
@@ -348,7 +348,7 @@ class TFTDataCache(object):
|
||||
|
||||
|
||||
# TFT model definitions.
|
||||
class TemporalFusionTransformer(object):
|
||||
class TemporalFusionTransformer:
|
||||
"""Defines Temporal Fusion Transformer.
|
||||
|
||||
Attributes:
|
||||
@@ -972,7 +972,7 @@ class TemporalFusionTransformer(object):
|
||||
valid_quantiles = self.quantiles
|
||||
output_size = self.output_size
|
||||
|
||||
class QuantileLossCalculator(object):
|
||||
class QuantileLossCalculator:
|
||||
"""Computes the combined quantile loss for prespecified quantiles.
|
||||
|
||||
Attributes:
|
||||
|
||||
BIN
examples/benchmarks/TabNet/pretrain/best.model
Normal file
BIN
examples/benchmarks/TabNet/pretrain/best.model
Normal file
Binary file not shown.
4
examples/benchmarks/TabNet/requirements.txt
Normal file
4
examples/benchmarks/TabNet/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
pandas==1.1.2
|
||||
numpy==1.17.4
|
||||
scikit_learn==0.23.2
|
||||
torch==1.7.0
|
||||
@@ -0,0 +1,74 @@
|
||||
qlib_init:
|
||||
provider_uri: "~/.qlib/qlib_data/cn_data"
|
||||
region: cn
|
||||
market: &market csi300
|
||||
benchmark: &benchmark SH000300
|
||||
data_handler_config: &data_handler_config
|
||||
start_time: 2008-01-01
|
||||
end_time: 2020-08-01
|
||||
fit_start_time: 2008-01-01
|
||||
fit_end_time: 2014-12-31
|
||||
instruments: *market
|
||||
infer_processors:
|
||||
- class: RobustZScoreNorm
|
||||
kwargs:
|
||||
fields_group: feature
|
||||
clip_outlier: true
|
||||
- class: Fillna
|
||||
kwargs:
|
||||
fields_group: feature
|
||||
learn_processors:
|
||||
- class: DropnaLabel
|
||||
- class: CSRankNorm
|
||||
kwargs:
|
||||
fields_group: label
|
||||
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
|
||||
port_analysis_config: &port_analysis_config
|
||||
strategy:
|
||||
class: TopkDropoutStrategy
|
||||
module_path: qlib.contrib.strategy.strategy
|
||||
kwargs:
|
||||
topk: 50
|
||||
n_drop: 5
|
||||
backtest:
|
||||
verbose: False
|
||||
limit_threshold: 0.095
|
||||
account: 100000000
|
||||
benchmark: *benchmark
|
||||
deal_price: close
|
||||
open_cost: 0.0005
|
||||
close_cost: 0.0015
|
||||
min_cost: 5
|
||||
task:
|
||||
model:
|
||||
class: TabnetModel
|
||||
module_path: qlib.contrib.model.pytorch_tabnet
|
||||
kwargs:
|
||||
pretrain: True
|
||||
dataset:
|
||||
class: DatasetH
|
||||
module_path: qlib.data.dataset
|
||||
kwargs:
|
||||
handler:
|
||||
class: Alpha158
|
||||
module_path: qlib.contrib.data.handler
|
||||
kwargs: *data_handler_config
|
||||
segments:
|
||||
pretrain: [2008-01-01, 2014-12-31]
|
||||
pretrain_validation: [2015-01-01, 2020-08-01]
|
||||
train: [2008-01-01, 2014-12-31]
|
||||
valid: [2015-01-01, 2016-12-31]
|
||||
test: [2017-01-01, 2020-08-01]
|
||||
record:
|
||||
- class: SignalRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs: {}
|
||||
- class: SigAnaRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
ana_long_short: False
|
||||
ann_scaler: 252
|
||||
- class: PortAnaRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
config: *port_analysis_config
|
||||
0
examples/highfreq/__init__.py
Normal file
0
examples/highfreq/__init__.py
Normal file
172
examples/highfreq/highfreq_handler.py
Normal file
172
examples/highfreq/highfreq_handler.py
Normal file
@@ -0,0 +1,172 @@
|
||||
from qlib.data.dataset.handler import DataHandler, DataHandlerLP
|
||||
from qlib.data.dataset.processor import Processor
|
||||
from qlib.utils import get_cls_kwargs
|
||||
from qlib.log import TimeInspector
|
||||
|
||||
|
||||
class HighFreqHandler(DataHandlerLP):
    """Data handler that builds normalized 1-minute price/volume features.

    The handler configures a ``QlibDataLoader`` with the expressions returned
    by :meth:`get_feature_config` and forwards the processor lists to
    ``DataHandlerLP``.
    """

    def __init__(
        self,
        instruments="csi300",
        start_time=None,
        end_time=None,
        infer_processors=None,
        learn_processors=None,
        fit_start_time=None,
        fit_end_time=None,
        drop_raw=True,
        freq="1min",
    ):
        """Create the handler.

        Parameters
        ----------
        instruments : str
            Instrument pool passed through to ``DataHandlerLP``.
        start_time, end_time
            Time range passed through to ``DataHandlerLP``.
        infer_processors, learn_processors : list, optional
            Processor configs (dicts with an optional ``"kwargs"`` entry) or
            processor objects. ``None`` is treated as an empty list.
        fit_start_time, fit_end_time
            Injected into the ``kwargs`` of every dict-style processor config.
        drop_raw : bool
            Passed through to ``DataHandlerLP``.
        freq : str
            Frequency handed to the data loader. Defaults to ``"1min"``, the
            value that used to be hard-coded.
            NOTE(review): the feature expressions use fixed offsets of 240 and
            7200 bars, which presumably assume 240 one-minute bars per trading
            day -- confirm before using another frequency.
        """

        def check_transform_proc(proc_l):
            """Return copies of the processor configs with the fit range injected."""
            new_l = []
            for p in proc_l or []:
                if isinstance(p, dict):
                    # Work on copies so the caller's config dicts (often shared
                    # class-level constants) are never mutated.
                    p = dict(p)
                    proc_kwargs = dict(p.get("kwargs") or {})
                    proc_kwargs.update(
                        {
                            "fit_start_time": fit_start_time,
                            "fit_end_time": fit_end_time,
                        }
                    )
                    p["kwargs"] = proc_kwargs
                new_l.append(p)
            return new_l

        infer_processors = check_transform_proc(infer_processors)
        learn_processors = check_transform_proc(learn_processors)

        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": self.get_feature_config(),
                "swap_level": False,
                "freq": freq,
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
            infer_processors=infer_processors,
            learn_processors=learn_processors,
            drop_raw=drop_raw,
        )

    def get_feature_config(self):
        """Build the expression strings and column names for the data loader.

        Returns
        -------
        tuple of (list of str, list of str)
            ``fields`` holds 13 expressions and ``names`` the matching column
            names: five normalized prices, the same five shifted by 240 bars,
            normalized volume, shifted normalized volume, and ``"date"``.
        """
        template_if = "If(IsNull({1}), {0}, {1})"
        # Keep only rows where $paused is null or equal to 0.
        template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
        template_fillnan = "BFillNan(FFillNan({0}))"
        # Because there is no vwap field in the yahoo data, a method similar to
        # Simpson integration is used to approximate vwap.
        simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"

        # Forward/backward filled close of non-paused rows; used both as the
        # fallback for missing prices and as the normalization base.
        filled_close = template_fillnan.format(template_paused.format("$close"))

        def get_normalized_price_feature(price_field, shift=0):
            """Get normalized price feature ops"""
            if shift == 0:
                template_norm = "{0}/Ref(DayLast({1}), 240)"
            else:
                template_norm = "Ref({0}, " + str(shift) + ")/Ref(DayLast({1}), 240)"

            return template_norm.format(
                template_if.format(filled_close, template_paused.format(price_field)),
                filled_close,
            )

        price_fields = ["$open", "$high", "$low", "$close", simpson_vwap]

        fields = [get_normalized_price_feature(field, 0) for field in price_fields]
        names = ["$open", "$high", "$low", "$close", "$vwap"]

        fields += [get_normalized_price_feature(field, 240) for field in price_fields]
        names += ["$open_1", "$high_1", "$low_1", "$close_1", "$vwap_1"]

        # Volume is replaced by 0 when it is null, or when the approximated
        # vwap is above 1.001 * high or below 0.999 * low.
        cleaned_volume = (
            "If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
                template_paused.format("$volume"),
                template_paused.format(simpson_vwap),
                template_paused.format("$low"),
                template_paused.format("$high"),
            )
        )

        fields += ["{0}/Ref(DayLast(Mean({0}, 7200)), 240)".format(cleaned_volume)]
        names += ["$volume"]
        fields += ["Ref({0}, 240)/Ref(DayLast(Mean({0}, 7200)), 240)".format(cleaned_volume)]
        names += ["$volume_1"]

        fields += [template_paused.format("Date($close)")]
        names += ["date"]
        return fields, names
|
||||
|
||||
|
||||
class HighFreqBacktestHandler(DataHandler):
    """Data handler that loads un-normalized close, vwap and volume for backtesting."""

    def __init__(
        self,
        instruments="csi300",
        start_time=None,
        end_time=None,
        freq="1min",
    ):
        """Create the handler.

        Parameters
        ----------
        instruments : str
            Instrument pool passed through to ``DataHandler``.
        start_time, end_time
            Time range passed through to ``DataHandler``.
        freq : str
            Frequency handed to the data loader. Defaults to ``"1min"``, the
            value that used to be hard-coded, so existing callers are
            unaffected while configs that pass ``freq`` are now accepted.
        """
        data_loader = {
            "class": "QlibDataLoader",
            "kwargs": {
                "config": self.get_feature_config(),
                "swap_level": False,
                "freq": freq,
            },
        }
        super().__init__(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            data_loader=data_loader,
        )

    def get_feature_config(self):
        """Build the expression strings and column names for the data loader.

        Returns
        -------
        tuple of (list of str, list of str)
            Three expressions and their names: ``"$close0"``, ``"$vwap0"``
            and ``"$volume0"``.
        """
        template_if = "If(IsNull({1}), {0}, {1})"
        # Keep only rows where $paused is null or equal to 0.
        template_paused = "Select(Or(IsNull($paused), Eq($paused, 0.0)), {0})"
        template_fillnan = "BFillNan(FFillNan({0}))"
        # Because there is no vwap field in the yahoo data, a method similar to
        # Simpson integration is used to approximate vwap.
        simpson_vwap = "($open + 2*$high + 2*$low + $close)/6"

        # Forward/backward filled close of non-paused rows.
        filled_close = template_fillnan.format(template_paused.format("$close"))

        fields = [filled_close]
        names = ["$close0"]

        # Approximated vwap, falling back to the filled close where it is null.
        fields += [template_if.format(filled_close, template_paused.format(simpson_vwap))]
        names += ["$vwap0"]

        # Volume is replaced by 0 when it is null, or when the approximated
        # vwap is above 1.001 * high or below 0.999 * low.
        fields += [
            "If(IsNull({0}), 0, If(Or(Gt({1}, Mul(1.001, {3})), Lt({1}, Mul(0.999, {2}))), 0, {0}))".format(
                template_paused.format("$volume"),
                template_paused.format(simpson_vwap),
                template_paused.format("$low"),
                template_paused.format("$high"),
            )
        ]
        names += ["$volume0"]

        return fields, names
|
||||
56
examples/highfreq/highfreq_ops.py
Normal file
56
examples/highfreq/highfreq_ops.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import importlib
|
||||
from qlib.data.ops import ElemOperator, PairOperator
|
||||
from qlib.config import C
|
||||
from qlib.data.cache import H
|
||||
from qlib.data.data import Cal
|
||||
|
||||
|
||||
def get_calendar_day(freq="day", future=False):
    """Return the calendar for ``freq`` reduced to calendar dates.

    The result is a NumPy array holding ``x.date()`` for every entry of
    ``Cal.load_calendar(freq, future)``. It is stored in ``H["c"]`` under a
    key derived from ``freq`` and ``future`` and reused on later calls.
    """
    flag = f"{freq}_future_{future}_day"
    if flag in H["c"]:
        return H["c"][flag]

    day_calendar = np.array([timestamp.date() for timestamp in Cal.load_calendar(freq, future)])
    H["c"][flag] = day_calendar
    return day_calendar
|
||||
|
||||
|
||||
class DayLast(ElemOperator):
    """Replace every value with the last value of the same calendar day."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and broadcast each day's last value.

        The values of ``series.index`` are used as positions into the
        per-bar date array returned by ``get_calendar_day``.
        """
        day_calendar = get_calendar_day(freq=freq)
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        day_of_row = day_calendar[loaded.index]
        return loaded.groupby(day_of_row).transform("last")
|
||||
|
||||
|
||||
class FFillNan(ElemOperator):
    """Forward-fill missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and forward-fill its NaN entries."""
        series = self.feature.load(instrument, start_index, end_index, freq)
        # Series.ffill() is equivalent to fillna(method="ffill"), whose
        # `method` argument is deprecated in recent pandas releases.
        return series.ffill()
|
||||
|
||||
|
||||
class BFillNan(ElemOperator):
    """Backward-fill missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature and backward-fill its NaN entries."""
        series = self.feature.load(instrument, start_index, end_index, freq)
        # Series.bfill() is equivalent to fillna(method="bfill"), whose
        # `method` argument is deprecated in recent pandas releases.
        return series.bfill()
|
||||
|
||||
|
||||
class Date(ElemOperator):
    """Map every row of the wrapped feature to its calendar date."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Return a series of dates sharing the index of the loaded feature.

        Only the index of the loaded feature is used; its values are
        discarded.
        """
        day_calendar = get_calendar_day(freq=freq)
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        row_positions = loaded.index
        return pd.Series(day_calendar[row_positions], index=row_positions)
|
||||
|
||||
|
||||
class Select(PairOperator):
    """Keep the rows of the right feature selected by the left feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load both operands and index the right one with the left one via ``.loc``."""
        condition = self.feature_left.load(instrument, start_index, end_index, freq)
        values = self.feature_right.load(instrument, start_index, end_index, freq)
        selected = values.loc[condition]
        return selected
|
||||
|
||||
|
||||
class IsNull(ElemOperator):
    """Flag the missing values of the wrapped feature."""

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Return the result of ``isnull()`` on the loaded feature."""
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        null_mask = loaded.isnull()
        return null_mask
|
||||
72
examples/highfreq/highfreq_processor.py
Normal file
72
examples/highfreq/highfreq_processor.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from qlib.data.dataset.processor import Processor
|
||||
from qlib.data.dataset.utils import fetch_df_by_index
|
||||
|
||||
|
||||
class HighFreqNorm(Processor):
    """Robustly normalize high-frequency price/volume columns and reshape them.

    ``fit`` estimates, per column group, a median and a scaled median absolute
    deviation on the fitting period. ``__call__`` applies them, compresses
    outliers into the range [-3.5, 3.5] and returns one wide row per
    (instrument, date).

    Columns 0-9 are treated as prices and columns 10-11 as volumes.
    NOTE(review): this presumably relies on the column order produced by
    HighFreqHandler.get_feature_config -- confirm against the handler in use.
    """

    def __init__(self, fit_start_time, fit_end_time):
        """Store the time range used by :meth:`fit`."""
        self.fit_start_time = fit_start_time
        self.fit_end_time = fit_end_time

    def fit(self, df_features):
        """Estimate normalization statistics on the fitting period.

        Sets ``feature_med``, ``feature_std``, ``feature_vmax`` and
        ``feature_vmin``, each a dict keyed by ``"price"`` / ``"volume"``.
        """
        fetch_df = fetch_df_by_index(df_features, slice(self.fit_start_time, self.fit_end_time), level="datetime")
        # Drops only the local reference; the caller's frame is unaffected.
        del df_features
        df_values = fetch_df.values
        names = {
            "price": slice(0, 10),
            "volume": slice(10, 12),
        }
        self.feature_med = {}
        self.feature_std = {}
        self.feature_vmax = {}
        self.feature_vmin = {}
        for name, name_val in names.items():
            part_values = df_values[:, name_val].astype(np.float32)
            if name == "volume":
                # Volumes are normalized in log1p space.
                part_values = np.log1p(part_values)
            self.feature_med[name] = np.nanmedian(part_values)
            part_values = part_values - self.feature_med[name]
            # Median absolute deviation scaled by 1.4826; the small epsilon
            # keeps the later division away from zero.
            self.feature_std[name] = np.nanmedian(np.absolute(part_values)) * 1.4826 + 1e-12
            part_values = part_values / self.feature_std[name]
            # Extremes of the normalized fitting data, used in __call__ to
            # rescale values beyond +/-3.
            self.feature_vmax[name] = np.nanmax(part_values)
            self.feature_vmin[name] = np.nanmin(part_values)

    def __call__(self, df_features):
        """Normalize ``df_features`` and return it in wide form.

        The "date" column is moved into the index of ``df_features`` in place,
        so the frame passed in is modified. The returned frame has
        ``12 * 240`` columns named ``FEATURE_0`` .. ``FEATURE_2879`` and is
        indexed by (instrument, datetime), where "datetime" is the former
        "date" level.
        """
        df_features.set_index("date", append=True, drop=True, inplace=True)
        df_values = df_features.values
        names = {
            "price": slice(0, 10),
            "volume": slice(10, 12),
        }

        for name, name_val in names.items():
            if name == "volume":
                df_values[:, name_val] = np.log1p(df_values[:, name_val])
            df_values[:, name_val] -= self.feature_med[name]
            df_values[:, name_val] /= self.feature_std[name]
            # All four masks are taken from the normalized values *before* any
            # outlier is rewritten; the assignments below rely on that order.
            slice0 = df_values[:, name_val] > 3.0
            slice1 = df_values[:, name_val] > 3.5
            slice2 = df_values[:, name_val] < -3.0
            slice3 = df_values[:, name_val] < -3.5

            # Values above 3 are rescaled relative to the fitted maximum ...
            df_values[:, name_val][slice0] = (
                3.0 + (df_values[:, name_val][slice0] - 3.0) / (self.feature_vmax[name] - 3) * 0.5
            )
            # ... and those that were originally above 3.5 are set to exactly 3.5.
            df_values[:, name_val][slice1] = 3.5
            # Mirror image of the above for the negative tail.
            df_values[:, name_val][slice2] = (
                -3.0 - (df_values[:, name_val][slice2] + 3.0) / (self.feature_vmin[name] + 3) * 0.5
            )
            df_values[:, name_val][slice3] = -3.5
        # One index entry per remaining (instrument, date) pair; the date level
        # is renamed to "datetime".
        idx = df_features.index.droplevel("datetime").drop_duplicates()
        idx.set_names(["instrument", "datetime"], inplace=True)

        # Reshape is specifically for adapting to RL high-freq executor
        # NOTE(review): 240 consecutive rows are folded into each output row,
        # which presumably assumes 240 bars per instrument per day -- confirm.
        feat = df_values[:, [0, 1, 2, 3, 4, 10]].reshape(-1, 6 * 240)
        feat_1 = df_values[:, [5, 6, 7, 8, 9, 11]].reshape(-1, 6 * 240)
        df_new_features = pd.DataFrame(
            data=np.concatenate((feat, feat_1), axis=1),
            index=idx,
            columns=["FEATURE_%d" % i for i in range(12 * 240)],
        ).sort_index()
        return df_new_features
|
||||
166
examples/highfreq/workflow.py
Normal file
166
examples/highfreq/workflow.py
Normal file
@@ -0,0 +1,166 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
import fire
|
||||
from pathlib import Path
|
||||
|
||||
import qlib
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from qlib.config import HIGH_FREQ_CONFIG
|
||||
from qlib.contrib.model.gbdt import LGBModel
|
||||
from qlib.contrib.data.handler import Alpha158
|
||||
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
|
||||
from qlib.contrib.evaluate import (
|
||||
backtest as normal_backtest,
|
||||
risk_analysis,
|
||||
)
|
||||
|
||||
from qlib.utils import init_instance_by_config, exists_qlib_data
|
||||
from qlib.data.dataset.handler import DataHandlerLP
|
||||
from qlib.data.ops import Operators
|
||||
from qlib.data.data import Cal
|
||||
from qlib.tests.data import GetData
|
||||
|
||||
from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull
|
||||
|
||||
|
||||
class HighfreqWorkflow:
    """Example workflow that builds high-frequency datasets with custom operators.

    Public methods are intended to be invoked from the command line (see the
    ``fire.Fire`` call at the bottom of the file).
    """

    # Extra qlib.init settings: register the custom operators and disable the
    # expression cache.
    SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}

    MARKET = "all"
    BENCHMARK = "SH000300"

    start_time = "2020-09-14 00:00:00"
    end_time = "2021-01-18 16:00:00"
    train_end_time = "2020-11-30 16:00:00"
    test_start_time = "2020-12-01 00:00:00"

    # No "freq" entry is passed to the handlers: the constructors defined in
    # highfreq_handler.py do not require one and load 1min data themselves.
    DATA_HANDLER_CONFIG0 = {
        "start_time": start_time,
        "end_time": end_time,
        "fit_start_time": start_time,
        "fit_end_time": train_end_time,
        "instruments": MARKET,
        "infer_processors": [{"class": "HighFreqNorm", "module_path": "highfreq_processor", "kwargs": {}}],
    }
    DATA_HANDLER_CONFIG1 = {
        "start_time": start_time,
        "end_time": end_time,
        "instruments": MARKET,
    }

    task = {
        "dataset": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqHandler",
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG0,
                },
                "segments": {
                    "train": (start_time, train_end_time),
                    "test": (
                        test_start_time,
                        end_time,
                    ),
                },
            },
        },
        "dataset_backtest": {
            "class": "DatasetH",
            "module_path": "qlib.data.dataset",
            "kwargs": {
                "handler": {
                    "class": "HighFreqBacktestHandler",
                    "module_path": "highfreq_handler",
                    "kwargs": DATA_HANDLER_CONFIG1,
                },
                "segments": {
                    "train": (start_time, train_end_time),
                    "test": (
                        test_start_time,
                        end_time,
                    ),
                },
            },
        },
    }

    def _init_qlib(self):
        """Initialize qlib, downloading the 1min data first if it is missing."""
        # REG_CN is not imported at module level; import it here so the
        # download branch below does not fail with a NameError.
        from qlib.config import REG_CN

        # use yahoo_cn_1min data
        QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG, **self.SPEC_CONF}
        provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
        if not exists_qlib_data(provider_uri):
            print(f"Qlib data is not found in {provider_uri}")
            GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)
        qlib.init(**QLIB_INIT_CONFIG)

    def _prepare_calender_cache(self):
        """Preload the 1min calendars so they are already cached."""

        # This code uses the copy-on-write feature of Linux to avoid calculating the calendar multiple times in the subprocess.
        # It may speed things up, but may not be useful on Windows and macOS.
        Cal.calendar(freq="1min")
        get_calendar_day(freq="1min")

    def get_data(self):
        """Use the datasets to load and print high-frequency data."""
        self._init_qlib()
        self._prepare_calender_cache()

        dataset = init_instance_by_config(self.task["dataset"])
        xtrain, xtest = dataset.prepare(["train", "test"])
        print(xtrain, xtest)

        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
        print(backtest_train, backtest_test)

        del xtrain, xtest
        del backtest_train, backtest_test

    def dump_and_load_dataset(self):
        """Dump both datasets to disk, reload them and print their data.

        Writes ``dataset.pkl`` and ``dataset_backtest.pkl`` into the current
        working directory.
        """
        self._init_qlib()
        self._prepare_calender_cache()
        dataset = init_instance_by_config(self.task["dataset"])
        dataset_backtest = init_instance_by_config(self.task["dataset_backtest"])

        ##=============dump dataset=============
        dataset.to_pickle(path="dataset.pkl")
        dataset_backtest.to_pickle(path="dataset_backtest.pkl")

        del dataset, dataset_backtest
        ##=============reload dataset=============
        # The pickles are the files written just above, not external input.
        with open("dataset.pkl", "rb") as file_dataset:
            dataset = pickle.load(file_dataset)

        with open("dataset_backtest.pkl", "rb") as file_dataset_backtest:
            dataset_backtest = pickle.load(file_dataset_backtest)

        self._prepare_calender_cache()
        ##=============reload_dataset=============
        dataset.init(init_type=DataHandlerLP.IT_LS)
        dataset_backtest.init()

        ##=============get data=============
        xtrain, xtest = dataset.prepare(["train", "test"])
        backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])

        print(xtrain, xtest)
        print(backtest_train, backtest_test)
        del xtrain, xtest
        del backtest_train, backtest_test
|
||||
|
||||
|
||||
# Command-line entry point: python-fire turns HighfreqWorkflow into a CLI,
# e.g. `python workflow.py get_data`.
if __name__ == "__main__":
    fire.Fire(HighfreqWorkflow)
|
||||
@@ -69,9 +69,9 @@ def handler(signum, frame):
|
||||
os.system("kill -9 %d" % os.getpid())
|
||||
|
||||
|
||||
signal.signal(signal.SIGTSTP, handler)
|
||||
signal.signal(signal.SIGINT, handler)
|
||||
|
||||
|
||||
# function to calculate the mean and std of a list in the results dictionary
|
||||
def cal_mean_std(results) -> dict:
|
||||
mean_std = dict()
|
||||
|
||||
@@ -17,7 +17,7 @@ from qlib.contrib.evaluate import (
|
||||
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
|
||||
from qlib.workflow import R
|
||||
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
|
||||
|
||||
from qlib.tests.data import GetData
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -25,9 +25,6 @@ if __name__ == "__main__":
|
||||
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
|
||||
if not exists_qlib_data(provider_uri):
|
||||
print(f"Qlib data is not found in {provider_uri}")
|
||||
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
|
||||
from get_data import GetData
|
||||
|
||||
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
|
||||
|
||||
qlib.init(provider_uri=provider_uri, region=REG_CN)
|
||||
@@ -98,6 +95,7 @@ if __name__ == "__main__":
|
||||
"open_cost": 0.0005,
|
||||
"close_cost": 0.0015,
|
||||
"min_cost": 5,
|
||||
"return_order": True,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -105,6 +103,11 @@ if __name__ == "__main__":
|
||||
model = init_instance_by_config(task["model"])
|
||||
dataset = init_instance_by_config(task["dataset"])
|
||||
|
||||
# NOTE: This line is optional
|
||||
# It demonstrates that the dataset can be used standalone.
|
||||
example_df = dataset.prepare("train")
|
||||
print(example_df.head())
|
||||
|
||||
# start exp
|
||||
with R.start(experiment_name="workflow"):
|
||||
R.log_params(**flatten_dict(task))
|
||||
|
||||
@@ -2,92 +2,49 @@
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
__version__ = "0.6.1"
|
||||
__version__ = "0.6.2"
|
||||
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import copy
|
||||
import yaml
|
||||
import logging
|
||||
import platform
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .utils import can_use_cache, init_instance_by_config, get_module_by_module_path
|
||||
from .workflow.utils import experiment_exit_handler
|
||||
|
||||
# init qlib
|
||||
def init(default_conf="client", **kwargs):
|
||||
from .config import C, REG_CN, REG_US, QlibConfig
|
||||
from .data.data import register_all_wrappers
|
||||
from .log import get_module_logger, set_log_with_config
|
||||
from .config import C
|
||||
from .log import get_module_logger
|
||||
from .data.cache import H
|
||||
from .workflow import R, QlibRecorder
|
||||
|
||||
C.reset()
|
||||
H.clear()
|
||||
|
||||
_logging_config = C.logging_config
|
||||
if "logging_config" in kwargs:
|
||||
_logging_config = kwargs["logging_config"]
|
||||
|
||||
# set global config
|
||||
if _logging_config:
|
||||
set_log_with_config(_logging_config)
|
||||
|
||||
# FIXME: this logger ignored the level in config
|
||||
LOG = get_module_logger("Initialization", level=logging.INFO)
|
||||
LOG.info(f"default_conf: {default_conf}.")
|
||||
logger = get_module_logger("Initialization", level=logging.INFO)
|
||||
|
||||
C.set_mode(default_conf)
|
||||
C.set_region(kwargs.get("region", C["region"] if "region" in C else REG_CN))
|
||||
|
||||
for k, v in kwargs.items():
|
||||
C[k] = v
|
||||
if k not in C:
|
||||
LOG.warning("Unrecognized config %s" % k)
|
||||
|
||||
C.resolve_path()
|
||||
|
||||
if not (C["expression_cache"] is None and C["dataset_cache"] is None):
|
||||
# check redis
|
||||
if not can_use_cache():
|
||||
LOG.warning(
|
||||
f"redis connection failed(host={C['redis_host']} port={C['redis_port']}), cache will not be used!"
|
||||
)
|
||||
C["expression_cache"] = None
|
||||
C["dataset_cache"] = None
|
||||
C.set(default_conf, **kwargs)
|
||||
|
||||
# check path if server/local
|
||||
if C.get_uri_type() == QlibConfig.LOCAL_URI:
|
||||
if C.get_uri_type() == C.LOCAL_URI:
|
||||
if not os.path.exists(C["provider_uri"]):
|
||||
if C["auto_mount"]:
|
||||
LOG.error(
|
||||
logger.error(
|
||||
f"Invalid provider uri: {C['provider_uri']}, please check if a valid provider uri has been set. This path does not exist."
|
||||
)
|
||||
else:
|
||||
LOG.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
|
||||
elif C.get_uri_type() == QlibConfig.NFS_URI:
|
||||
logger.warning(f"auto_path is False, please make sure {C['mount_path']} is mounted")
|
||||
elif C.get_uri_type() == C.NFS_URI:
|
||||
_mount_nfs_uri(C)
|
||||
else:
|
||||
raise NotImplementedError(f"This type of URI is not supported")
|
||||
|
||||
LOG.info("qlib successfully initialized based on %s settings." % default_conf)
|
||||
register_all_wrappers()
|
||||
|
||||
LOG.info(f"data_path={C.get_data_path()}")
|
||||
C.register()
|
||||
|
||||
if "flask_server" in C:
|
||||
LOG.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
|
||||
|
||||
# set up QlibRecorder
|
||||
exp_manager = init_instance_by_config(C["exp_manager"])
|
||||
qr = QlibRecorder(exp_manager)
|
||||
R.register(qr)
|
||||
# clean up experiment when python program ends
|
||||
experiment_exit_handler()
|
||||
logger.info(f"flask_server={C['flask_server']}, flask_port={C['flask_port']}")
|
||||
logger.info("qlib successfully initialized based on %s settings." % default_conf)
|
||||
logger.info(f"data_path={C.get_data_path()}")
|
||||
|
||||
|
||||
def _mount_nfs_uri(C):
|
||||
|
||||
@@ -11,26 +11,27 @@ Two modes are supported
|
||||
|
||||
"""
|
||||
|
||||
import copy
|
||||
from pathlib import Path
|
||||
import re
|
||||
import os
|
||||
import re
|
||||
import copy
|
||||
import logging
|
||||
import multiprocessing
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Config:
|
||||
def __init__(self, default_conf):
|
||||
self.__dict__["_default_config"] = default_conf # avoiding conflictions with __getattr__
|
||||
self.__dict__["_default_config"] = copy.deepcopy(default_conf) # avoiding conflictions with __getattr__
|
||||
self.reset()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.__dict__["_config"][key]
|
||||
|
||||
def __getattr__(self, attr):
|
||||
try:
|
||||
if attr in self.__dict__["_config"]:
|
||||
return self.__dict__["_config"][attr]
|
||||
except KeyError:
|
||||
return AttributeError(f"No such {attr} in self._config")
|
||||
|
||||
raise AttributeError(f"No such {attr} in self._config")
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
self.__dict__["_config"][key] = value
|
||||
@@ -59,6 +60,9 @@ class Config:
|
||||
def update(self, *args, **kwargs):
|
||||
self.__dict__["_config"].update(*args, **kwargs)
|
||||
|
||||
def set_conf_from_C(self, config_c):
|
||||
self.update(**config_c.__dict__["_config"])
|
||||
|
||||
|
||||
# REGION CONST
|
||||
REG_CN = "cn"
|
||||
@@ -86,7 +90,6 @@ _default_config = {
|
||||
# How many tasks belong to one process. Recommend 1 for high-frequency data and None for daily data.
|
||||
"maxtasksperchild": None,
|
||||
"default_disk_cache": 1, # 0:skip/1:use
|
||||
"disable_disk_cache": False, # disable disk cache; if High-frequency data generally disable_disk_cache=True
|
||||
"mem_cache_size_limit": 500,
|
||||
# memory cache expiry time in seconds; only used in 'DatasetURICache' and 'client D.calendar'
|
||||
# default 1 hour
|
||||
@@ -184,9 +187,17 @@ MODE_CONF = {
|
||||
"timeout": 100,
|
||||
"logging_level": "INFO",
|
||||
"region": REG_CN,
|
||||
## Custom Operator
|
||||
"custom_ops": [],
|
||||
},
|
||||
}
|
||||
|
||||
HIGH_FREQ_CONFIG = {
|
||||
"provider_uri": "~/.qlib/qlib_data/yahoo_cn_1min",
|
||||
"dataset_cache": None,
|
||||
"expression_cache": "DiskExpressionCache",
|
||||
"region": REG_CN,
|
||||
}
|
||||
|
||||
_default_region_config = {
|
||||
REG_CN: {
|
||||
@@ -207,6 +218,10 @@ class QlibConfig(Config):
|
||||
LOCAL_URI = "local"
|
||||
NFS_URI = "nfs"
|
||||
|
||||
def __init__(self, default_conf):
|
||||
super().__init__(default_conf)
|
||||
self._registered = False
|
||||
|
||||
def set_mode(self, mode):
|
||||
# raise KeyError
|
||||
self.update(MODE_CONF[mode])
|
||||
@@ -243,6 +258,64 @@ class QlibConfig(Config):
|
||||
else:
|
||||
raise NotImplementedError(f"This type of uri is not supported")
|
||||
|
||||
def set(self, default_conf="client", **kwargs):
|
||||
from .utils import set_log_with_config, get_module_logger, can_use_cache
|
||||
|
||||
self.reset()
|
||||
|
||||
_logging_config = self.logging_config
|
||||
if "logging_config" in kwargs:
|
||||
_logging_config = kwargs["logging_config"]
|
||||
|
||||
# set global config
|
||||
if _logging_config:
|
||||
set_log_with_config(_logging_config)
|
||||
|
||||
# FIXME: this logger ignored the level in config
|
||||
logger = get_module_logger("Initialization", level=logging.INFO)
|
||||
logger.info(f"default_conf: {default_conf}.")
|
||||
|
||||
self.set_mode(default_conf)
|
||||
self.set_region(kwargs.get("region", self["region"] if "region" in self else REG_CN))
|
||||
|
||||
for k, v in kwargs.items():
|
||||
if k not in self:
|
||||
logger.warning("Unrecognized config %s" % k)
|
||||
self[k] = v
|
||||
|
||||
self.resolve_path()
|
||||
|
||||
if not (self["expression_cache"] is None and self["dataset_cache"] is None):
|
||||
# check redis
|
||||
if not can_use_cache():
|
||||
logger.warning(
|
||||
f"redis connection failed(host={self['redis_host']} port={self['redis_port']}), cache will not be used!"
|
||||
)
|
||||
self["expression_cache"] = None
|
||||
self["dataset_cache"] = None
|
||||
|
||||
def register(self):
|
||||
from .utils import init_instance_by_config
|
||||
from .data.ops import register_all_ops
|
||||
from .data.data import register_all_wrappers
|
||||
from .workflow import R, QlibRecorder
|
||||
from .workflow.utils import experiment_exit_handler
|
||||
|
||||
register_all_ops(self)
|
||||
register_all_wrappers(self)
|
||||
# set up QlibRecorder
|
||||
exp_manager = init_instance_by_config(self["exp_manager"])
|
||||
qr = QlibRecorder(exp_manager)
|
||||
R.register(qr)
|
||||
# clean up experiment when python program ends
|
||||
experiment_exit_handler()
|
||||
|
||||
self._registered = True
|
||||
|
||||
@property
|
||||
def registered(self):
|
||||
return self._registered
|
||||
|
||||
|
||||
# global config
|
||||
C = QlibConfig(_default_config)
|
||||
|
||||
@@ -1,9 +1,324 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
from .order import Order
|
||||
from .account import Account
|
||||
from .position import Position
|
||||
from .exchange import Exchange
|
||||
from .report import Report
|
||||
from .backtest import backtest as backtest_func, get_date_range
|
||||
|
||||
import numpy as np
|
||||
import inspect
|
||||
from ...utils import init_instance_by_config
|
||||
from ...log import get_module_logger
|
||||
from ...config import C
|
||||
|
||||
logger = get_module_logger("backtest caller")
|
||||
|
||||
|
||||
def get_strategy(
|
||||
strategy=None,
|
||||
topk=50,
|
||||
margin=0.5,
|
||||
n_drop=5,
|
||||
risk_degree=0.95,
|
||||
str_type="dropout",
|
||||
adjust_dates=None,
|
||||
):
|
||||
"""get_strategy
|
||||
|
||||
There are 3 ways to return a strategy. Please follow the code.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
strategy : Strategy()
|
||||
strategy used in backtest.
|
||||
topk : int (Default value: 50)
|
||||
top-N stocks to buy.
|
||||
margin : int or float(Default value: 0.5)
|
||||
- if isinstance(margin, int):
|
||||
|
||||
sell_limit = margin
|
||||
|
||||
- else:
|
||||
|
||||
sell_limit = pred_in_a_day.count() * margin
|
||||
|
||||
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
|
||||
sell_limit should be no less than topk.
|
||||
n_drop : int
|
||||
number of stocks to be replaced in each trading date.
|
||||
risk_degree: float
|
||||
0-1, 0.95 for example, use 95% money to trade.
|
||||
str_type: 'amount', 'weight' or 'dropout'
|
||||
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
:class: Strategy
|
||||
an initialized strategy object
|
||||
"""
|
||||
|
||||
# There will be 3 ways to return a strategy.
|
||||
if strategy is None:
|
||||
# 1) `strategy` is not given: create a Topk* strategy selected by `str_type` from the other parameters
|
||||
str_cls_dict = {
|
||||
"amount": "TopkAmountStrategy",
|
||||
"weight": "TopkWeightStrategy",
|
||||
"dropout": "TopkDropoutStrategy",
|
||||
}
|
||||
logger.info("Create new strategy ")
|
||||
from .. import strategy as strategy_pool
|
||||
|
||||
str_cls = getattr(strategy_pool, str_cls_dict.get(str_type))
|
||||
strategy = str_cls(
|
||||
topk=topk,
|
||||
buffer_margin=margin,
|
||||
n_drop=n_drop,
|
||||
risk_degree=risk_degree,
|
||||
adjust_dates=adjust_dates,
|
||||
)
|
||||
elif isinstance(strategy, (dict, str)):
|
||||
# 2) create strategy with init_instance_by_config
|
||||
logger.info("Create new strategy ")
|
||||
strategy = init_instance_by_config(strategy)
|
||||
|
||||
from ..strategy.strategy import BaseStrategy
|
||||
|
||||
# else: nothing happens. 3) Use the strategy directly
|
||||
if not isinstance(strategy, BaseStrategy):
|
||||
raise TypeError("Strategy not supported")
|
||||
return strategy
|
||||
|
||||
|
||||
def get_exchange(
|
||||
pred,
|
||||
exchange=None,
|
||||
subscribe_fields=[],
|
||||
open_cost=0.0015,
|
||||
close_cost=0.0025,
|
||||
min_cost=5.0,
|
||||
trade_unit=None,
|
||||
limit_threshold=None,
|
||||
deal_price=None,
|
||||
extract_codes=False,
|
||||
shift=1,
|
||||
):
|
||||
"""get_exchange
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
# exchange related arguments
|
||||
exchange: Exchange().
|
||||
subscribe_fields: list
|
||||
subscribe fields.
|
||||
open_cost : float
|
||||
open transaction cost.
|
||||
close_cost : float
|
||||
close transaction cost.
|
||||
min_cost : float
|
||||
min transaction cost.
|
||||
trade_unit : int
|
||||
100 for China A.
|
||||
deal_price: str
|
||||
dealing price type: 'close', 'open', 'vwap'.
|
||||
limit_threshold : float
|
||||
limit move 0.1 (10%) for example, long and short with same limit.
|
||||
extract_codes: bool
|
||||
whether to pass the codes extracted from the pred to the exchange.
|
||||
NOTE: This will be faster with offline qlib.
|
||||
|
||||
Returns
|
||||
-------
|
||||
:class: Exchange
|
||||
an initialized Exchange object
|
||||
"""
|
||||
|
||||
if trade_unit is None:
|
||||
trade_unit = C.trade_unit
|
||||
if limit_threshold is None:
|
||||
limit_threshold = C.limit_threshold
|
||||
if deal_price is None:
|
||||
deal_price = C.deal_price
|
||||
if exchange is None:
|
||||
logger.info("Create new exchange")
|
||||
# handle exception for deal_price
|
||||
if deal_price[0] != "$":
|
||||
deal_price = "$" + deal_price
|
||||
if extract_codes:
|
||||
codes = sorted(pred.index.get_level_values("instrument").unique())
|
||||
else:
|
||||
codes = "all" # TODO: We must ensure that 'all.txt' includes all the stocks
|
||||
|
||||
dates = sorted(pred.index.get_level_values("datetime").unique())
|
||||
dates = np.append(dates, get_date_range(dates[-1], left_shift=1, right_shift=shift))
|
||||
|
||||
exchange = Exchange(
|
||||
trade_dates=dates,
|
||||
codes=codes,
|
||||
deal_price=deal_price,
|
||||
subscribe_fields=subscribe_fields,
|
||||
limit_threshold=limit_threshold,
|
||||
open_cost=open_cost,
|
||||
close_cost=close_cost,
|
||||
min_cost=min_cost,
|
||||
trade_unit=trade_unit,
|
||||
)
|
||||
return exchange
|
||||
|
||||
|
||||
def get_executor(
|
||||
executor=None,
|
||||
trade_exchange=None,
|
||||
verbose=True,
|
||||
):
|
||||
"""get_executor
|
||||
|
||||
There are 3 ways to return an executor. Please follow the code.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
executor : BaseExecutor
|
||||
executor used in backtest.
|
||||
trade_exchange : Exchange
|
||||
exchange used in executor
|
||||
verbose : bool
|
||||
whether to print log.
|
||||
|
||||
Returns
|
||||
-------
|
||||
:class: BaseExecutor
|
||||
an initialized BaseExecutor object
|
||||
"""
|
||||
|
||||
# There are 3 ways to return an executor.
|
||||
if executor is None:
|
||||
# 1) `executor` is not given: create a SimulatorExecutor from `trade_exchange` and `verbose`
|
||||
logger.info("Create new executor ")
|
||||
from ..online.executor import SimulatorExecutor
|
||||
|
||||
executor = SimulatorExecutor(trade_exchange=trade_exchange, verbose=verbose)
|
||||
elif isinstance(executor, (dict, str)):
|
||||
# 2) create executor with config
|
||||
logger.info("Create new executor ")
|
||||
executor = init_instance_by_config(executor)
|
||||
|
||||
from ..online.executor import BaseExecutor
|
||||
|
||||
# 3) Use the executor directly
|
||||
if not isinstance(executor, BaseExecutor):
|
||||
raise TypeError("Executor not supported")
|
||||
return executor
|
||||
|
||||
|
||||
# This API is kept for compatibility with legacy code
|
||||
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, return_order=False, **kwargs):
|
||||
"""This function will help you set a reasonable Exchange and provide default value for strategy
|
||||
Parameters
|
||||
----------
|
||||
|
||||
- **backtest workflow related or common arguments**
|
||||
|
||||
pred : pandas.DataFrame
|
||||
the prediction should have a <datetime, instrument> index and one `score` column.
|
||||
account : float
|
||||
init account value.
|
||||
shift : int
|
||||
whether to shift prediction by one day.
|
||||
benchmark : str
|
||||
benchmark code, default is SH000905 CSI 500.
|
||||
verbose : bool
|
||||
whether to print log.
|
||||
return_order : bool
|
||||
whether to return order list
|
||||
|
||||
- **strategy related arguments**
|
||||
|
||||
strategy : Strategy()
|
||||
strategy used in backtest.
|
||||
topk : int (Default value: 50)
|
||||
top-N stocks to buy.
|
||||
margin : int or float(Default value: 0.5)
|
||||
- if isinstance(margin, int):
|
||||
|
||||
sell_limit = margin
|
||||
|
||||
- else:
|
||||
|
||||
sell_limit = pred_in_a_day.count() * margin
|
||||
|
||||
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
|
||||
sell_limit should be no less than topk.
|
||||
n_drop : int
|
||||
number of stocks to be replaced in each trading date.
|
||||
risk_degree: float
|
||||
0-1, 0.95 for example, use 95% money to trade.
|
||||
str_type: 'amount', 'weight' or 'dropout'
|
||||
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
|
||||
|
||||
- **exchange related arguments**
|
||||
|
||||
exchange: Exchange()
|
||||
pass the exchange for speeding up.
|
||||
subscribe_fields: list
|
||||
subscribe fields.
|
||||
open_cost : float
|
||||
open transaction cost. The default value is 0.0015 (0.15%).
|
||||
close_cost : float
|
||||
close transaction cost. The default value is 0.0025 (0.25%).
|
||||
min_cost : float
|
||||
min transaction cost.
|
||||
trade_unit : int
|
||||
100 for China A.
|
||||
deal_price: str
|
||||
dealing price type: 'close', 'open', 'vwap'.
|
||||
limit_threshold : float
|
||||
limit move 0.1 (10%) for example, long and short with same limit.
|
||||
extract_codes: bool
|
||||
whether to pass the codes extracted from the pred to the exchange.
|
||||
|
||||
.. note:: This will be faster with offline qlib.
|
||||
|
||||
- **executor related arguments**
|
||||
|
||||
executor : BaseExecutor()
|
||||
executor used in backtest.
|
||||
verbose : bool
|
||||
whether to print log.
|
||||
|
||||
"""
|
||||
# check strategy:
|
||||
spec = inspect.getfullargspec(get_strategy)
|
||||
str_args = {k: v for k, v in kwargs.items() if k in spec.args}
|
||||
strategy = get_strategy(**str_args)
|
||||
|
||||
# init exchange:
|
||||
spec = inspect.getfullargspec(get_exchange)
|
||||
ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
|
||||
trade_exchange = get_exchange(pred, **ex_args)
|
||||
|
||||
# init executor:
|
||||
executor = get_executor(executor=kwargs.get("executor"), trade_exchange=trade_exchange, verbose=verbose)
|
||||
|
||||
# run backtest
|
||||
report_dict = backtest_func(
|
||||
pred=pred,
|
||||
strategy=strategy,
|
||||
executor=executor,
|
||||
trade_exchange=trade_exchange,
|
||||
shift=shift,
|
||||
verbose=verbose,
|
||||
account=account,
|
||||
benchmark=benchmark,
|
||||
return_order=return_order,
|
||||
)
|
||||
# for compatibility of the old API. return the dict positions
|
||||
|
||||
positions = report_dict.get("positions")
|
||||
report_dict.update({"positions": {k: p.position for k, p in positions.items()}})
|
||||
return report_dict
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from ...utils import get_date_by_shift, get_date_range
|
||||
from ..online.executor import SimulatorExecutor
|
||||
from ...data import D
|
||||
from .account import Account
|
||||
from ...config import C
|
||||
@@ -15,7 +14,7 @@ from ...data.dataset.utils import get_level_index
|
||||
LOG = get_module_logger("backtest")
|
||||
|
||||
|
||||
def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark):
|
||||
def backtest(pred, strategy, executor, trade_exchange, shift, verbose, account, benchmark, return_order):
|
||||
"""Parameters
|
||||
----------
|
||||
pred : pandas.DataFrame
|
||||
@@ -69,9 +68,9 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
|
||||
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
|
||||
bench = _temp_result.groupby(level="datetime")[_temp_result.columns.tolist()[0]].mean()
|
||||
|
||||
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
|
||||
executor = SimulatorExecutor(trade_exchange, verbose=verbose)
|
||||
|
||||
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
|
||||
if return_order:
|
||||
multi_order_list = []
|
||||
# trading apart
|
||||
for pred_date, trade_date in zip(predict_dates, trade_dates):
|
||||
# for loop predict date and trading date
|
||||
@@ -103,6 +102,8 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
|
||||
)
|
||||
else:
|
||||
order_list = []
|
||||
if return_order:
|
||||
multi_order_list.append((trade_account, order_list, trade_date))
|
||||
# 4. Get result after executing order list
|
||||
# NOTE: The following operation will modify order.amount.
|
||||
# NOTE: If it is buy and the cash is insufficient, the tradable amount will be recalculated
|
||||
@@ -115,7 +116,11 @@ def backtest(pred, strategy, trade_exchange, shift, verbose, account, benchmark)
|
||||
report_df = trade_account.report.generate_report_dataframe()
|
||||
report_df["bench"] = bench
|
||||
positions = trade_account.get_positions()
|
||||
return report_df, positions
|
||||
|
||||
report_dict = {"report_df": report_df, "positions": positions}
|
||||
if return_order:
|
||||
report_dict.update({"order_list": multi_order_list})
|
||||
return report_dict
|
||||
|
||||
|
||||
def update_account(trade_account, trade_info, trade_exchange, trade_date):
|
||||
|
||||
@@ -49,10 +49,12 @@ class Alpha360(DataHandlerLP):
|
||||
instruments="csi500",
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
freq="day",
|
||||
infer_processors=_DEFAULT_INFER_PROCESSORS,
|
||||
learn_processors=_DEFAULT_LEARN_PROCESSORS,
|
||||
fit_start_time=None,
|
||||
fit_end_time=None,
|
||||
filter_pipe=None,
|
||||
**kwargs,
|
||||
):
|
||||
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
|
||||
@@ -65,13 +67,15 @@ class Alpha360(DataHandlerLP):
|
||||
"feature": self.get_feature_config(),
|
||||
"label": kwargs.get("label", self.get_label_config()),
|
||||
},
|
||||
"filter_pipe": filter_pipe,
|
||||
"freq": freq,
|
||||
},
|
||||
}
|
||||
|
||||
super().__init__(
|
||||
instruments,
|
||||
start_time,
|
||||
end_time,
|
||||
instruments=instruments,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
data_loader=data_loader,
|
||||
learn_processors=learn_processors,
|
||||
infer_processors=infer_processors,
|
||||
@@ -130,11 +134,13 @@ class Alpha158(DataHandlerLP):
|
||||
instruments="csi500",
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
freq="day",
|
||||
infer_processors=[],
|
||||
learn_processors=_DEFAULT_LEARN_PROCESSORS,
|
||||
fit_start_time=None,
|
||||
fit_end_time=None,
|
||||
process_type=DataHandlerLP.PTYPE_A,
|
||||
filter_pipe=None,
|
||||
**kwargs,
|
||||
):
|
||||
infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time)
|
||||
@@ -143,13 +149,18 @@ class Alpha158(DataHandlerLP):
|
||||
data_loader = {
|
||||
"class": "QlibDataLoader",
|
||||
"kwargs": {
|
||||
"config": {"feature": self.get_feature_config(), "label": kwargs.get("label", self.get_label_config())},
|
||||
"config": {
|
||||
"feature": self.get_feature_config(),
|
||||
"label": kwargs.get("label", self.get_label_config()),
|
||||
},
|
||||
"filter_pipe": filter_pipe,
|
||||
"freq": freq,
|
||||
},
|
||||
}
|
||||
super().__init__(
|
||||
instruments,
|
||||
start_time,
|
||||
end_time,
|
||||
instruments=instruments,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
data_loader=data_loader,
|
||||
infer_processors=infer_processors,
|
||||
learn_processors=learn_processors,
|
||||
|
||||
@@ -6,17 +6,16 @@ from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import inspect
|
||||
import warnings
|
||||
from ..log import get_module_logger
|
||||
from . import strategy as strategy_pool
|
||||
from .strategy.strategy import BaseStrategy
|
||||
from .backtest.exchange import Exchange
|
||||
from .backtest.backtest import backtest as backtest_func, get_date_range
|
||||
from .backtest import get_exchange, backtest as backtest_func
|
||||
from .backtest.backtest import get_date_range
|
||||
|
||||
from ..data import D
|
||||
from ..config import C
|
||||
from ..data.dataset.utils import get_level_index
|
||||
|
||||
|
||||
logger = get_module_logger("Evaluate")
|
||||
|
||||
|
||||
@@ -46,144 +45,6 @@ def risk_analysis(r, N=252):
|
||||
return res
|
||||
|
||||
|
||||
def get_strategy(
|
||||
strategy=None,
|
||||
topk=50,
|
||||
margin=0.5,
|
||||
n_drop=5,
|
||||
risk_degree=0.95,
|
||||
str_type="amount",
|
||||
adjust_dates=None,
|
||||
):
|
||||
"""get_strategy
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
strategy : Strategy()
|
||||
strategy used in backtest.
|
||||
topk : int (Default value: 50)
|
||||
top-N stocks to buy.
|
||||
margin : int or float(Default value: 0.5)
|
||||
- if isinstance(margin, int):
|
||||
|
||||
sell_limit = margin
|
||||
|
||||
- else:
|
||||
|
||||
sell_limit = pred_in_a_day.count() * margin
|
||||
|
||||
buffer margin, in single score_mode, continue holding stock if it is in nlargest(sell_limit).
|
||||
sell_limit should be no less than topk.
|
||||
n_drop : int
|
||||
number of stocks to be replaced in each trading date.
|
||||
risk_degree: float
|
||||
0-1, 0.95 for example, use 95% money to trade.
|
||||
str_type: 'amount', 'weight' or 'dropout'
|
||||
strategy type: TopkAmountStrategy ,TopkWeightStrategy or TopkDropoutStrategy.
|
||||
|
||||
Returns
|
||||
-------
|
||||
:class: Strategy
|
||||
an initialized strategy object
|
||||
"""
|
||||
if strategy is None:
|
||||
str_cls_dict = {
|
||||
"amount": "TopkAmountStrategy",
|
||||
"weight": "TopkWeightStrategy",
|
||||
"dropout": "TopkDropoutStrategy",
|
||||
}
|
||||
logger.info("Create new streategy ")
|
||||
str_cls = getattr(strategy_pool, str_cls_dict.get(str_type))
|
||||
strategy = str_cls(
|
||||
topk=topk,
|
||||
buffer_margin=margin,
|
||||
n_drop=n_drop,
|
||||
risk_degree=risk_degree,
|
||||
adjust_dates=adjust_dates,
|
||||
)
|
||||
if not isinstance(strategy, BaseStrategy):
|
||||
raise TypeError("Strategy not supported")
|
||||
return strategy
|
||||
|
||||
|
||||
def get_exchange(
|
||||
pred,
|
||||
exchange=None,
|
||||
subscribe_fields=[],
|
||||
open_cost=0.0015,
|
||||
close_cost=0.0025,
|
||||
min_cost=5.0,
|
||||
trade_unit=None,
|
||||
limit_threshold=None,
|
||||
deal_price=None,
|
||||
extract_codes=False,
|
||||
shift=1,
|
||||
):
|
||||
"""get_exchange
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
# exchange related arguments
|
||||
exchange: Exchange().
|
||||
subscribe_fields: list
|
||||
subscribe fields.
|
||||
open_cost : float
|
||||
open transaction cost.
|
||||
close_cost : float
|
||||
close transaction cost.
|
||||
min_cost : float
|
||||
min transaction cost.
|
||||
trade_unit : int
|
||||
100 for China A.
|
||||
deal_price: str
|
||||
dealing price type: 'close', 'open', 'vwap'.
|
||||
limit_threshold : float
|
||||
limit move 0.1 (10%) for example, long and short with same limit.
|
||||
extract_codes: bool
|
||||
will we pass the codes extracted from the pred to the exchange.
|
||||
NOTE: This will be faster with offline qlib.
|
||||
|
||||
Returns
|
||||
-------
|
||||
:class: Exchange
|
||||
an initialized Exchange object
|
||||
"""
|
||||
|
||||
if trade_unit is None:
|
||||
trade_unit = C.trade_unit
|
||||
if limit_threshold is None:
|
||||
limit_threshold = C.limit_threshold
|
||||
if deal_price is None:
|
||||
deal_price = C.deal_price
|
||||
if exchange is None:
|
||||
logger.info("Create new exchange")
|
||||
# handle exception for deal_price
|
||||
if deal_price[0] != "$":
|
||||
deal_price = "$" + deal_price
|
||||
if extract_codes:
|
||||
codes = sorted(pred.index.get_level_values("instrument").unique())
|
||||
else:
|
||||
codes = "all" # TODO: We must ensure that 'all.txt' includes all the stocks
|
||||
|
||||
dates = sorted(pred.index.get_level_values("datetime").unique())
|
||||
dates = np.append(dates, get_date_range(dates[-1], shift=shift))
|
||||
|
||||
exchange = Exchange(
|
||||
trade_dates=dates,
|
||||
codes=codes,
|
||||
deal_price=deal_price,
|
||||
subscribe_fields=subscribe_fields,
|
||||
limit_threshold=limit_threshold,
|
||||
open_cost=open_cost,
|
||||
close_cost=close_cost,
|
||||
min_cost=min_cost,
|
||||
trade_unit=trade_unit,
|
||||
)
|
||||
return exchange
|
||||
|
||||
|
||||
# This is the API for compatibility for legacy code
|
||||
def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **kwargs):
|
||||
"""This function will help you set a reasonable Exchange and provide default value for strategy
|
||||
@@ -249,30 +110,22 @@ def backtest(pred, account=1e9, shift=1, benchmark="SH000905", verbose=True, **k
|
||||
will we pass the codes extracted from the pred to the exchange.
|
||||
|
||||
.. note:: This will be faster with offline qlib.
|
||||
|
||||
- **executor related arguments**
|
||||
|
||||
executor : BaseExecutor()
|
||||
executor used in backtest.
|
||||
verbose : bool
|
||||
whether to print log.
|
||||
|
||||
"""
|
||||
# check strategy:
|
||||
spec = inspect.getfullargspec(get_strategy)
|
||||
str_args = {k: v for k, v in kwargs.items() if k in spec.args}
|
||||
strategy = get_strategy(**str_args)
|
||||
|
||||
# init exchange:
|
||||
spec = inspect.getfullargspec(get_exchange)
|
||||
ex_args = {k: v for k, v in kwargs.items() if k in spec.args}
|
||||
trade_exchange = get_exchange(pred, **ex_args)
|
||||
|
||||
# run backtest
|
||||
report_df, positions = backtest_func(
|
||||
pred=pred,
|
||||
strategy=strategy,
|
||||
trade_exchange=trade_exchange,
|
||||
shift=shift,
|
||||
verbose=verbose,
|
||||
account=account,
|
||||
benchmark=benchmark,
|
||||
warnings.warn(
|
||||
"this function is deprecated, please use backtest function in qlib.contrib.backtest", DeprecationWarning
|
||||
)
|
||||
# for compatibility of the old API. return the dict positions
|
||||
positions = {k: p.position for k, p in positions.items()}
|
||||
return report_df, positions
|
||||
report_dict = backtest_func(
|
||||
pred=pred, account=account, shift=shift, benchmark=benchmark, verbose=verbose, return_order=False, **kwargs
|
||||
)
|
||||
return report_dict.get("report_df"), report_dict.get("positions")
|
||||
|
||||
|
||||
def long_short_backtest(
|
||||
@@ -340,7 +193,7 @@ def long_short_backtest(
|
||||
|
||||
_pred_dates = pred.index.get_level_values(level="datetime")
|
||||
predict_dates = D.calendar(start_time=_pred_dates.min(), end_time=_pred_dates.max())
|
||||
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], shift=shift))
|
||||
trade_dates = np.append(predict_dates[shift:], get_date_range(predict_dates[-1], left_shift=1, right_shift=shift))
|
||||
|
||||
long_returns = {}
|
||||
short_returns = {}
|
||||
|
||||
@@ -56,7 +56,7 @@ class ALSTM(Model):
|
||||
early_stop=20,
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
@@ -58,7 +58,7 @@ class ALSTM(Model):
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
n_jobs=10,
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
@@ -204,8 +204,8 @@ class ALSTM(Model):
|
||||
verbose=True,
|
||||
save_path=None,
|
||||
):
|
||||
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
|
||||
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
|
||||
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
@@ -260,7 +260,7 @@ class ALSTM(Model):
|
||||
if not self._fitted:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
|
||||
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
|
||||
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
|
||||
dl_test.config(fillna_type="ffill+bfill")
|
||||
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
|
||||
self.ALSTM_model.eval()
|
||||
|
||||
@@ -61,7 +61,7 @@ class GATs(Model):
|
||||
with_pretrain=True,
|
||||
model_path=None,
|
||||
optimizer="adam",
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
@@ -249,8 +249,8 @@ class GATs(Model):
|
||||
save_path=None,
|
||||
):
|
||||
|
||||
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
|
||||
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
|
||||
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
@@ -332,7 +332,7 @@ class GATs(Model):
|
||||
if not self._fitted:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
|
||||
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
|
||||
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
|
||||
dl_test.config(fillna_type="ffill+bfill")
|
||||
sampler_test = DailyBatchSampler(dl_test)
|
||||
test_loader = DataLoader(dl_test, sampler=sampler_test, num_workers=self.n_jobs)
|
||||
|
||||
@@ -56,7 +56,7 @@ class GRU(Model):
|
||||
early_stop=20,
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
@@ -58,7 +58,7 @@ class GRU(Model):
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
n_jobs=10,
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
@@ -204,8 +204,8 @@ class GRU(Model):
|
||||
verbose=True,
|
||||
save_path=None,
|
||||
):
|
||||
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
|
||||
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
|
||||
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
@@ -260,7 +260,7 @@ class GRU(Model):
|
||||
if not self._fitted:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
|
||||
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
|
||||
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
|
||||
dl_test.config(fillna_type="ffill+bfill")
|
||||
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
|
||||
self.GRU_model.eval()
|
||||
|
||||
@@ -56,7 +56,7 @@ class LSTM(Model):
|
||||
early_stop=20,
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
@@ -58,7 +58,7 @@ class LSTM(Model):
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
n_jobs=10,
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
@@ -204,8 +204,8 @@ class LSTM(Model):
|
||||
verbose=True,
|
||||
save_path=None,
|
||||
):
|
||||
dl_train = dataset.prepare("train", data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", data_key=DataHandlerLP.DK_L)
|
||||
dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||
|
||||
dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader
|
||||
@@ -260,7 +260,7 @@ class LSTM(Model):
|
||||
if not self._fitted:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
|
||||
dl_test = dataset.prepare("test", data_key=DataHandlerLP.DK_I)
|
||||
dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
|
||||
dl_test.config(fillna_type="ffill+bfill")
|
||||
test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
|
||||
self.LSTM_model.eval()
|
||||
|
||||
@@ -60,7 +60,7 @@ class DNNModelPytorch(Model):
|
||||
lr_decay_steps=100,
|
||||
optimizer="gd",
|
||||
loss="mse",
|
||||
GPU="0",
|
||||
GPU=0,
|
||||
seed=None,
|
||||
weight_decay=0.0,
|
||||
**kwargs
|
||||
@@ -259,7 +259,7 @@ class DNNModelPytorch(Model):
|
||||
loss = torch.mul(sqr_loss, w).mean()
|
||||
return loss
|
||||
elif loss_type == "binary":
|
||||
loss = nn.BCELoss()
|
||||
loss = nn.BCELoss(weight=w)
|
||||
return loss(pred, target)
|
||||
else:
|
||||
raise NotImplementedError("loss {} is not supported!".format(loss_type))
|
||||
@@ -296,7 +296,7 @@ class DNNModelPytorch(Model):
|
||||
self._fitted = True
|
||||
|
||||
|
||||
class AverageMeter(object):
|
||||
class AverageMeter:
|
||||
"""Computes and stores the average and current value"""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
@@ -464,7 +464,7 @@ class SFM(Model):
|
||||
return pd.Series(np.concatenate(preds), index=index)
|
||||
|
||||
|
||||
class AverageMeter(object):
|
||||
class AverageMeter:
|
||||
"""Computes and stores the average and current value"""
|
||||
|
||||
def __init__(self):
|
||||
|
||||
642
qlib/contrib/model/pytorch_tabnet.py
Normal file
642
qlib/contrib/model/pytorch_tabnet.py
Normal file
@@ -0,0 +1,642 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import copy
|
||||
from sklearn.metrics import roc_auc_score, mean_squared_error
|
||||
import logging
|
||||
from ...utils import (
|
||||
unpack_archive_with_buffer,
|
||||
save_multiple_parts_file,
|
||||
create_save_path,
|
||||
drop_nan_by_y_index,
|
||||
)
|
||||
from ...log import get_module_logger, TimeInspector
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.nn.functional as F
|
||||
from torch.autograd import Function
|
||||
|
||||
from ...model.base import Model
|
||||
from ...data.dataset import DatasetH
|
||||
from ...data.dataset.handler import DataHandlerLP
|
||||
|
||||
|
||||
class TabnetModel(Model):
|
||||
def __init__(
    self,
    d_feat=158,
    out_dim=64,
    final_out_dim=1,
    batch_size=4096,
    n_d=64,
    n_a=64,
    n_shared=2,
    n_ind=2,
    n_steps=5,
    n_epochs=100,
    pretrain_n_epochs=50,
    relax=1.3,
    vbs=2048,
    seed=993,
    optimizer="adam",
    loss="mse",
    metric="",
    early_stop=20,
    GPU="1",
    pretrain_loss="custom",
    ps=0.3,
    lr=0.01,
    pretrain=True,
    pretrain_file="./pretrain/best.model",
):
    """
    TabNet model for Qlib

    Args:
        d_feat: number of input features (``inp_dim`` of the encoder)
        out_dim: output dimension of the encoder / input dimension of the decoder
        final_out_dim: output dimension of the head added in ``fit``
        batch_size: number of samples per batch
        n_d, n_a, n_shared, n_ind, n_steps: architecture sizes forwarded to the encoder
            (``n_shared``, ``n_ind`` and ``n_steps`` are also forwarded to the decoder)
        n_epochs: maximum number of fine-tuning epochs
        pretrain_n_epochs: maximum number of pre-training epochs
        relax: relax coefficient forwarded to the encoder
        vbs: virtual batch size forwarded to the encoder and decoder
        seed: seed given to ``np.random.seed`` and ``torch.manual_seed``
        optimizer: "adam" or "gd" (case-insensitive)
        loss: name of the training loss (see ``loss_fn``)
        metric: name of the evaluation metric (see ``metric_fn``)
        early_stop: number of non-improving epochs tolerated before stopping
        GPU: id used to build the ``cuda:<id>`` device string when CUDA is available
        pretrain_loss: stored on the instance
        ps: probability to generate the bernoulli mask
        lr: learning rate of both optimizers
        pretrain: whether ``fit`` runs the pre-training stage first
        pretrain_file: path where the best pre-trained encoder is saved

    Raises:
        NotImplementedError: if ``optimizer`` is neither "adam" nor "gd"
    """
    # set hyper-parameters.
    self.d_feat = d_feat
    self.out_dim = out_dim
    self.final_out_dim = final_out_dim
    self.lr = lr
    self.batch_size = batch_size
    self.optimizer = optimizer.lower()
    self.pretrain_loss = pretrain_loss
    self.seed = seed
    self.ps = ps
    self.n_epochs = n_epochs
    self.logger = get_module_logger("TabNet")
    self.pretrain_n_epochs = pretrain_n_epochs
    self.device = "cuda:%s" % (GPU) if torch.cuda.is_available() else "cpu"
    self.loss = loss
    self.metric = metric
    self.early_stop = early_stop
    self.pretrain = pretrain
    self.pretrain_file = pretrain_file
    # `predict` checks this flag, so it has to exist before `fit` is ever called
    self._fitted = False
    self.logger.info(
        "TabNet:"
        "\nbatch_size : {}"
        "\nvirtual bs : {}"
        "\nGPU : {}"
        "\npretrain: {}".format(self.batch_size, vbs, GPU, pretrain)
    )
    np.random.seed(self.seed)
    torch.manual_seed(self.seed)

    # Forward the architecture hyper-parameters instead of silently falling back to the
    # encoder's own defaults (which equal this constructor's defaults).
    self.tabnet_model = TabNet(
        inp_dim=self.d_feat,
        out_dim=self.out_dim,
        n_d=n_d,
        n_a=n_a,
        n_shared=n_shared,
        n_ind=n_ind,
        n_steps=n_steps,
        relax=relax,
        vbs=vbs,
        device=self.device,
    ).to(self.device)
    self.tabnet_decoder = TabNet_Decoder(self.out_dim, self.d_feat, n_shared, n_ind, vbs, n_steps, self.device).to(
        self.device
    )

    # pre-training updates encoder and decoder together; fine-tuning updates the encoder
    pretrain_params = list(self.tabnet_model.parameters()) + list(self.tabnet_decoder.parameters())
    if self.optimizer == "adam":
        self.pretrain_optimizer = optim.Adam(pretrain_params, lr=self.lr)
        self.train_optimizer = optim.Adam(self.tabnet_model.parameters(), lr=self.lr)
    elif self.optimizer == "gd":
        self.pretrain_optimizer = optim.SGD(pretrain_params, lr=self.lr)
        self.train_optimizer = optim.SGD(self.tabnet_model.parameters(), lr=self.lr)
    else:
        raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
|
||||
|
||||
def pretrain_fn(self, dataset=DatasetH, pretrain_file="./pretrain/best.model"):
    """
    Pre-train the encoder together with the decoder and keep the best encoder weights.

    The "pretrain" and "pretrain_validation" segments of `dataset` are used. After every
    epoch the encoder `state_dict` is written to `pretrain_file` whenever the validation
    loss improves; training stops after `self.early_stop` consecutive epochs without
    improvement.

    Args:
        dataset: dataset providing the "pretrain" and "pretrain_validation" segments
        pretrain_file: path where the best encoder `state_dict` is saved
    """
    # make the target directory if it does not exist yet
    save_dir = os.path.dirname(pretrain_file)
    if save_dir and not os.path.exists(save_dir):
        self.logger.info("make folder to store model...")
        os.makedirs(save_dir, exist_ok=True)

    [df_train, df_valid] = dataset.prepare(
        ["pretrain", "pretrain_validation"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )

    df_train.fillna(df_train.mean(), inplace=True)
    df_valid.fillna(df_valid.mean(), inplace=True)

    x_train = df_train["feature"]
    x_valid = df_valid["feature"]

    # Early stop setup
    stop_steps = 0
    best_loss = np.inf

    for epoch_idx in range(self.pretrain_n_epochs):
        self.logger.info("epoch: %s" % (epoch_idx))
        self.logger.info("pre-training...")
        self.pretrain_epoch(x_train)
        self.logger.info("evaluating...")
        train_loss = self.pretrain_test_epoch(x_train)
        valid_loss = self.pretrain_test_epoch(x_valid)
        self.logger.info("train %.6f, valid %.6f" % (train_loss, valid_loss))

        if valid_loss < best_loss:
            self.logger.info("Save Model...")
            torch.save(self.tabnet_model.state_dict(), pretrain_file)
            best_loss = valid_loss
            # an improvement restarts the patience window (same rule as in `fit`)
            stop_steps = 0
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break
|
||||
|
||||
def fit(
    self,
    dataset: DatasetH,
    evals_result=None,
    verbose=True,
    save_path=None,
):
    """
    Optionally pre-train the encoder, then fine-tune it with an extra output layer.

    Args:
        dataset: dataset providing the "train" and "valid" segments (and the pre-training
            segments when `self.pretrain` is set)
        evals_result: optional dict; the per-epoch scores are stored under the keys
            "train" and "valid". A fresh dict is used when omitted.
        verbose: not used by this implementation
        save_path: not used by this implementation
    """
    if evals_result is None:
        # avoid sharing one default dict between calls
        evals_result = {}

    if self.pretrain:
        # run the pre-training stage, then load the best encoder it saved
        self.logger.info("Pretrain...")
        self.pretrain_fn(dataset, self.pretrain_file)
        self.logger.info("Load Pretrain model")
        self.tabnet_model.load_state_dict(torch.load(self.pretrain_file))

    # adding one more linear layer to fit the final output dimension
    self.tabnet_model = FinetuneModel(self.out_dim, self.final_out_dim, self.tabnet_model).to(self.device)
    # `train_optimizer` was built from the bare encoder, so it does not know the new
    # output layer; register it, otherwise the layer keeps its random initial weights.
    self.train_optimizer.add_param_group({"params": list(self.tabnet_model.fc.parameters())})

    df_train, df_valid = dataset.prepare(
        ["train", "valid"],
        col_set=["feature", "label"],
        data_key=DataHandlerLP.DK_L,
    )
    df_train.fillna(df_train.mean(), inplace=True)
    x_train, y_train = df_train["feature"], df_train["label"]
    x_valid, y_valid = df_valid["feature"], df_valid["label"]

    stop_steps = 0
    # `metric_fn` returns the negated loss, so a larger score is better
    best_score = -np.inf
    best_epoch = 0
    evals_result["train"] = []
    evals_result["valid"] = []

    self.logger.info("training...")
    self._fitted = True

    for epoch_idx in range(self.n_epochs):
        self.logger.info("epoch: %s" % (epoch_idx))
        self.logger.info("training...")
        self.train_epoch(x_train, y_train)
        self.logger.info("evaluating...")
        train_loss, train_score = self.test_epoch(x_train, y_train)
        valid_loss, val_score = self.test_epoch(x_valid, y_valid)
        self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
        evals_result["train"].append(train_score)
        evals_result["valid"].append(val_score)

        if val_score > best_score:
            best_score = val_score
            stop_steps = 0
            best_epoch = epoch_idx
        else:
            stop_steps += 1
            if stop_steps >= self.early_stop:
                self.logger.info("early stop")
                break
    self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
|
||||
|
||||
def predict(self, dataset):
    """
    Predict on the "test" segment of `dataset`.

    Args:
        dataset: dataset providing the "test" segment

    Returns:
        pd.Series of predictions indexed like the prepared test features

    Raises:
        ValueError: if the model has not been fitted
    """
    # `getattr` keeps this a ValueError even when the flag was never created
    if not getattr(self, "_fitted", False):
        raise ValueError("model is not fitted yet!")

    x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
    index = x_test.index
    self.tabnet_model.eval()
    # `from_numpy` shares memory with the array returned by `x_test.values`
    x_values = torch.from_numpy(x_test.values)
    x_values[torch.isnan(x_values)] = 0
    sample_num = x_values.shape[0]
    preds = []

    for begin in range(0, sample_num, self.batch_size):
        end = min(begin + self.batch_size, sample_num)

        x_batch = x_values[begin:end].float().to(self.device)
        priors = torch.ones(end - begin, self.d_feat).to(self.device)

        with torch.no_grad():
            pred = self.tabnet_model(x_batch, priors).detach().cpu().numpy()

        # a batch holding a single sample may come back 0-d, which
        # `np.concatenate` rejects
        preds.append(np.atleast_1d(pred))

    return pd.Series(np.concatenate(preds), index=index)
|
||||
|
||||
def test_epoch(self, data_x, data_y):
|
||||
# prepare training data
|
||||
x_values = torch.from_numpy(data_x.values)
|
||||
y_values = torch.from_numpy(np.squeeze(data_y.values))
|
||||
x_values[torch.isnan(x_values)] = 0
|
||||
y_values[torch.isnan(y_values)] = 0
|
||||
self.tabnet_model.eval()
|
||||
|
||||
scores = []
|
||||
losses = []
|
||||
|
||||
indices = np.arange(len(x_values))
|
||||
|
||||
for i in range(len(indices))[:: self.batch_size]:
|
||||
|
||||
if len(indices) - i < self.batch_size:
|
||||
break
|
||||
feature = x_values[indices[i : i + self.batch_size]].float().to(self.device)
|
||||
label = y_values[indices[i : i + self.batch_size]].float().to(self.device)
|
||||
priors = torch.ones(self.batch_size, self.d_feat).to(self.device)
|
||||
pred = self.tabnet_model(feature, priors)
|
||||
loss = self.loss_fn(pred, label)
|
||||
losses.append(loss.item())
|
||||
|
||||
score = self.metric_fn(pred, label)
|
||||
scores.append(score.item())
|
||||
|
||||
return np.mean(losses), np.mean(scores)
|
||||
|
||||
def train_epoch(self, x_train, y_train):
|
||||
x_train_values = torch.from_numpy(x_train.values)
|
||||
y_train_values = torch.from_numpy(np.squeeze(y_train.values))
|
||||
x_train_values[torch.isnan(x_train_values)] = 0
|
||||
y_train_values[torch.isnan(y_train_values)] = 0
|
||||
self.tabnet_model.train()
|
||||
|
||||
indices = np.arange(len(x_train_values))
|
||||
np.random.shuffle(indices)
|
||||
|
||||
for i in range(len(indices))[:: self.batch_size]:
|
||||
|
||||
if len(indices) - i < self.batch_size:
|
||||
break
|
||||
|
||||
feature = x_train_values[indices[i : i + self.batch_size]].float().to(self.device)
|
||||
label = y_train_values[indices[i : i + self.batch_size]].float().to(self.device)
|
||||
priors = torch.ones(self.batch_size, self.d_feat).to(self.device)
|
||||
pred = self.tabnet_model(feature, priors)
|
||||
loss = self.loss_fn(pred, label)
|
||||
|
||||
self.train_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_value_(self.tabnet_model.parameters(), 3.0)
|
||||
self.train_optimizer.step()
|
||||
|
||||
def pretrain_epoch(self, x_train):
|
||||
train_set = torch.from_numpy(x_train.values)
|
||||
train_set[torch.isnan(train_set)] = 0
|
||||
indices = np.arange(len(train_set))
|
||||
np.random.shuffle(indices)
|
||||
|
||||
self.tabnet_model.train()
|
||||
self.tabnet_decoder.train()
|
||||
|
||||
for i in range(len(indices))[:: self.batch_size]:
|
||||
|
||||
if len(indices) - i < self.batch_size:
|
||||
break
|
||||
|
||||
S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
|
||||
x_train_values = train_set[indices[i : i + self.batch_size]] * (1 - S_mask)
|
||||
y_train_values = train_set[indices[i : i + self.batch_size]] * (S_mask)
|
||||
|
||||
S_mask = S_mask.to(self.device)
|
||||
feature = x_train_values.float().to(self.device)
|
||||
label = y_train_values.float().to(self.device)
|
||||
priors = 1 - S_mask
|
||||
(vec, sparse_loss) = self.tabnet_model(feature, priors)
|
||||
f = self.tabnet_decoder(vec)
|
||||
loss = self.pretrain_loss_fn(label, f, S_mask)
|
||||
|
||||
self.pretrain_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
self.pretrain_optimizer.step()
|
||||
|
||||
def pretrain_test_epoch(self, x_train):
|
||||
train_set = torch.from_numpy(x_train.values)
|
||||
train_set[torch.isnan(train_set)] = 0
|
||||
indices = np.arange(len(train_set))
|
||||
|
||||
self.tabnet_model.eval()
|
||||
self.tabnet_decoder.eval()
|
||||
|
||||
losses = []
|
||||
|
||||
for i in range(len(indices))[:: self.batch_size]:
|
||||
|
||||
if len(indices) - i < self.batch_size:
|
||||
break
|
||||
|
||||
S_mask = torch.bernoulli(torch.empty(self.batch_size, self.d_feat).fill_(self.ps))
|
||||
x_train_values = train_set[indices[i : i + self.batch_size]] * (1 - S_mask)
|
||||
y_train_values = train_set[indices[i : i + self.batch_size]] * (S_mask)
|
||||
|
||||
feature = x_train_values.float().to(self.device)
|
||||
label = y_train_values.float().to(self.device)
|
||||
S_mask = S_mask.to(self.device)
|
||||
priors = 1 - S_mask
|
||||
(vec, sparse_loss) = self.tabnet_model(feature, priors)
|
||||
f = self.tabnet_decoder(vec)
|
||||
|
||||
loss = self.pretrain_loss_fn(label, f, S_mask)
|
||||
losses.append(loss.item())
|
||||
|
||||
return np.mean(losses)
|
||||
|
||||
def pretrain_loss_fn(self, f_hat, f, S):
    """
    Pretrain loss function defined in the original paper, read "Tabular self-supervised learning" in https://arxiv.org/pdf/1908.07442.pdf

    The masked difference between `f_hat` and `f` is divided, per column, by the square
    root of the summed squared deviation of `f` from its column mean (taken over dim 0),
    then squared and summed over all elements.

    NOTE(review): the callers in this class pass the masked ground truth as `f_hat` and
    the decoder output as `f`, so the normalisation uses the decoder output; confirm
    this is intended.
    """
    centered = f - f.mean(dim=0)
    column_scale = centered.square().sum(dim=0).sqrt()
    masked_residual = (f_hat - f) * S
    return (masked_residual / column_scale).square().sum()
|
||||
|
||||
def loss_fn(self, pred, label):
|
||||
mask = ~torch.isnan(label)
|
||||
if self.loss == "mse":
|
||||
return self.mse(pred[mask], label[mask])
|
||||
raise ValueError("unknown loss `%s`" % self.loss)
|
||||
|
||||
def metric_fn(self, pred, label):
|
||||
mask = torch.isfinite(label)
|
||||
if self.metric == "" or self.metric == "loss":
|
||||
return -self.loss_fn(pred[mask], label[mask])
|
||||
raise ValueError("unknown metric `%s`" % self.metric)
|
||||
|
||||
def mse(self, pred, label):
    """Return the mean of the squared element-wise difference between `pred` and `label`."""
    return ((pred - label) ** 2).mean()
|
||||
|
||||
|
||||
class FinetuneModel(nn.Module):
    """
    FinetuneModel for adding a layer by the end

    Wraps a trained model whose forward returns a tuple and feeds the first element of
    that tuple through an extra linear layer.
    """

    def __init__(self, input_dim, output_dim, trained_model):
        super().__init__()
        self.model = trained_model
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x, priors):
        vec = self.model(x, priors)[0]  # take the vec out
        # Only drop the trailing output axis: a bare `squeeze()` would also remove the
        # batch axis when the batch holds a single sample.
        return self.fc(vec).squeeze(-1)
|
||||
|
||||
|
||||
class DecoderStep(nn.Module):
    """One decoder step: a feature transformer followed by a linear layer."""

    def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
        super().__init__()
        self.fea_tran = FeatureTransformer(inp_dim, out_dim, shared, n_ind, vbs, device)
        self.fc = nn.Linear(out_dim, out_dim)

    def forward(self, x):
        transformed = self.fea_tran(x)
        return self.fc(transformed)
|
||||
|
||||
|
||||
class TabNet_Decoder(nn.Module):
    def __init__(self, inp_dim, out_dim, n_shared, n_ind, vbs, n_steps, device):
        """
        TabNet decoder that is used in pre-training

        Args:
            inp_dim: width of the encoded input
            out_dim: width of the reconstructed output
            n_shared: number of linear layers shared by all decoder steps
            n_ind: forwarded to every `DecoderStep`
            vbs: virtual batch size, forwarded to every `DecoderStep`
            n_steps: number of decoder steps whose outputs are summed
            device: forwarded to every `DecoderStep`
        """
        super().__init__()
        self.out_dim = out_dim
        self.n_steps = n_steps

        if n_shared > 0:
            # preset the linear functions we will use: the first one consumes the input,
            # the following ones consume `out_dim` features
            in_widths = [inp_dim] + [out_dim] * (n_shared - 1)
            self.shared = nn.ModuleList([nn.Linear(width, 2 * out_dim) for width in in_widths])
        else:
            self.shared = None

        self.steps = nn.ModuleList(
            [DecoderStep(inp_dim, out_dim, self.shared, n_ind, vbs, device) for _ in range(n_steps)]
        )

    def forward(self, x):
        total = torch.zeros(x.size(0), self.out_dim).to(x.device)
        for decoder_step in self.steps:
            total += decoder_step(x)
        return total
|
||||
|
||||
|
||||
class TabNet(nn.Module):
    def __init__(
        self, inp_dim=6, out_dim=6, n_d=64, n_a=64, n_shared=2, n_ind=2, n_steps=5, relax=1.2, vbs=1024, device="cpu"
    ):
        """
        TabNet AKA the original encoder

        Args:
            inp_dim: number of input features
            out_dim: width of the encoder output
            n_d: dimension of the features used to calculate the final results
            n_a: dimension of the features input to the attention transformer of the next step
            n_shared: number of shared steps in feature transformer (optional)
            n_ind: number of independent steps in feature transformer
            n_steps: number of steps of pass through tabnet
            relax: relax coefficient
            vbs: virtual batch size
            device: forwarded to the feature transformers
        """
        super().__init__()
        hidden = n_d + n_a

        # set the number of shared step in feature transformer
        if n_shared > 0:
            # preset the linear functions we will use
            in_widths = [inp_dim] + [hidden] * (n_shared - 1)
            self.shared = nn.ModuleList([nn.Linear(width, 2 * hidden) for width in in_widths])
        else:
            self.shared = None

        self.first_step = FeatureTransformer(inp_dim, hidden, self.shared, n_ind, vbs, device)
        self.steps = nn.ModuleList(
            [DecisionStep(inp_dim, n_d, n_a, self.shared, n_ind, relax, vbs, device) for _ in range(n_steps - 1)]
        )
        self.fc = nn.Linear(n_d, out_dim)
        self.bn = nn.BatchNorm1d(inp_dim, momentum=0.01)
        self.n_d = n_d

    def forward(self, x, priors):
        """Return `(output, sparse_loss)` for a batch `x`; `x` must not contain NaN."""
        assert not torch.isnan(x).any()
        x = self.bn(x)
        attention_in = self.first_step(x)[:, self.n_d :]
        sparse_loss = torch.zeros(1).to(x.device)
        decision = torch.zeros(x.size(0), self.n_d).to(x.device)
        for step in self.steps:
            transformed, step_loss = step(x, attention_in, priors)
            # split the feature from feat_transformer: the first n_d columns feed the
            # output, the remaining columns feed the next attention transformer
            decision += F.relu(transformed[:, : self.n_d])
            attention_in = transformed[:, self.n_d :]
            sparse_loss += step_loss
        return self.fc(decision), sparse_loss
|
||||
|
||||
|
||||
class GBN(nn.Module):
    """
    Ghost Batch Normalization
    an efficient way of doing batch normalization

    The batch is split into `x.size(0) // vbs` chunks along dim 0 and every chunk is
    normalised by the same `BatchNorm1d` layer.

    Args:
        inp: number of features
        vbs: virtual batch size
        momentum: momentum of the wrapped `BatchNorm1d`
    """

    def __init__(self, inp, vbs=1024, momentum=0.01):
        super().__init__()
        self.bn = nn.BatchNorm1d(inp, momentum=momentum)
        self.vbs = vbs

    def forward(self, x):
        n_chunks = x.size(0) // self.vbs
        if n_chunks <= 1:
            # A batch smaller than `vbs` would ask `torch.chunk` for 0 chunks, which is
            # an error; normalise it as one chunk instead.
            return self.bn(x)
        chunk = torch.chunk(x, n_chunks, 0)
        res = [self.bn(y) for y in chunk]
        return torch.cat(res, 0)
|
||||
|
||||
|
||||
class GLU(nn.Module):
    """
    GLU block that extracts only the most essential information

    The input goes through a linear layer producing `2 * out_dim` features and a ghost
    batch normalisation; the first half is then multiplied by the sigmoid of the second.

    Args:
        inp_dim: input width, used only when no `fc` is given
        out_dim: output width
        fc: optional existing linear layer to use instead of creating one
        vbs: virtual batch size
    """

    def __init__(self, inp_dim, out_dim, fc=None, vbs=1024):
        super().__init__()
        self.fc = fc if fc else nn.Linear(inp_dim, out_dim * 2)
        self.bn = GBN(out_dim * 2, vbs=vbs)
        self.od = out_dim

    def forward(self, x):
        normed = self.bn(self.fc(x))
        value = normed[:, : self.od]
        gate = normed[:, self.od :]
        return torch.mul(value, torch.sigmoid(gate))
|
||||
|
||||
|
||||
class AttentionTransformer(nn.Module):
    """
    Produces a sparse feature mask from the attention features of the previous step.

    Args:
        d_a: width of the attention features
        inp_dim: number of input features (width of the mask)
        relax: relax coefficient. The greater it is, we can
            use the same features more. When it is set to 1
            we can use every feature only once
        vbs: virtual batch size
    """

    def __init__(self, d_a, inp_dim, relax, vbs=1024):
        super().__init__()
        self.fc = nn.Linear(d_a, inp_dim)
        self.bn = GBN(inp_dim, vbs=vbs)
        self.r = relax

    # a:feature from previous decision step
    def forward(self, a, priors):
        scores = self.bn(self.fc(a))
        mask = SparsemaxFunction.apply(scores * priors)
        # updating the prior
        # NOTE(review): this rebinds the local name only and only `mask` is returned, so
        # the caller's `priors` is unchanged.
        priors = priors * (self.r - mask)
        return mask
|
||||
|
||||
|
||||
class FeatureTransformer(nn.Module):
    """
    Stack of GLU blocks: optional blocks built on shared linear layers, followed by
    independent blocks. Blocks whose input and output widths match are applied as
    residuals scaled by sqrt(0.5).

    Args:
        inp_dim: input width
        out_dim: output width
        shared: sequence of shared linear layers, or None / empty for no shared blocks
        n_ind: number of independent GLU blocks
        vbs: virtual batch size
        device: device on which the scale constant is created
    """

    def __init__(self, inp_dim, out_dim, shared, n_ind, vbs, device):
        super().__init__()
        first = True
        self.shared = nn.ModuleList()
        if shared:
            self.shared.append(GLU(inp_dim, out_dim, shared[0], vbs=vbs))
            first = False
            for fc in shared[1:]:
                self.shared.append(GLU(out_dim, out_dim, fc, vbs=vbs))
        else:
            self.shared = None
        self.independ = nn.ModuleList()
        if first:
            # without shared blocks the first independent block maps inp_dim -> out_dim
            self.independ.append(GLU(inp_dim, out_dim, vbs=vbs))
        for _ in range(first, n_ind):
            self.independ.append(GLU(out_dim, out_dim, vbs=vbs))
        self.scale = torch.sqrt(torch.tensor([0.5], device=device))

    def forward(self, x):
        independ = list(self.independ)
        if self.shared:
            x = self.shared[0](x)
            for glu in self.shared[1:]:
                x = torch.add(x, glu(x))
                x = x * self.scale
        else:
            # The first independent block changes the width, so it cannot be used as a
            # residual; apply it directly.
            x = independ[0](x)
            independ = independ[1:]
        for glu in independ:
            x = torch.add(x, glu(x))
            x = x * self.scale
        return x
|
||||
|
||||
|
||||
class DecisionStep(nn.Module):
    """
    One step for the TabNet

    An attention transformer selects features through a mask, and a feature transformer
    processes the masked input.
    """

    def __init__(self, inp_dim, n_d, n_a, shared, n_ind, relax, vbs, device):
        super().__init__()
        self.atten_tran = AttentionTransformer(n_a, inp_dim, relax, vbs)
        self.fea_tran = FeatureTransformer(inp_dim, n_d + n_a, shared, n_ind, vbs, device)

    def forward(self, x, a, priors):
        """Return the transformed masked input and the mean entropy term of the mask."""
        mask = self.atten_tran(a, priors)
        # 1e-10 keeps the logarithm finite where the mask is exactly 0
        sparse_loss = torch.mean((-1) * mask * torch.log(mask + 1e-10))
        transformed = self.fea_tran(x * mask)
        return transformed, sparse_loss
|
||||
|
||||
|
||||
def make_ix_like(input, dim=0):
    """
    Return the indices 1..d (d = size of `input` along `dim`) as a tensor with the dtype
    and device of `input`, shaped to broadcast against `input` along `dim`.
    """
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)


class SparsemaxFunction(Function):
    """
    SparseMax function for replacing reLU
    """

    @staticmethod
    def forward(ctx, input, dim=-1):
        """Apply sparsemax to `input` along `dim`; `input` itself is left untouched."""
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        # Same numerical stability trick as for softmax. Done out of place so that the
        # tensor handed in by the caller is not modified.
        shifted = input - max_val
        tau, supp_size = SparsemaxFunction.threshold_and_support(shifted, dim=dim)
        output = torch.clamp(shifted - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. `input`, and None for the `dim` argument."""
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        # entries outside the support receive no gradient
        grad_input[output == 0] = 0

        v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze(dim)
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None

    @staticmethod
    def threshold_and_support(input, dim=-1):
        """Return the threshold `tau` and the support size along `dim`, both keeping `dim`."""
        input_srt, _ = torch.sort(input, descending=True, dim=dim)
        input_cumsum = input_srt.cumsum(dim) - 1
        rhos = make_ix_like(input, dim)
        support = rhos * input_srt > input_cumsum

        support_size = support.sum(dim=dim).unsqueeze(dim)
        tau = input_cumsum.gather(dim, support_size - 1)
        tau /= support_size.to(input.dtype)
        return tau, support_size
|
||||
@@ -21,7 +21,7 @@ from .executor import SimulatorExecutor
|
||||
from .executor import save_score_series, load_score_series
|
||||
|
||||
|
||||
class Operator(object):
|
||||
class Operator:
|
||||
def __init__(self, client: str):
|
||||
"""
|
||||
Parameters
|
||||
|
||||
@@ -38,7 +38,7 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
|
||||
:param df:
|
||||
:return:
|
||||
"""
|
||||
|
||||
index_names = df.index.names
|
||||
df.index = df.index.strftime("%Y-%m-%d")
|
||||
|
||||
report_df = pd.DataFrame()
|
||||
@@ -58,6 +58,8 @@ def _calculate_report_data(df: pd.DataFrame) -> pd.DataFrame:
|
||||
|
||||
report_df["turnover"] = df["turnover"]
|
||||
report_df.sort_index(ascending=True, inplace=True)
|
||||
|
||||
report_df.index.names = index_names
|
||||
return report_df
|
||||
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ from plotly.figure_factory import create_distplot
|
||||
from ...utils import get_module_by_module_path
|
||||
|
||||
|
||||
class BaseGraph(object):
|
||||
class BaseGraph:
|
||||
""""""
|
||||
|
||||
_name = None
|
||||
@@ -204,7 +204,7 @@ class HistogramGraph(BaseGraph):
|
||||
return _data
|
||||
|
||||
|
||||
class SubplotsGraph(object):
|
||||
class SubplotsGraph:
|
||||
"""Create subplots same as df.plot(subplots=True)
|
||||
|
||||
Simple package for `plotly.tools.subplots`
|
||||
|
||||
@@ -30,7 +30,7 @@ class BaseStrategy:
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
score_series : pd.Seires
|
||||
score_series : pd.Series
|
||||
stock_id , score.
|
||||
current : Position()
|
||||
current state of position.
|
||||
|
||||
@@ -6,7 +6,7 @@ import copy
|
||||
import os
|
||||
|
||||
|
||||
class TunerConfigManager(object):
|
||||
class TunerConfigManager:
|
||||
def __init__(self, config_path):
|
||||
|
||||
if not config_path:
|
||||
@@ -27,7 +27,7 @@ class TunerConfigManager(object):
|
||||
self.qlib_client_config = config.get("qlib_client", dict())
|
||||
|
||||
|
||||
class PipelineExperimentConfig(object):
|
||||
class PipelineExperimentConfig:
|
||||
def __init__(self, config, TUNER_CONFIG_MANAGER):
|
||||
"""
|
||||
:param config: The config dict for tuner experiment
|
||||
@@ -53,7 +53,7 @@ class PipelineExperimentConfig(object):
|
||||
yaml.dump(TUNER_CONFIG_MANAGER.config, fp)
|
||||
|
||||
|
||||
class OptimizationConfig(object):
|
||||
class OptimizationConfig:
|
||||
def __init__(self, config, TUNER_CONFIG_MANAGER):
|
||||
|
||||
self.report_type = config.get("report_type", "pred_long")
|
||||
|
||||
@@ -11,7 +11,7 @@ from ...log import get_module_logger, TimeInspector
|
||||
from ...utils import get_module_by_module_path
|
||||
|
||||
|
||||
class Pipeline(object):
|
||||
class Pipeline:
|
||||
|
||||
GLOBAL_BEST_PARAMS_NAME = "global_best_params.json"
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ from hyperopt import fmin, tpe
|
||||
from hyperopt import STATUS_OK, STATUS_FAIL
|
||||
|
||||
|
||||
class Tuner(object):
|
||||
class Tuner:
|
||||
def __init__(self, tuner_config, optim_config):
|
||||
|
||||
self.logger = get_module_logger("Tuner", sh_level=logging.INFO)
|
||||
|
||||
@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
|
||||
cdef class Expanding(object):
|
||||
cdef class Expanding:
|
||||
"""1-D array expanding"""
|
||||
cdef vector[double] barv
|
||||
cdef int na_count
|
||||
|
||||
@@ -8,7 +8,7 @@ from libc.math cimport sqrt, isnan, NAN
|
||||
from libcpp.deque cimport deque
|
||||
|
||||
|
||||
cdef class Rolling(object):
|
||||
cdef class Rolling:
|
||||
"""1-D array rolling"""
|
||||
cdef int window
|
||||
cdef deque[double] barv
|
||||
|
||||
@@ -157,7 +157,7 @@ class Expression(abc.ABC):
|
||||
|
||||
@abc.abstractmethod
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
pass
|
||||
raise NotImplementedError("This function must be implemented in your newly defined feature")
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_longest_back_rolling(self):
|
||||
|
||||
@@ -13,6 +13,7 @@ import pickle
|
||||
import traceback
|
||||
import redis_lock
|
||||
import contextlib
|
||||
import abc
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@@ -32,43 +33,107 @@ from ..utils import (
|
||||
from ..log import get_module_logger
|
||||
from .base import Feature
|
||||
|
||||
from .ops import *
|
||||
from .ops import Operators
|
||||
|
||||
|
||||
class QlibCacheException(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class MemCacheUnit(OrderedDict):
|
||||
class MemCacheUnit(abc.ABC):
|
||||
"""Memory Cache Unit."""
|
||||
|
||||
# TODO: use min_heap to replace ordereddict for better performance
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.size_limit = kwargs.pop("size_limit", None)
|
||||
# limit_type: check size_limit type, length(call fun: len) or size(call fun: sys.getsizeof)
|
||||
self.limit_type = kwargs.pop("limit_type", "length")
|
||||
super(MemCacheUnit, self).__init__(*args, **kwargs)
|
||||
self._check_size_limit()
|
||||
self.size_limit = kwargs.pop("size_limit", 0)
|
||||
self._size = 0
|
||||
self.od = OrderedDict()
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
super(MemCacheUnit, self).__setitem__(key, value)
|
||||
self._check_size_limit()
|
||||
# TODO: thread safe?__setitem__ failure might cause inconsistent size?
|
||||
|
||||
def __getitem__(self, key):
|
||||
value = super(MemCacheUnit, self).__getitem__(key)
|
||||
super(MemCacheUnit, self).__delitem__(key)
|
||||
super(MemCacheUnit, self).__setitem__(key, value)
|
||||
return value
|
||||
# precalculate the size after od.__setitem__
|
||||
self._adjust_size(key, value)
|
||||
|
||||
def _check_size_limit(self):
|
||||
if self.size_limit is not None:
|
||||
get_cur_size = lambda x: len(x) if self.limit_type == "length" else sum(map(sys.getsizeof, x.values()))
|
||||
while get_cur_size(self) > self.size_limit:
|
||||
self.od.__setitem__(key, value)
|
||||
|
||||
# move the key to end,make it latest
|
||||
self.od.move_to_end(key)
|
||||
|
||||
if self.limited:
|
||||
# pop the oldest items beyond size limit
|
||||
while self._size > self.size_limit:
|
||||
self.popitem(last=False)
|
||||
|
||||
def __getitem__(self, key):
|
||||
v = self.od.__getitem__(key)
|
||||
self.od.move_to_end(key)
|
||||
return v
|
||||
|
||||
class MemCache(object):
|
||||
def __contains__(self, key):
|
||||
return key in self.od
|
||||
|
||||
def __len__(self):
|
||||
return self.od.__len__()
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__name__}<size_limit:{self.size_limit if self.limited else 'no limit'} total_size:{self._size}>\n{self.od.__repr__()}"
|
||||
|
||||
def set_limit_size(self, limit):
|
||||
self.size_limit = limit
|
||||
|
||||
@property
|
||||
def limited(self):
|
||||
"""whether memory cache is limited"""
|
||||
return self.size_limit > 0
|
||||
|
||||
@property
|
||||
def total_size(self):
|
||||
return self._size
|
||||
|
||||
def clear(self):
|
||||
self._size = 0
|
||||
self.od.clear()
|
||||
|
||||
def popitem(self, last=True):
|
||||
k, v = self.od.popitem(last=last)
|
||||
self._size -= self._get_value_size(v)
|
||||
|
||||
return k, v
|
||||
|
||||
def pop(self, key):
|
||||
v = self.od.pop(key)
|
||||
self._size -= self._get_value_size(v)
|
||||
|
||||
return v
|
||||
|
||||
def _adjust_size(self, key, value):
|
||||
if key in self.od:
|
||||
self._size -= self._get_value_size(self.od[key])
|
||||
|
||||
self._size += self._get_value_size(value)
|
||||
|
||||
@abc.abstractmethod
|
||||
def _get_value_size(self, value):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class MemCacheLengthUnit(MemCacheUnit):
|
||||
def __init__(self, size_limit=0):
|
||||
super().__init__(size_limit=size_limit)
|
||||
|
||||
def _get_value_size(self, value):
|
||||
return 1
|
||||
|
||||
|
||||
class MemCacheSizeofUnit(MemCacheUnit):
|
||||
def __init__(self, size_limit=0):
|
||||
super().__init__(size_limit=size_limit)
|
||||
|
||||
def _get_value_size(self, value):
|
||||
return sys.getsizeof(value)
|
||||
|
||||
|
||||
class MemCache:
|
||||
"""Memory cache."""
|
||||
|
||||
def __init__(self, mem_cache_size_limit=None, limit_type="length"):
|
||||
@@ -79,21 +144,19 @@ class MemCache(object):
|
||||
mem_cache_size_limit: cache max size.
|
||||
limit_type: length or sizeof; length(call fun: len), size(call fun: sys.getsizeof).
|
||||
"""
|
||||
if limit_type not in ["length", "sizeof"]:
|
||||
|
||||
size_limit = C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit
|
||||
|
||||
if limit_type == "length":
|
||||
klass = MemCacheLengthUnit
|
||||
elif limit_type == "sizeof":
|
||||
klass = MemCacheSizeofUnit
|
||||
else:
|
||||
raise ValueError(f"limit_type must be length or sizeof, your limit_type is {limit_type}")
|
||||
|
||||
self.__calendar_mem_cache = MemCacheUnit(
|
||||
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
|
||||
limit_type=limit_type,
|
||||
)
|
||||
self.__instrument_mem_cache = MemCacheUnit(
|
||||
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
|
||||
limit_type=limit_type,
|
||||
)
|
||||
self.__feature_mem_cache = MemCacheUnit(
|
||||
size_limit=C.mem_cache_size_limit if mem_cache_size_limit is None else mem_cache_size_limit,
|
||||
limit_type=limit_type,
|
||||
)
|
||||
self.__calendar_mem_cache = klass(size_limit)
|
||||
self.__instrument_mem_cache = klass(size_limit)
|
||||
self.__feature_mem_cache = klass(size_limit)
|
||||
|
||||
def __getitem__(self, key):
|
||||
if key == "c":
|
||||
@@ -140,7 +203,7 @@ class MemCacheExpire:
|
||||
return value, expire
|
||||
|
||||
|
||||
class CacheUtils(object):
|
||||
class CacheUtils:
|
||||
LOCK_ID = "QLIB"
|
||||
|
||||
@staticmethod
|
||||
@@ -224,7 +287,7 @@ class CacheUtils(object):
|
||||
current_cache_wlock.release()
|
||||
|
||||
|
||||
class BaseProviderCache(object):
|
||||
class BaseProviderCache:
|
||||
"""Provider cache base class"""
|
||||
|
||||
def __init__(self, provider):
|
||||
@@ -762,8 +825,8 @@ class DiskDatasetCache(DatasetCache):
|
||||
|
||||
.. note:: The start is closed. The end is open!!!!!
|
||||
|
||||
- Each line contains two element <timestamp, end_index>
|
||||
- It indicates the `end_index` of the data for `timestamp`
|
||||
- Each line contains two element <start_index, end_index> with a timestamp as its index.
|
||||
- It indicates the `start_index`(included) and `end_index`(excluded) of the data for `timestamp`
|
||||
|
||||
- meta data: cache/d41366901e25de3ec47297f12e2ba11d.meta
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from ..log import get_module_logger
|
||||
import pickle
|
||||
|
||||
|
||||
class Client(object):
|
||||
class Client:
|
||||
"""A client class
|
||||
|
||||
Provide the connection tool functions for ClientProvider.
|
||||
|
||||
@@ -15,14 +15,13 @@ import importlib
|
||||
import traceback
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
|
||||
from .cache import H
|
||||
from ..config import C
|
||||
from .ops import *
|
||||
from .ops import Operators
|
||||
from ..log import get_module_logger
|
||||
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields
|
||||
from ..utils import parse_field, read_bin, hash_args, normalize_cache_fields, code_to_fname
|
||||
from .base import Feature
|
||||
from .cache import DiskDatasetCache, DiskExpressionCache
|
||||
from ..utils import Wrapper, init_instance_by_config, register_wrapper, get_module_by_module_path
|
||||
@@ -118,7 +117,7 @@ class CalendarProvider(abc.ABC):
|
||||
if flag in H["c"]:
|
||||
_calendar, _calendar_index = H["c"][flag]
|
||||
else:
|
||||
_calendar = np.array(self._load_calendar(freq, future))
|
||||
_calendar = np.array(self.load_calendar(freq, future))
|
||||
_calendar_index = {x: i for i, x in enumerate(_calendar)} # for fast search
|
||||
H["c"][flag] = _calendar, _calendar_index
|
||||
return _calendar, _calendar_index
|
||||
@@ -215,20 +214,6 @@ class InstrumentProvider(abc.ABC):
|
||||
return cls.LIST
|
||||
raise ValueError(f"Unknown instrument type {inst}")
|
||||
|
||||
def convert_instruments(self, instrument):
|
||||
_instruments_map = getattr(self, "_instruments_map", None)
|
||||
if _instruments_map is None:
|
||||
_df_list = []
|
||||
# FIXME: each process will read these files
|
||||
for _path in Path(C.get_data_path()).joinpath("instruments").glob("*.txt"):
|
||||
_df = pd.read_csv(_path, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
|
||||
_df_list.append(_df.iloc[:, [0, -1]])
|
||||
df = pd.concat(_df_list, sort=False).sort_values("save_inst")
|
||||
df = df.drop_duplicates(subset=["save_inst"], keep="first").fillna(axis=1, method="ffill")
|
||||
_instruments_map = df.set_index("inst").iloc[:, 0].to_dict()
|
||||
setattr(self, "_instruments_map", _instruments_map)
|
||||
return _instruments_map.get(instrument, instrument)
|
||||
|
||||
|
||||
class FeatureProvider(abc.ABC):
|
||||
"""Feature provider class
|
||||
@@ -481,11 +466,10 @@ class DatasetProvider(abc.ABC):
|
||||
|
||||
"""
|
||||
# FIXME: Windows OS or MacOS using spawn: https://docs.python.org/3.8/library/multiprocessing.html?highlight=spawn#contexts-and-start-methods
|
||||
global C
|
||||
C = g_config
|
||||
# NOTE: This place is compatible with windows, windows multi-process is spawn
|
||||
if getattr(ExpressionD, "_provider", None) is None:
|
||||
register_all_wrappers()
|
||||
if not C.registered:
|
||||
C.set_conf_from_C(g_config)
|
||||
C.register()
|
||||
|
||||
obj = dict()
|
||||
for field in column_names:
|
||||
@@ -520,7 +504,7 @@ class LocalCalendarProvider(CalendarProvider):
|
||||
"""Calendar file uri."""
|
||||
return os.path.join(C.get_data_path(), "calendars", "{}.txt")
|
||||
|
||||
def _load_calendar(self, freq, future):
|
||||
def load_calendar(self, freq, future):
|
||||
"""Load original calendar timestamp from file.
|
||||
|
||||
Parameters
|
||||
@@ -587,10 +571,16 @@ class LocalInstrumentProvider(InstrumentProvider):
|
||||
fname = self._uri_inst.format(market)
|
||||
if not os.path.exists(fname):
|
||||
raise ValueError("instruments not exists for market " + market)
|
||||
|
||||
_instruments = dict()
|
||||
df = pd.read_csv(fname, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
|
||||
df["start_datetime"] = pd.to_datetime(df["start_datetime"])
|
||||
df["end_datetime"] = pd.to_datetime(df["end_datetime"])
|
||||
df = pd.read_csv(
|
||||
fname,
|
||||
sep="\t",
|
||||
usecols=[0, 1, 2],
|
||||
names=["inst", "start_datetime", "end_datetime"],
|
||||
dtype={"inst": str},
|
||||
parse_dates=["start_datetime", "end_datetime"],
|
||||
)
|
||||
for row in df.itertuples(index=False):
|
||||
_instruments.setdefault(row[0], []).append((row[1], row[2]))
|
||||
return _instruments
|
||||
@@ -647,7 +637,7 @@ class LocalFeatureProvider(FeatureProvider):
|
||||
def feature(self, instrument, field, start_index, end_index, freq):
|
||||
# validate
|
||||
field = str(field).lower()[1:]
|
||||
instrument = Inst.convert_instruments(instrument)
|
||||
instrument = code_to_fname(instrument)
|
||||
uri_data = self._uri_data.format(instrument.lower(), field, freq)
|
||||
if not os.path.exists(uri_data):
|
||||
get_module_logger("data").warning("WARN: data not found for %s.%s" % (instrument, field))
|
||||
@@ -682,6 +672,8 @@ class LocalExpressionProvider(ExpressionProvider):
|
||||
series = series.astype(np.float32)
|
||||
except ValueError:
|
||||
pass
|
||||
except TypeError:
|
||||
pass
|
||||
if not series.empty:
|
||||
series = series.loc[start_index:end_index]
|
||||
return series
|
||||
@@ -969,8 +961,7 @@ class BaseProvider:
|
||||
is a provider class.
|
||||
"""
|
||||
disk_cache = C.default_disk_cache if disk_cache is None else disk_cache
|
||||
if C.disable_disk_cache:
|
||||
disk_cache = False
|
||||
fields = list(fields) # In case of tuple.
|
||||
try:
|
||||
return DatasetD.dataset(instruments, fields, start_time, end_time, freq, disk_cache)
|
||||
except TypeError:
|
||||
@@ -1035,15 +1026,34 @@ class ClientProvider(BaseProvider):
|
||||
DatasetD.set_conn(self.client)
|
||||
|
||||
|
||||
Cal = Wrapper()
|
||||
Inst = Wrapper()
|
||||
FeatureD = Wrapper()
|
||||
ExpressionD = Wrapper()
|
||||
DatasetD = Wrapper()
|
||||
D = Wrapper()
|
||||
import sys
|
||||
|
||||
if sys.version_info >= (3, 9):
|
||||
from typing import Annotated
|
||||
|
||||
CalendarProviderWrapper = Annotated[CalendarProvider, Wrapper]
|
||||
InstrumentProviderWrapper = Annotated[InstrumentProvider, Wrapper]
|
||||
FeatureProviderWrapper = Annotated[FeatureProvider, Wrapper]
|
||||
ExpressionProviderWrapper = Annotated[ExpressionProvider, Wrapper]
|
||||
DatasetProviderWrapper = Annotated[DatasetProvider, Wrapper]
|
||||
BaseProviderWrapper = Annotated[BaseProvider, Wrapper]
|
||||
else:
|
||||
CalendarProviderWrapper = CalendarProvider
|
||||
InstrumentProviderWrapper = InstrumentProvider
|
||||
FeatureProviderWrapper = FeatureProvider
|
||||
ExpressionProviderWrapper = ExpressionProvider
|
||||
DatasetProviderWrapper = DatasetProvider
|
||||
BaseProviderWrapper = BaseProvider
|
||||
|
||||
Cal: CalendarProviderWrapper = Wrapper()
|
||||
Inst: InstrumentProviderWrapper = Wrapper()
|
||||
FeatureD: FeatureProviderWrapper = Wrapper()
|
||||
ExpressionD: ExpressionProviderWrapper = Wrapper()
|
||||
DatasetD: DatasetProviderWrapper = Wrapper()
|
||||
D: BaseProviderWrapper = Wrapper()
|
||||
|
||||
|
||||
def register_all_wrappers():
|
||||
def register_all_wrappers(C):
|
||||
"""register_all_wrappers"""
|
||||
logger = get_module_logger("data")
|
||||
module = get_module_by_module_path("qlib.data")
|
||||
@@ -1052,7 +1062,7 @@ def register_all_wrappers():
|
||||
if getattr(C, "calendar_cache", None) is not None:
|
||||
_calendar_provider = init_instance_by_config(C.calendar_cache, module, provide=_calendar_provider)
|
||||
register_wrapper(Cal, _calendar_provider, "qlib.data")
|
||||
logger.debug(f"registering Cal {C.calendar_provider}-{C.calenar_cache}")
|
||||
logger.debug(f"registering Cal {C.calendar_provider}-{C.calendar_cache}")
|
||||
|
||||
register_wrapper(Inst, C.instrument_provider, "qlib.data")
|
||||
logger.debug(f"registering Inst {C.instrument_provider}")
|
||||
|
||||
@@ -76,18 +76,22 @@ class DatasetH(Dataset):
|
||||
- The processing is related to data split.
|
||||
"""
|
||||
|
||||
def __init__(self, handler: Union[dict, DataHandler], segments: list):
|
||||
def __init__(self, handler: Union[dict, DataHandler], segments: dict):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
handler : Union[dict, DataHandler]
|
||||
handler will be passed into setup_data.
|
||||
segments : list
|
||||
segments : dict
|
||||
handler will be passed into setup_data.
|
||||
"""
|
||||
super().__init__(handler, segments)
|
||||
|
||||
def setup_data(self, handler: Union[dict, DataHandler], segments: list):
|
||||
def init(self, **kwargs):
|
||||
"""Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
|
||||
self.handler.init(**kwargs)
|
||||
|
||||
def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
|
||||
"""
|
||||
Setup the underlying data.
|
||||
|
||||
@@ -100,7 +104,7 @@ class DatasetH(Dataset):
|
||||
|
||||
- config of `DataHandler`. Please refer to `DataHandler`
|
||||
|
||||
segments : list
|
||||
segments : dict
|
||||
Describe the options to segment the data.
|
||||
Here are some examples:
|
||||
|
||||
@@ -116,8 +120,8 @@ class DatasetH(Dataset):
|
||||
'outsample': ("2017-01-01", "2020-08-01",),
|
||||
}
|
||||
"""
|
||||
self._handler = init_instance_by_config(handler, accept_types=DataHandler)
|
||||
self._segments = segments.copy()
|
||||
self.handler = init_instance_by_config(handler, accept_types=DataHandler)
|
||||
self.segments = segments.copy()
|
||||
|
||||
def _prepare_seg(self, slc: slice, **kwargs):
|
||||
"""
|
||||
@@ -127,7 +131,7 @@ class DatasetH(Dataset):
|
||||
----------
|
||||
slc : slice
|
||||
"""
|
||||
return self._handler.fetch(slc, **kwargs)
|
||||
return self.handler.fetch(slc, **kwargs)
|
||||
|
||||
def prepare(
|
||||
self,
|
||||
@@ -150,7 +154,7 @@ class DatasetH(Dataset):
|
||||
- ['train', 'valid']
|
||||
|
||||
col_set : str
|
||||
The col_set will be passed to self._handler when fetching data.
|
||||
The col_set will be passed to self.handler when fetching data.
|
||||
data_key : str
|
||||
The data to fetch: DK_*
|
||||
Default is DK_I, which indicate fetching data for **inference**.
|
||||
@@ -166,16 +170,16 @@ class DatasetH(Dataset):
|
||||
logger = get_module_logger("DatasetH")
|
||||
fetch_kwargs = {"col_set": col_set}
|
||||
fetch_kwargs.update(kwargs)
|
||||
if "data_key" in getfullargspec(self._handler.fetch).args:
|
||||
if "data_key" in getfullargspec(self.handler.fetch).args:
|
||||
fetch_kwargs["data_key"] = data_key
|
||||
else:
|
||||
logger.info(f"data_key[{data_key}] is ignored.")
|
||||
|
||||
# Handle all kinds of segments format
|
||||
if isinstance(segments, (list, tuple)):
|
||||
return [self._prepare_seg(slice(*self._segments[seg]), **fetch_kwargs) for seg in segments]
|
||||
return [self._prepare_seg(slice(*self.segments[seg]), **fetch_kwargs) for seg in segments]
|
||||
elif isinstance(segments, str):
|
||||
return self._prepare_seg(slice(*self._segments[segments]), **fetch_kwargs)
|
||||
return self._prepare_seg(slice(*self.segments[segments]), **fetch_kwargs)
|
||||
elif isinstance(segments, slice):
|
||||
return self._prepare_seg(segments, **fetch_kwargs)
|
||||
else:
|
||||
@@ -409,7 +413,7 @@ class TSDatasetH(DatasetH):
|
||||
|
||||
def setup_data(self, *args, **kwargs):
|
||||
super().setup_data(*args, **kwargs)
|
||||
cal = self._handler.fetch(col_set=self._handler.CS_RAW).index.get_level_values("datetime").unique()
|
||||
cal = self.handler.fetch(col_set=self.handler.CS_RAW).index.get_level_values("datetime").unique()
|
||||
cal = sorted(cal)
|
||||
# Get the datatime index for building timestamp
|
||||
self.cal = cal
|
||||
|
||||
@@ -83,22 +83,42 @@ class DataHandler(Serializable):
|
||||
# Setup data loader
|
||||
assert data_loader is not None # to make start_time end_time could have None default value
|
||||
|
||||
# what data source to load data
|
||||
self.data_loader = init_instance_by_config(
|
||||
data_loader,
|
||||
None if (isinstance(data_loader, dict) and "module_path" in data_loader) else data_loader_module,
|
||||
accept_types=DataLoader,
|
||||
)
|
||||
|
||||
# what data to be loaded from data source
|
||||
# For IDE auto-completion.
|
||||
self.instruments = instruments
|
||||
self.start_time = start_time
|
||||
self.end_time = end_time
|
||||
|
||||
self.fetch_orig = fetch_orig
|
||||
if init_data:
|
||||
with TimeInspector.logt("Init data"):
|
||||
self.init()
|
||||
super().__init__()
|
||||
|
||||
def init(self, enable_cache: bool = True):
|
||||
def conf_data(self, **kwargs):
|
||||
"""
|
||||
configuration of data.
|
||||
# what data to be loaded from data source
|
||||
|
||||
This method will be used when loading pickled handler from dataset.
|
||||
The data will be initialized with different time range.
|
||||
|
||||
"""
|
||||
attr_list = {"instruments", "start_time", "end_time"}
|
||||
for k, v in kwargs.items():
|
||||
if k in attr_list:
|
||||
setattr(self, k, v)
|
||||
else:
|
||||
raise KeyError("Such config is not supported.")
|
||||
|
||||
def init(self, enable_cache: bool = False):
|
||||
"""
|
||||
initialize the data.
|
||||
In case of running intialization for multiple time, it will do nothing for the second time.
|
||||
@@ -262,6 +282,7 @@ class DataHandlerLP(DataHandler):
|
||||
infer_processors=[],
|
||||
learn_processors=[],
|
||||
process_type=PTYPE_A,
|
||||
drop_raw=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
@@ -303,6 +324,8 @@ class DataHandlerLP(DataHandler):
|
||||
- self._learn will be processed by infer_processors + learn_processors
|
||||
|
||||
- (e.g. self._infer processed by learn_processors )
|
||||
drop_raw: bool
|
||||
Whether to drop the raw data
|
||||
"""
|
||||
|
||||
# Setup preprocessor
|
||||
@@ -319,6 +342,7 @@ class DataHandlerLP(DataHandler):
|
||||
)
|
||||
|
||||
self.process_type = process_type
|
||||
self.drop_raw = drop_raw
|
||||
super().__init__(instruments, start_time, end_time, data_loader, **kwargs)
|
||||
|
||||
def get_all_processors(self):
|
||||
@@ -348,7 +372,7 @@ class DataHandlerLP(DataHandler):
|
||||
"""
|
||||
# data for inference
|
||||
_infer_df = self._data
|
||||
if len(self.infer_processors) > 0: # avoid modifying the original data
|
||||
if len(self.infer_processors) > 0 and not self.drop_raw: # avoid modifying the original data
|
||||
_infer_df = _infer_df.copy()
|
||||
|
||||
for proc in self.infer_processors:
|
||||
@@ -378,6 +402,9 @@ class DataHandlerLP(DataHandler):
|
||||
_learn_df = proc(_learn_df)
|
||||
self._learn = _learn_df
|
||||
|
||||
if self.drop_raw:
|
||||
del self._data
|
||||
|
||||
# init type
|
||||
IT_FIT_SEQ = "fit_seq" # the input of `fit` will be the output of the previous processor
|
||||
IT_FIT_IND = "fit_ind" # the input of `fit` will be the original df
|
||||
@@ -416,6 +443,10 @@ class DataHandlerLP(DataHandler):
|
||||
# TODO: Be able to cache handler data. Save the memory for data processing
|
||||
|
||||
def _get_df_by_key(self, data_key: str = DK_I) -> pd.DataFrame:
|
||||
if data_key == self.DK_R and self.drop_raw:
|
||||
raise AttributeError(
|
||||
"DataHandlerLP has not attribute _data, please set drop_raw = False if you want to use raw data"
|
||||
)
|
||||
df = getattr(self, {self.DK_R: "_data", self.DK_I: "_infer", self.DK_L: "_learn"}[data_key])
|
||||
return df
|
||||
|
||||
|
||||
@@ -10,7 +10,9 @@ import pandas as pd
|
||||
from typing import Tuple, Union
|
||||
|
||||
from qlib.data import D
|
||||
from qlib.utils import load_dataset
|
||||
from qlib.data import filter as filter_module
|
||||
from qlib.data.filter import BaseDFilter
|
||||
from qlib.utils import load_dataset, init_instance_by_config
|
||||
|
||||
|
||||
class DataLoader(abc.ABC):
|
||||
@@ -76,6 +78,7 @@ class DLWParser(DataLoader):
|
||||
<config> := <fields_info>
|
||||
|
||||
<fields_info> := ["expr", ...] | (["expr", ...], ["col_name", ...])
|
||||
# NOTE: list or tuple will be treated as the things when parsing
|
||||
"""
|
||||
self.is_group = isinstance(config, dict)
|
||||
|
||||
@@ -85,9 +88,15 @@ class DLWParser(DataLoader):
|
||||
self.fields = self._parse_fields_info(config)
|
||||
|
||||
def _parse_fields_info(self, fields_info: Tuple[list, tuple]) -> Tuple[list, list]:
|
||||
if isinstance(fields_info, list):
|
||||
if len(fields_info) == 0:
|
||||
raise ValueError("The size of fields must be greater than 0")
|
||||
|
||||
if not isinstance(fields_info, (list, tuple)):
|
||||
raise TypeError("Unsupported type")
|
||||
|
||||
if isinstance(fields_info[0], str):
|
||||
exprs = names = fields_info
|
||||
elif isinstance(fields_info, tuple):
|
||||
elif isinstance(fields_info[0], (list, tuple)):
|
||||
exprs, names = fields_info
|
||||
else:
|
||||
raise NotImplementedError(f"This type of input is not supported")
|
||||
@@ -132,7 +141,7 @@ class DLWParser(DataLoader):
|
||||
class QlibDataLoader(DLWParser):
|
||||
"""Same as QlibDataLoader. The fields can be define by config"""
|
||||
|
||||
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None):
|
||||
def __init__(self, config: Tuple[list, tuple, dict], filter_pipe=None, swap_level=True, freq="day"):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
@@ -140,8 +149,19 @@ class QlibDataLoader(DLWParser):
|
||||
Please refer to the doc of DLWParser
|
||||
filter_pipe :
|
||||
Filter pipe for the instruments
|
||||
swap_level :
|
||||
Whether to swap level of MultiIndex
|
||||
"""
|
||||
if filter_pipe is not None:
|
||||
assert isinstance(filter_pipe, list), "The type of `filter_pipe` must be list."
|
||||
filter_pipe = [
|
||||
init_instance_by_config(fp, None if "module_path" in fp else filter_module, accept_types=BaseDFilter)
|
||||
for fp in filter_pipe
|
||||
]
|
||||
|
||||
self.filter_pipe = filter_pipe
|
||||
self.swap_level = swap_level
|
||||
self.freq = freq
|
||||
super().__init__(config)
|
||||
|
||||
def load_group_df(self, instruments, exprs: list, names: list, start_time=None, end_time=None) -> pd.DataFrame:
|
||||
@@ -153,9 +173,10 @@ class QlibDataLoader(DLWParser):
|
||||
elif self.filter_pipe is not None:
|
||||
warnings.warn("`filter_pipe` is not None, but it will not be used with `instruments` as list")
|
||||
|
||||
df = D.features(instruments, exprs, start_time, end_time)
|
||||
df = D.features(instruments, exprs, start_time, end_time, self.freq)
|
||||
df.columns = names
|
||||
df = df.swaplevel().sort_index() # NOTE: always return <datetime, instrument>
|
||||
if self.swap_level:
|
||||
df = df.swaplevel().sort_index() # NOTE: if swaplevel, return <datetime, instrument>
|
||||
return df
|
||||
|
||||
|
||||
|
||||
288
qlib/data/ops.py
288
qlib/data/ops.py
@@ -6,6 +6,7 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import abc
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
@@ -17,57 +18,12 @@ from ..log import get_module_logger
|
||||
try:
|
||||
from ._libs.rolling import rolling_slope, rolling_rsquare, rolling_resi
|
||||
from ._libs.expanding import expanding_slope, expanding_rsquare, expanding_resi
|
||||
except ImportError as err:
|
||||
print("Do not import qlib package in the repository directory!")
|
||||
except ImportError:
|
||||
print(
|
||||
"#### Do not import qlib package in the repository directory in case of importing qlib from . without compiling #####"
|
||||
)
|
||||
raise
|
||||
|
||||
__all__ = (
|
||||
"Ref",
|
||||
"Max",
|
||||
"Min",
|
||||
"Sum",
|
||||
"Mean",
|
||||
"Std",
|
||||
"Var",
|
||||
"Skew",
|
||||
"Kurt",
|
||||
"Med",
|
||||
"Mad",
|
||||
"Slope",
|
||||
"Rsquare",
|
||||
"Resi",
|
||||
"Rank",
|
||||
"Quantile",
|
||||
"Count",
|
||||
"EMA",
|
||||
"WMA",
|
||||
"Corr",
|
||||
"Cov",
|
||||
"Delta",
|
||||
"Abs",
|
||||
"Sign",
|
||||
"Log",
|
||||
"Power",
|
||||
"Add",
|
||||
"Sub",
|
||||
"Mul",
|
||||
"Div",
|
||||
"Greater",
|
||||
"Less",
|
||||
"And",
|
||||
"Or",
|
||||
"Not",
|
||||
"Gt",
|
||||
"Ge",
|
||||
"Lt",
|
||||
"Le",
|
||||
"Eq",
|
||||
"Ne",
|
||||
"Mask",
|
||||
"IdxMax",
|
||||
"IdxMin",
|
||||
"If",
|
||||
)
|
||||
|
||||
np.seterr(invalid="ignore")
|
||||
|
||||
@@ -77,12 +33,39 @@ np.seterr(invalid="ignore")
|
||||
class ElemOperator(ExpressionOps):
|
||||
"""Element-wise Operator
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature : Expression
|
||||
feature instance
|
||||
|
||||
Returns
|
||||
----------
|
||||
Expression
|
||||
feature operation output
|
||||
"""
|
||||
|
||||
def __init__(self, feature):
|
||||
self.feature = feature
|
||||
|
||||
def __str__(self):
|
||||
return "{}({})".format(type(self).__name__, self.feature)
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
return self.feature.get_longest_back_rolling()
|
||||
|
||||
def get_extended_window_size(self):
|
||||
return self.feature.get_extended_window_size()
|
||||
|
||||
|
||||
class NpElemOperator(ElemOperator):
|
||||
"""Numpy Element-wise Operator
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature : Expression
|
||||
feature instance
|
||||
func : str
|
||||
feature operation method
|
||||
numpy feature operation method
|
||||
|
||||
Returns
|
||||
----------
|
||||
@@ -93,22 +76,14 @@ class ElemOperator(ExpressionOps):
|
||||
def __init__(self, feature, func):
|
||||
self.feature = feature
|
||||
self.func = func
|
||||
|
||||
def __str__(self):
|
||||
return "{}({})".format(type(self).__name__, self.feature)
|
||||
super(NpElemOperator, self).__init__(feature)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
return getattr(np, self.func)(series)
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
return self.feature.get_longest_back_rolling()
|
||||
|
||||
def get_extended_window_size(self):
|
||||
return self.feature.get_extended_window_size()
|
||||
|
||||
|
||||
class Abs(ElemOperator):
|
||||
class Abs(NpElemOperator):
|
||||
"""Feature Absolute Value
|
||||
|
||||
Parameters
|
||||
@@ -126,7 +101,7 @@ class Abs(ElemOperator):
|
||||
super(Abs, self).__init__(feature, "abs")
|
||||
|
||||
|
||||
class Sign(ElemOperator):
|
||||
class Sign(NpElemOperator):
|
||||
"""Feature Sign
|
||||
|
||||
Parameters
|
||||
@@ -143,8 +118,17 @@ class Sign(ElemOperator):
|
||||
def __init__(self, feature):
|
||||
super(Sign, self).__init__(feature, "sign")
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
"""
|
||||
To avoid error raised by bool type input, we transform the data into float32.
|
||||
"""
|
||||
series = self.feature.load(instrument, start_index, end_index, freq)
|
||||
# TODO: More precision types should be configurable
|
||||
series = series.astype(np.float32)
|
||||
return getattr(np, self.func)(series)
|
||||
|
||||
class Log(ElemOperator):
|
||||
|
||||
class Log(NpElemOperator):
|
||||
"""Feature Log
|
||||
|
||||
Parameters
|
||||
@@ -162,7 +146,7 @@ class Log(ElemOperator):
|
||||
super(Log, self).__init__(feature, "log")
|
||||
|
||||
|
||||
class Power(ElemOperator):
|
||||
class Power(NpElemOperator):
|
||||
"""Feature Power
|
||||
|
||||
Parameters
|
||||
@@ -188,7 +172,7 @@ class Power(ElemOperator):
|
||||
return getattr(np, self.func)(series, self.exponent)
|
||||
|
||||
|
||||
class Mask(ElemOperator):
|
||||
class Mask(NpElemOperator):
|
||||
"""Feature Mask
|
||||
|
||||
Parameters
|
||||
@@ -215,7 +199,7 @@ class Mask(ElemOperator):
|
||||
return self.feature.load(self.instrument, start_index, end_index, freq)
|
||||
|
||||
|
||||
class Not(ElemOperator):
|
||||
class Not(NpElemOperator):
|
||||
"""Not Operator
|
||||
|
||||
Parameters
|
||||
@@ -254,28 +238,13 @@ class PairOperator(ExpressionOps):
|
||||
two features' operation output
|
||||
"""
|
||||
|
||||
def __init__(self, feature_left, feature_right, func):
|
||||
def __init__(self, feature_left, feature_right):
|
||||
self.feature_left = feature_left
|
||||
self.feature_right = feature_right
|
||||
self.func = func
|
||||
|
||||
def __str__(self):
|
||||
return "{}({},{})".format(type(self).__name__, self.feature_left, self.feature_right)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
assert any(
|
||||
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
|
||||
), "at least one of two inputs is Expression instance"
|
||||
if isinstance(self.feature_left, Expression):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
else:
|
||||
series_left = self.feature_left # numeric value
|
||||
if isinstance(self.feature_right, Expression):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
else:
|
||||
series_right = self.feature_right
|
||||
return getattr(np, self.func)(series_left, series_right)
|
||||
|
||||
def get_longest_back_rolling(self):
|
||||
if isinstance(self.feature_left, Expression):
|
||||
left_br = self.feature_left.get_longest_back_rolling()
|
||||
@@ -301,7 +270,46 @@ class PairOperator(ExpressionOps):
|
||||
return max(ll, rl), max(lr, rr)
|
||||
|
||||
|
||||
class Add(PairOperator):
|
||||
class NpPairOperator(PairOperator):
|
||||
"""Numpy Pair-wise operator
|
||||
|
||||
Parameters
|
||||
----------
|
||||
feature_left : Expression
|
||||
feature instance or numeric value
|
||||
feature_right : Expression
|
||||
feature instance or numeric value
|
||||
func : str
|
||||
operator function
|
||||
|
||||
Returns
|
||||
----------
|
||||
Feature:
|
||||
two features' operation output
|
||||
"""
|
||||
|
||||
def __init__(self, feature_left, feature_right, func):
|
||||
self.feature_left = feature_left
|
||||
self.feature_right = feature_right
|
||||
self.func = func
|
||||
super(NpPairOperator, self).__init__(feature_left, feature_right)
|
||||
|
||||
def _load_internal(self, instrument, start_index, end_index, freq):
|
||||
assert any(
|
||||
[isinstance(self.feature_left, Expression), self.feature_right, Expression]
|
||||
), "at least one of two inputs is Expression instance"
|
||||
if isinstance(self.feature_left, Expression):
|
||||
series_left = self.feature_left.load(instrument, start_index, end_index, freq)
|
||||
else:
|
||||
series_left = self.feature_left # numeric value
|
||||
if isinstance(self.feature_right, Expression):
|
||||
series_right = self.feature_right.load(instrument, start_index, end_index, freq)
|
||||
else:
|
||||
series_right = self.feature_right
|
||||
return getattr(np, self.func)(series_left, series_right)
|
||||
|
||||
|
||||
class Add(NpPairOperator):
|
||||
"""Add Operator
|
||||
|
||||
Parameters
|
||||
@@ -321,7 +329,7 @@ class Add(PairOperator):
|
||||
super(Add, self).__init__(feature_left, feature_right, "add")
|
||||
|
||||
|
||||
class Sub(PairOperator):
|
||||
class Sub(NpPairOperator):
|
||||
"""Subtract Operator
|
||||
|
||||
Parameters
|
||||
@@ -341,7 +349,7 @@ class Sub(PairOperator):
|
||||
super(Sub, self).__init__(feature_left, feature_right, "subtract")
|
||||
|
||||
|
||||
class Mul(PairOperator):
|
||||
class Mul(NpPairOperator):
|
||||
"""Multiply Operator
|
||||
|
||||
Parameters
|
||||
@@ -361,7 +369,7 @@ class Mul(PairOperator):
|
||||
super(Mul, self).__init__(feature_left, feature_right, "multiply")
|
||||
|
||||
|
||||
class Div(PairOperator):
|
||||
class Div(NpPairOperator):
|
||||
"""Division Operator
|
||||
|
||||
Parameters
|
||||
@@ -381,7 +389,7 @@ class Div(PairOperator):
|
||||
super(Div, self).__init__(feature_left, feature_right, "divide")
|
||||
|
||||
|
||||
class Greater(PairOperator):
|
||||
class Greater(NpPairOperator):
|
||||
"""Greater Operator
|
||||
|
||||
Parameters
|
||||
@@ -401,7 +409,7 @@ class Greater(PairOperator):
|
||||
super(Greater, self).__init__(feature_left, feature_right, "maximum")
|
||||
|
||||
|
||||
class Less(PairOperator):
|
||||
class Less(NpPairOperator):
|
||||
"""Less Operator
|
||||
|
||||
Parameters
|
||||
@@ -421,7 +429,7 @@ class Less(PairOperator):
|
||||
super(Less, self).__init__(feature_left, feature_right, "minimum")
|
||||
|
||||
|
||||
class Gt(PairOperator):
|
||||
class Gt(NpPairOperator):
|
||||
"""Greater Than Operator
|
||||
|
||||
Parameters
|
||||
@@ -441,7 +449,7 @@ class Gt(PairOperator):
|
||||
super(Gt, self).__init__(feature_left, feature_right, "greater")
|
||||
|
||||
|
||||
class Ge(PairOperator):
|
||||
class Ge(NpPairOperator):
|
||||
"""Greater Equal Than Operator
|
||||
|
||||
Parameters
|
||||
@@ -461,7 +469,7 @@ class Ge(PairOperator):
|
||||
super(Ge, self).__init__(feature_left, feature_right, "greater_equal")
|
||||
|
||||
|
||||
class Lt(PairOperator):
|
||||
class Lt(NpPairOperator):
|
||||
"""Less Than Operator
|
||||
|
||||
Parameters
|
||||
@@ -481,7 +489,7 @@ class Lt(PairOperator):
|
||||
super(Lt, self).__init__(feature_left, feature_right, "less")
|
||||
|
||||
|
||||
class Le(PairOperator):
|
||||
class Le(NpPairOperator):
|
||||
"""Less Equal Than Operator
|
||||
|
||||
Parameters
|
||||
@@ -501,7 +509,7 @@ class Le(PairOperator):
|
||||
super(Le, self).__init__(feature_left, feature_right, "less_equal")
|
||||
|
||||
|
||||
class Eq(PairOperator):
|
||||
class Eq(NpPairOperator):
|
||||
"""Equal Operator
|
||||
|
||||
Parameters
|
||||
@@ -521,7 +529,7 @@ class Eq(PairOperator):
|
||||
super(Eq, self).__init__(feature_left, feature_right, "equal")
|
||||
|
||||
|
||||
class Ne(PairOperator):
|
||||
class Ne(NpPairOperator):
|
||||
"""Not Equal Operator
|
||||
|
||||
Parameters
|
||||
@@ -541,7 +549,7 @@ class Ne(PairOperator):
|
||||
super(Ne, self).__init__(feature_left, feature_right, "not_equal")
|
||||
|
||||
|
||||
class And(PairOperator):
|
||||
class And(NpPairOperator):
|
||||
"""And Operator
|
||||
|
||||
Parameters
|
||||
@@ -561,7 +569,7 @@ class And(PairOperator):
|
||||
super(And, self).__init__(feature_left, feature_right, "bitwise_and")
|
||||
|
||||
|
||||
class Or(PairOperator):
|
||||
class Or(NpPairOperator):
|
||||
"""Or Operator
|
||||
|
||||
Parameters
|
||||
@@ -1430,3 +1438,93 @@ class Cov(PairRolling):
|
||||
|
||||
def __init__(self, feature_left, feature_right, N):
|
||||
super(Cov, self).__init__(feature_left, feature_right, N, "cov")
|
||||
|
||||
|
||||
# Operator classes that `register_all_ops` registers on the global `Operators`
# wrapper before any user-supplied custom operators are added.
OpsList = [
    Ref,
    Max,
    Min,
    Sum,
    Mean,
    Std,
    Var,
    Skew,
    Kurt,
    Med,
    Mad,
    Slope,
    Rsquare,
    Resi,
    Rank,
    Quantile,
    Count,
    EMA,
    WMA,
    Corr,
    Cov,
    Delta,
    Abs,
    Sign,
    Log,
    Power,
    Add,
    Sub,
    Mul,
    Div,
    Greater,
    Less,
    And,
    Or,
    Not,
    Gt,
    Ge,
    Lt,
    Le,
    Eq,
    Ne,
    Mask,
    IdxMax,
    IdxMin,
    If,
]
|
||||
|
||||
|
||||
class OpsWrapper(object):
    """Registry that exposes registered operator classes as attributes.

    Operators are stored by class name, so ``wrapper.Foo`` returns the class
    registered under the name ``"Foo"``.
    """

    def __init__(self):
        self._ops = {}

    def reset(self):
        """Forget every registered operator."""
        self._ops = {}

    def register(self, ops_list):
        """Register each operator class in ``ops_list`` under its class name.

        Parameters
        ----------
        ops_list : list
            operator classes; each must be a subclass of ``ExpressionOps``.

        Raises
        ------
        TypeError
            if an entry is not a subclass of ``ExpressionOps``.
        """
        for operator in ops_list:
            if not issubclass(operator, ExpressionOps):
                raise TypeError("operator must be subclass of ExpressionOps, not {}".format(operator))

            if operator.__name__ in self._ops:
                # A later registration wins; warn so the override is visible.
                get_module_logger(self.__class__.__name__).warning(
                    "The custom operator [{}] will override the qlib default definition".format(operator.__name__)
                )
            self._ops[operator.__name__] = operator

    def __getattr__(self, key):
        """Return the operator registered as ``key``.

        Raises
        ------
        AttributeError
            if no operator with that name is registered.
        """
        # Read `_ops` through the instance dict: `__getattr__` is only invoked
        # when normal lookup fails, so touching `self._ops` on an instance that
        # was built without `__init__` (e.g. while `copy`/`pickle` reconstruct
        # it) would call `__getattr__` again and recurse without end.
        ops = self.__dict__.get("_ops")
        if ops is None or key not in ops:
            raise AttributeError("The operator [{0}] is not registered".format(key))
        return ops[key]


Operators = OpsWrapper()
|
||||
|
||||
|
||||
def register_all_ops(C):
    """Reset the global operator registry and fill it again.

    The built-in ``OpsList`` is registered first; if ``C`` carries a
    non-``None`` ``custom_ops`` attribute, those operators are registered
    afterwards.

    Parameters
    ----------
    C :
        configuration object, optionally exposing ``custom_ops``.
    """
    ops_logger = get_module_logger("ops")

    Operators.reset()
    Operators.register(OpsList)

    if getattr(C, "custom_ops", None) is None:
        return

    Operators.register(C.custom_ops)
    ops_logger.debug("register custom operator {}".format(C.custom_ops))
|
||||
|
||||
@@ -36,7 +36,7 @@ def get_module_logger(module_name, level=None):
|
||||
return module_logger
|
||||
|
||||
|
||||
class TimeInspector(object):
|
||||
class TimeInspector:
|
||||
|
||||
timer_logger = get_module_logger("timer", level=logging.WARNING)
|
||||
|
||||
|
||||
@@ -30,11 +30,6 @@ class Model(BaseModel):
|
||||
The attribute names of learned model should `not` start with '_'. So that the model could be
|
||||
dumped to disk.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : Dataset
|
||||
dataset will generate the processed data from model training.
|
||||
|
||||
The following code example shows how to retrieve `x_train`, `y_train` and `w_train` from the `dataset`:
|
||||
|
||||
.. code-block:: Python
|
||||
@@ -53,6 +48,12 @@ class Model(BaseModel):
|
||||
except KeyError as e:
|
||||
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)
|
||||
w_valid = pd.DataFrame(np.ones_like(y_valid.values), index=y_valid.index)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : Dataset
|
||||
dataset will generate the processed data from model training.
|
||||
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ import scipy.optimize as so
|
||||
from typing import Optional, Union, Callable, List
|
||||
|
||||
|
||||
class PortfolioOptimizer(object):
|
||||
class PortfolioOptimizer:
|
||||
"""Portfolio Optimizer
|
||||
|
||||
The following optimization algorithms are supported:
|
||||
|
||||
@@ -7,6 +7,9 @@ from ..config import REG_CN
|
||||
|
||||
|
||||
class TestAutoData(unittest.TestCase):
|
||||
|
||||
_setup_kwargs = {}
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls) -> None:
|
||||
# use default data
|
||||
@@ -15,6 +18,10 @@ class TestAutoData(unittest.TestCase):
|
||||
print(f"Qlib data is not found in {provider_uri}")
|
||||
|
||||
GetData().qlib_data(
|
||||
name="qlib_data_simple", region="cn", version="latest", interval="1d", target_dir=provider_uri
|
||||
name="qlib_data_simple",
|
||||
region="cn",
|
||||
interval="1d",
|
||||
target_dir=provider_uri,
|
||||
delete_old=False,
|
||||
)
|
||||
init(provider_uri=provider_uri, region=REG_CN)
|
||||
init(provider_uri=provider_uri, region=REG_CN, **cls._setup_kwargs)
|
||||
|
||||
@@ -1,14 +1,21 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import qlib
|
||||
import shutil
|
||||
import zipfile
|
||||
import requests
|
||||
import datetime
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class GetData:
|
||||
DATASET_VERSION = "v1"
|
||||
REMOTE_URL = "http://fintech.msra.cn/stock_data/downloads"
|
||||
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
|
||||
|
||||
def __init__(self, delete_zip_file=False):
|
||||
"""
|
||||
@@ -20,41 +27,92 @@ class GetData:
|
||||
"""
|
||||
self.delete_zip_file = delete_zip_file
|
||||
|
||||
def _download_data(self, file_name: str, target_dir: [Path, str]):
|
||||
def normalize_dataset_version(self, dataset_version: str = None):
    """Return ``dataset_version``, or ``self.DATASET_VERSION`` when it is ``None``.

    Parameters
    ----------
    dataset_version : str
        requested dataset version, by default None
    """
    return self.DATASET_VERSION if dataset_version is None else dataset_version
|
||||
|
||||
def merge_remote_url(self, file_name: str, dataset_version: str = None):
    """Build the download URL: ``<REMOTE_URL>/<dataset version>/<file_name>``.

    Parameters
    ----------
    file_name : str
        name of the remote file
    dataset_version : str
        dataset version, resolved through ``normalize_dataset_version``
    """
    base_url = self.REMOTE_URL
    version = self.normalize_dataset_version(dataset_version)
    return f"{base_url}/{version}/{file_name}"
|
||||
|
||||
def _download_data(
|
||||
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
|
||||
):
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
target_dir.mkdir(exist_ok=True, parents=True)
|
||||
# saved file name
|
||||
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
|
||||
target_path = target_dir.joinpath(_target_file_name)
|
||||
|
||||
url = f"{self.REMOTE_URL}/{file_name}"
|
||||
target_path = target_dir.joinpath(file_name)
|
||||
|
||||
url = self.merge_remote_url(file_name, dataset_version)
|
||||
resp = requests.get(url, stream=True)
|
||||
if resp.status_code != 200:
|
||||
raise requests.exceptions.HTTPError()
|
||||
|
||||
chuck_size = 1024
|
||||
chunk_size = 1024
|
||||
logger.warning(
|
||||
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
|
||||
)
|
||||
logger.info(f"{file_name} downloading......")
|
||||
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
|
||||
with target_path.open("wb") as fp:
|
||||
for chuck in resp.iter_content(chunk_size=chuck_size):
|
||||
fp.write(chuck)
|
||||
p_bar.update(chuck_size)
|
||||
for chunk in resp.iter_content(chunk_size=chunk_size):
|
||||
fp.write(chunk)
|
||||
p_bar.update(chunk_size)
|
||||
|
||||
self._unzip(target_path, target_dir)
|
||||
self._unzip(target_path, target_dir, delete_old)
|
||||
if self.delete_zip_file:
|
||||
target_path.unlike()
|
||||
target_path.unlink()
|
||||
|
||||
def check_dataset(self, file_name: str, dataset_version: str = None):
    """Report whether the remote dataset file appears to exist.

    Parameters
    ----------
    file_name : str
        name of the remote file
    dataset_version : str
        dataset version, by default None

    Returns
    -------
    bool
        False when the server answers 404, True for any other status code.
    """
    url = self.merge_remote_url(file_name, dataset_version)
    # `stream=True` avoids downloading the archive just to read the status,
    # but the connection stays open until the response is closed, so release
    # it explicitly via the context manager.
    with requests.get(url, stream=True) as resp:
        return resp.status_code != 404
|
||||
|
||||
@staticmethod
|
||||
def _unzip(file_path: Path, target_dir: Path):
|
||||
def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True):
|
||||
if delete_old:
|
||||
logger.warning(
|
||||
f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}"
|
||||
)
|
||||
GetData._delete_qlib_data(target_dir)
|
||||
logger.info(f"{file_path} unzipping......")
|
||||
with zipfile.ZipFile(str(file_path.resolve()), "r") as zp:
|
||||
for _file in tqdm(zp.namelist()):
|
||||
zp.extract(_file, str(target_dir.resolve()))
|
||||
|
||||
@staticmethod
def _delete_qlib_data(file_dir: Path):
    """Interactively delete the qlib data sub-directories found under ``file_dir``.

    Only the sub-directories ``features``, ``calendars``, ``instruments``,
    ``features_cache`` and ``dataset_cache`` are considered. If any exist the
    user is asked to confirm; any answer other than ``Y``/``y`` terminates the
    process.

    Parameters
    ----------
    file_dir : Path
        directory that holds the qlib data
    """
    # Local import: `exit()` is injected by the `site` module for interactive
    # use and is missing under `python -S` or frozen builds; `sys.exit()`
    # raises the same SystemExit and is always available.
    import sys

    logger.info(f"delete {file_dir}")
    rm_dirs = []
    for _name in ["features", "calendars", "instruments", "features_cache", "dataset_cache"]:
        _p = file_dir.joinpath(_name)
        if _p.exists():
            rm_dirs.append(str(_p.resolve()))
    if rm_dirs:
        flag = input(
            f"Will be deleted: "
            f"\n\t{rm_dirs}"
            f"\nIf you do not need to delete {file_dir}, please change the <--target_dir>"
            f"\nAre you sure you want to delete, yes(Y/y), no (N/n):"
        )
        if str(flag) not in ["Y", "y"]:
            sys.exit()
        for _p in rm_dirs:
            logger.warning(f"delete: {_p}")
            shutil.rmtree(_p)
|
||||
|
||||
def qlib_data(
|
||||
self, name="qlib_data", target_dir="~/.qlib/qlib_data/cn_data", version="latest", interval="1d", region="cn"
|
||||
self,
|
||||
name="qlib_data",
|
||||
target_dir="~/.qlib/qlib_data/cn_data",
|
||||
version=None,
|
||||
interval="1d",
|
||||
region="cn",
|
||||
delete_old=True,
|
||||
):
|
||||
"""download cn qlib data from remote
|
||||
|
||||
@@ -65,20 +123,31 @@ class GetData:
|
||||
name: str
|
||||
dataset name, value from [qlib_data, qlib_data_simple], by default qlib_data
|
||||
version: str
|
||||
data version, value from [v0, v1, ..., latest], by default latest
|
||||
data version, value from [v1, ...], by default None(use script to specify version)
|
||||
interval: str
|
||||
data freq, value from [1d], by default 1d
|
||||
region: str
|
||||
data region, value from [cn, us], by default cn
|
||||
delete_old: bool
|
||||
delete an existing directory, by default True
|
||||
|
||||
Examples
|
||||
---------
|
||||
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --version latest --interval 1d --region cn
|
||||
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
-------
|
||||
|
||||
"""
|
||||
file_name = f"{name}_{region.lower()}_{interval.lower()}_{version}.zip"
|
||||
self._download_data(file_name.lower(), target_dir)
|
||||
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
|
||||
|
||||
def _get_file_name(v):
|
||||
return self.QLIB_DATA_NAME.format(
|
||||
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
|
||||
)
|
||||
|
||||
file_name = _get_file_name(qlib_version)
|
||||
if not self.check_dataset(file_name, version):
|
||||
file_name = _get_file_name("latest")
|
||||
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
|
||||
|
||||
def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
|
||||
"""download cn csv data from remote
|
||||
|
||||
@@ -27,7 +27,7 @@ from pathlib import Path
|
||||
from typing import Union, Tuple
|
||||
|
||||
from ..config import C
|
||||
from ..log import get_module_logger
|
||||
from ..log import get_module_logger, set_log_with_config
|
||||
|
||||
log = get_module_logger("utils")
|
||||
|
||||
@@ -162,7 +162,7 @@ def parse_field(field):
|
||||
# - $open+$close -> Feature("open")+Feature("close")
|
||||
if not isinstance(field, str):
|
||||
field = str(field)
|
||||
return re.sub(r"\$(\w+)", r'Feature("\1")', field)
|
||||
return re.sub(r"\$(\w+)", r'Feature("\1")', re.sub(r"(\w+\s*)\(", r"Operators.\1(", field))
|
||||
|
||||
|
||||
def get_module_by_module_path(module_path):
|
||||
@@ -279,8 +279,10 @@ def compare_dict_value(src_data: dict, dst_data: dict):
|
||||
def create_save_path(save_path=None):
|
||||
"""Create save path
|
||||
|
||||
:param save_path:
|
||||
:return:
|
||||
Parameters
|
||||
----------
|
||||
save_path: str
|
||||
|
||||
"""
|
||||
if save_path:
|
||||
if not os.path.exists(save_path):
|
||||
@@ -471,30 +473,28 @@ def is_tradable_date(cur_date):
|
||||
return str(cur_date.date()) == str(D.calendar(start_time=cur_date, future=True)[0].date())
|
||||
|
||||
|
||||
def get_date_range(trading_date, shift, future=False):
|
||||
def get_date_range(trading_date, left_shift=0, right_shift=0, future=False):
|
||||
"""get trading date range by shift
|
||||
|
||||
:param trading_date:
|
||||
:param shift: int
|
||||
:param future: bool
|
||||
:return:
|
||||
Parameters
|
||||
----------
|
||||
trading_date: pd.Timestamp
|
||||
left_shift: int
|
||||
right_shift: int
|
||||
future: bool
|
||||
|
||||
"""
|
||||
|
||||
from ..data import D
|
||||
|
||||
calendar = D.calendar(future=future)
|
||||
if pd.to_datetime(trading_date) not in list(calendar):
|
||||
raise ValueError("{} is not trading day!".format(str(trading_date)))
|
||||
day_index = bisect.bisect_left(calendar, trading_date)
|
||||
if 0 <= (day_index + shift) < len(calendar):
|
||||
if shift > 0:
|
||||
return calendar[day_index + 1 : day_index + 1 + shift]
|
||||
else:
|
||||
return calendar[day_index + shift : day_index]
|
||||
else:
|
||||
return calendar
|
||||
start = get_date_by_shift(trading_date, left_shift, future=future)
|
||||
end = get_date_by_shift(trading_date, right_shift, future=future)
|
||||
|
||||
calendar = D.calendar(start, end, future=future)
|
||||
return calendar
|
||||
|
||||
|
||||
def get_date_by_shift(trading_date, shift, future=False):
|
||||
def get_date_by_shift(trading_date, shift, future=False, clip_shift=True):
|
||||
"""get trading date with shift bias wil cur_date
|
||||
e.g. : shift == 1, return next trading date
|
||||
shift == -1, return previous trading date
|
||||
@@ -502,8 +502,22 @@ def get_date_by_shift(trading_date, shift, future=False):
|
||||
trading_date : pandas.Timestamp
|
||||
current date
|
||||
shift : int
|
||||
clip_shift: bool
|
||||
|
||||
"""
|
||||
return get_date_range(trading_date, shift, future)[0 if shift < 0 else -1] if shift != 0 else trading_date
|
||||
from qlib.data import D
|
||||
|
||||
cal = D.calendar(future=future)
|
||||
if pd.to_datetime(trading_date) not in list(cal):
|
||||
raise ValueError("{} is not trading day!".format(str(trading_date)))
|
||||
_index = bisect.bisect_left(cal, trading_date)
|
||||
shift_index = _index + shift
|
||||
if shift_index < 0 or shift_index >= len(cal):
|
||||
if clip_shift:
|
||||
shift_index = np.clip(shift_index, 0, len(cal) - 1)
|
||||
else:
|
||||
raise IndexError(f"The shift_index({shift_index}) of the trading day ({trading_date}) is out of range")
|
||||
return cal[shift_index]
|
||||
|
||||
|
||||
def get_next_trading_date(trading_date, future=False):
|
||||
@@ -629,15 +643,28 @@ def exists_qlib_data(qlib_dir):
|
||||
# check instruments
|
||||
code_names = set(map(lambda x: x.name.lower(), features_dir.iterdir()))
|
||||
_instrument = instruments_dir.joinpath("all.txt")
|
||||
df = pd.read_csv(_instrument, sep="\t", names=["inst", "start_datetime", "end_datetime", "save_inst"])
|
||||
df = df.iloc[:, [0, -1]].fillna(axis=1, method="ffill")
|
||||
miss_code = set(df.iloc[:, -1].apply(str.lower)) - set(code_names)
|
||||
miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names)
|
||||
if miss_code and any(map(lambda x: "sht" not in x, miss_code)):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def check_qlib_data(qlib_config):
    """Verify that every instruments file of the provider has exactly 3 columns.

    Each ``*.txt`` file under ``<provider_uri>/instruments`` is read as a
    tab-separated, header-less table and its column count is checked.

    Parameters
    ----------
    qlib_config : dict
        configuration mapping; only ``qlib_config["provider_uri"]`` is read.

    Raises
    ------
    AssertionError
        if an instruments file does not have exactly 3 columns.
    """
    inst_dir = Path(qlib_config["provider_uri"]).joinpath("instruments")
    for _p in inst_dir.glob("*.txt"):
        # nrows=0 reads only enough of the file to determine the columns.
        n_columns = len(pd.read_csv(_p, sep="\t", nrows=0, header=None).columns)
        if n_columns != 3:
            # Raised explicitly (rather than via `assert`) so the check still
            # runs under `python -O`; the exception type is kept for callers.
            raise AssertionError(
                f"\nThe {str(_p.resolve())} of qlib data is not equal to 3 columns:"
                f"\n\tIf you are using the data provided by qlib: "
                f"https://qlib.readthedocs.io/en/latest/component/data.html#qlib-format-dataset"
                f"\n\tIf you are using your own data, please dump the data again: "
                f"https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format"
            )
|
||||
|
||||
|
||||
def lazy_sort_index(df: pd.DataFrame, axis=0) -> pd.DataFrame:
|
||||
"""
|
||||
make the df index sorted
|
||||
@@ -686,7 +713,7 @@ def flatten_dict(d, parent_key="", sep="."):
|
||||
|
||||
|
||||
#################### Wrapper #####################
|
||||
class Wrapper(object):
|
||||
class Wrapper:
|
||||
"""Wrapper class for anything that needs to set up during qlib.init"""
|
||||
|
||||
def __init__(self):
|
||||
@@ -728,3 +755,36 @@ def load_dataset(path_or_obj):
|
||||
elif extension == ".csv":
|
||||
return pd.read_csv(path_or_obj, parse_dates=True, index_col=[0, 1])
|
||||
raise ValueError(f"unsupported file type `{extension}`")
|
||||
|
||||
|
||||
def code_to_fname(code: str):
    """Map a stock code to a file name that is safe to create on Windows.

    Codes that match a Windows device name (case-insensitively) get the
    ``_qlib_`` prefix; every other code is returned unchanged.

    Parameters
    ----------
    code: str
    """
    # NOTE: On Windows these names denote I/O devices, so files carrying them cannot be created
    # reference: https://superuser.com/questions/86999/why-cant-i-name-a-folder-or-file-con-in-windows
    reserved = ["CON", "PRN", "AUX", "NUL"]
    for device in ("COM", "LPT"):
        reserved.extend(f"{device}{i}" for i in range(10))

    if str(code).upper() not in reserved:
        return code
    return "_qlib_" + str(code)
|
||||
|
||||
|
||||
def fname_to_code(fname: str):
    """Map a file name back to its stock code by dropping the ``_qlib_`` prefix.

    Names that do not start with the prefix are returned unchanged.

    Parameters
    ----------
    fname: str
    """
    prefix = "_qlib_"
    if fname.startswith(prefix):
        # Slice the prefix off. `str.lstrip(prefix)` would strip any leading
        # run of the characters "_", "q", "l", "i", "b" and so eat into codes
        # such as "lib" or "bil".
        fname = fname[len(prefix) :]
    return fname
|
||||
|
||||
@@ -27,11 +27,6 @@ class Serializable:
|
||||
def dump_all(self):
|
||||
"""
|
||||
will the object dump all object
|
||||
|
||||
Parameters
|
||||
----------
|
||||
self : [TODO:type]
|
||||
[TODO:description]
|
||||
"""
|
||||
return getattr(self, "_dump_all", False)
|
||||
|
||||
@@ -39,11 +34,6 @@ class Serializable:
|
||||
def exclude(self):
|
||||
"""
|
||||
What attribute will be dumped
|
||||
|
||||
Parameters
|
||||
----------
|
||||
self : [TODO:type]
|
||||
[TODO:description]
|
||||
"""
|
||||
return getattr(self, "_exclude", [])
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
from contextlib import contextmanager
|
||||
from .expm import MLflowExpManager
|
||||
from .exp import Experiment
|
||||
from .recorder import Recorder
|
||||
from ..utils import Wrapper
|
||||
|
||||
@@ -165,7 +166,7 @@ class QlibRecorder:
|
||||
"""
|
||||
return self.get_exp(experiment_id, experiment_name).list_recorders()
|
||||
|
||||
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True):
|
||||
def get_exp(self, experiment_id=None, experiment_name=None, create: bool = True) -> Experiment:
|
||||
"""
|
||||
Method for retrieving an experiment with given id or name. Once the `create` argument is set to
|
||||
True, if no valid experiment is found, this method will create one for you. Otherwise, it will
|
||||
@@ -461,5 +462,14 @@ class QlibRecorder:
|
||||
self.get_exp().get_recorder().set_tags(**kwargs)
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
if sys.version_info >= (3, 9):
|
||||
from typing import Annotated
|
||||
|
||||
QlibRecorderWrapper = Annotated[QlibRecorder, Wrapper]
|
||||
else:
|
||||
QlibRecorderWrapper = QlibRecorder
|
||||
|
||||
# global record
|
||||
R = Wrapper()
|
||||
R: QlibRecorderWrapper = Wrapper()
|
||||
|
||||
@@ -44,7 +44,7 @@ def sys_config(config, config_path):
|
||||
# workflow handler function
|
||||
def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
|
||||
with open(config_path) as fp:
|
||||
config = yaml.load(fp, Loader=yaml.Loader)
|
||||
config = yaml.load(fp, Loader=yaml.SafeLoader)
|
||||
|
||||
# config the `sys` section
|
||||
sys_config(config, config_path)
|
||||
|
||||
@@ -65,13 +65,13 @@ class Experiment:
|
||||
"""
|
||||
raise NotImplementedError(f"Please implement the `end` method.")
|
||||
|
||||
def create_recorder(self, name=None):
|
||||
def create_recorder(self, recorder_name=None):
|
||||
"""
|
||||
Create a recorder for each experiment.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name : str
|
||||
recorder_name : str
|
||||
the name of the recorder to be created.
|
||||
|
||||
Returns
|
||||
|
||||
@@ -5,10 +5,9 @@ import re
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
from ..contrib.evaluate import (
|
||||
backtest as normal_backtest,
|
||||
risk_analysis,
|
||||
)
|
||||
from ..contrib.evaluate import risk_analysis
|
||||
from ..contrib.backtest import backtest as normal_backtest
|
||||
|
||||
from ..data.dataset import DatasetH
|
||||
from ..data.dataset.handler import DataHandlerLP
|
||||
from ..utils import init_instance_by_config, get_module_by_module_path
|
||||
@@ -213,6 +212,11 @@ class SigAnaRecord(SignalRecord):
|
||||
class PortAnaRecord(SignalRecord):
|
||||
"""
|
||||
This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class.
|
||||
|
||||
The following files will be stored in recorder
|
||||
- report_normal.pkl & positions_normal.pkl:
|
||||
- The return report and detailed positions of the backtest, returned by `qlib/contrib/evaluate.py:backtest`
|
||||
- port_analysis.pkl : The risk analysis of your portfolio, returned by `qlib/contrib/evaluate.py:risk_analysis`
|
||||
"""
|
||||
|
||||
artifact_path = "portfolio_analysis"
|
||||
@@ -236,9 +240,14 @@ class PortAnaRecord(SignalRecord):
|
||||
|
||||
# custom strategy and get backtest
|
||||
pred_score = super().load()
|
||||
report_normal, positions_normal = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
|
||||
report_dict = normal_backtest(pred_score, strategy=self.strategy, **self.backtest_config)
|
||||
report_normal = report_dict.get("report_df")
|
||||
positions_normal = report_dict.get("positions")
|
||||
self.recorder.save_objects(**{"report_normal.pkl": report_normal}, artifact_path=PortAnaRecord.get_path())
|
||||
self.recorder.save_objects(**{"positions_normal.pkl": positions_normal}, artifact_path=PortAnaRecord.get_path())
|
||||
order_normal = report_dict.get("order_list")
|
||||
if order_normal:
|
||||
self.recorder.save_objects(**{"order_normal.pkl": order_normal}, artifact_path=PortAnaRecord.get_path())
|
||||
|
||||
# analysis
|
||||
analysis = dict()
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import mlflow
|
||||
import shutil, os, pickle, tempfile, codecs
|
||||
import shutil, os, pickle, tempfile, codecs, pickle
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from ..utils.objm import FileManager
|
||||
@@ -202,9 +202,6 @@ class MLflowRecorder(Recorder):
|
||||
super(MLflowRecorder, self).__init__(experiment_id, name)
|
||||
self._uri = uri
|
||||
self.artifact_uri = None
|
||||
# set up file manager for saving objects
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
self.fm = FileManager(Path(self.temp_dir).absolute())
|
||||
self.client = mlflow.tracking.MlflowClient(tracking_uri=self._uri)
|
||||
# construct from mlflow run
|
||||
if mlflow_run is not None:
|
||||
@@ -248,16 +245,18 @@ class MLflowRecorder(Recorder):
|
||||
self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
if self.status != Recorder.STATUS_S:
|
||||
self.status = status
|
||||
shutil.rmtree(self.temp_dir)
|
||||
|
||||
def save_objects(self, local_path=None, artifact_path=None, **kwargs):
|
||||
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."
|
||||
if local_path is not None:
|
||||
self.client.log_artifacts(self.id, local_path, artifact_path)
|
||||
else:
|
||||
temp_dir = Path(tempfile.mkdtemp()).resolve()
|
||||
for name, data in kwargs.items():
|
||||
self.fm.save_obj(data, name)
|
||||
self.client.log_artifact(self.id, self.fm.path / name, artifact_path)
|
||||
with (temp_dir / name).open("wb") as f:
|
||||
pickle.dump(data, f)
|
||||
self.client.log_artifact(self.id, temp_dir / name, artifact_path)
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
def load_object(self, name):
|
||||
assert self._uri is not None, "Please start the experiment and recorder first before using recorder directly."
|
||||
|
||||
@@ -43,7 +43,7 @@ python get_data.py qlib_data --help
|
||||
|
||||
### US data
|
||||
|
||||
> Need to download data first: [Downlaod US Data](#Downlaod-US-Data)
|
||||
> Need to download data first: [Download US Data](#Download-US-Data)
|
||||
|
||||
```python
|
||||
import qlib
|
||||
|
||||
@@ -1,28 +1,71 @@
|
||||
import sys, platform
|
||||
import sys
|
||||
import platform
|
||||
import qlib
|
||||
import fire
|
||||
import pkg_resources
|
||||
from pathlib import Path
|
||||
|
||||
QLIB_PATH = Path(__file__).absolute().resolve().parent.parent
|
||||
|
||||
|
||||
def linux_distribution():
|
||||
try:
|
||||
return platform.linux_distribution()
|
||||
except:
|
||||
return "N/A"
|
||||
class InfoCollector:
    """
    User could collect system info by following commands
    `cd scripts && python collect_info.py all`
    - NOTE: please avoid running this script in the project folder which contains `qlib`
    """

    def sys(self):
        """collect system related info"""
        for method in ["system", "machine", "platform", "version"]:
            print(getattr(platform, method)())

    def py(self):
        """collect Python related info"""
        print("Python version: {}".format(sys.version.replace("\n", " ")))

    def qlib(self):
        """collect qlib related info"""
        print("Qlib version: {}".format(qlib.__version__))
        REQUIRED = [
            "numpy",
            "pandas",
            "scipy",
            "requests",
            "sacred",
            "python-socketio",
            "redis",
            "python-redis-lock",
            "schedule",
            "cvxpy",
            "hyperopt",
            "fire",
            "statsmodels",
            "xlrd",
            "plotly",
            "matplotlib",
            "tables",
            "pyyaml",
            "mlflow",
            "tqdm",
            "loguru",
            "lightgbm",
            "tornado",
            "joblib",
            "ruamel.yaml",
        ]

        for package in REQUIRED:
            try:
                version = pkg_resources.get_distribution(package).version
            except pkg_resources.DistributionNotFound:
                # A missing dependency is itself useful information for a bug
                # report; report it and keep collecting the remaining packages.
                print(f"{package}: not installed")
                continue
            print(f"{package}=={version}")

    def all(self):
        """collect all info"""
        for method in ["sys", "py", "qlib"]:
            getattr(self, method)()
            print()
|
||||
|
||||
|
||||
print("Qlib version: {} \n".format(qlib.__version__))
|
||||
print(
|
||||
"""Python version: {} \n
|
||||
linux_distribution: {}
|
||||
system: {}
|
||||
machine: {}
|
||||
platform: {}
|
||||
version: {}
|
||||
""".format(
|
||||
sys.version.split("\n"),
|
||||
linux_distribution(),
|
||||
platform.system(),
|
||||
platform.machine(),
|
||||
platform.platform(),
|
||||
platform.version(),
|
||||
)
|
||||
)
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(InfoCollector)
|
||||
|
||||
@@ -5,6 +5,7 @@ import re
|
||||
import time
|
||||
import bisect
|
||||
import pickle
|
||||
import random
|
||||
import requests
|
||||
import functools
|
||||
from pathlib import Path
|
||||
@@ -17,6 +18,7 @@ from yahooquery import Ticker
|
||||
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
|
||||
|
||||
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
|
||||
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
|
||||
|
||||
CALENDAR_BENCH_URL_MAP = {
|
||||
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
|
||||
@@ -63,7 +65,29 @@ def get_calendar_list(bench_code="CSI300") -> list:
|
||||
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
|
||||
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
|
||||
else:
|
||||
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
|
||||
if bench_code.upper() == "ALL":
|
||||
|
||||
@deco_retry
def _get_calendar(month):
    """Fetch the calendar entries of one month from ``SZSE_CALENDAR_URL``.

    Parameters
    ----------
    month : str
        month substituted into the URL's ``month`` field (the caller visible
        here passes ``"%Y-%m"`` strings).

    Returns
    -------
    list
        ``pd.Timestamp`` of every ``data`` record whose ``jybz`` field is a
        non-zero integer (presumably the trading-day flag; confirm against the
        API), built from the record's ``jyrq`` field.

    Raises
    ------
    ValueError
        if the request or the parsing of its JSON payload fails.
    """
    _cal = []
    try:
        # Call random.random(): passing the bound method itself would format
        # its repr into the URL instead of a random number.
        resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
        for _r in resp["data"]:
            if int(_r["jybz"]):
                _cal.append(pd.Timestamp(_r["jyrq"]))
    except Exception as e:
        # Keep the original exception chained for debugging.
        raise ValueError(f"{month}-->{e}") from e
    return _cal
|
||||
|
||||
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
|
||||
calendar = []
|
||||
for _m in month_range:
|
||||
cal = _get_calendar(_m.strftime("%Y-%m"))
|
||||
if cal:
|
||||
calendar += cal
|
||||
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
|
||||
else:
|
||||
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
|
||||
_CALENDAR_MAP[bench_code] = calendar
|
||||
logger.info(f"end of get calendar list: {bench_code}.")
|
||||
return calendar
|
||||
|
||||
@@ -18,23 +18,81 @@ pip install -r requirements.txt
|
||||
|
||||
## Collector Data
|
||||
|
||||
### Download data and Normalize data
|
||||
```bash
|
||||
python collector.py collector_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
```
|
||||
|
||||
### Download Data
|
||||
### CN Data
|
||||
|
||||
#### 1d
|
||||
|
||||
```bash
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
|
||||
# download from yahoo finance
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
|
||||
# normalize
|
||||
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
|
||||
|
||||
# dump data
|
||||
cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
|
||||
|
||||
# using
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
|
||||
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1d", region="CN")
|
||||
df = D.features(D.instruments("all"), ["$close"], freq="day")
|
||||
|
||||
```
|
||||
|
||||
### Normalize Data
|
||||
#### 1min
|
||||
|
||||
```bash
|
||||
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN
|
||||
|
||||
# download from yahoo finance
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min
|
||||
|
||||
# normalize
|
||||
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
|
||||
|
||||
# dump data
|
||||
cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
|
||||
|
||||
# using
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
|
||||
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1min", region="CN")
|
||||
df = D.features(D.instruments("all"), ["$close"], freq="1min")
|
||||
|
||||
```
|
||||
|
||||
### US Data
|
||||
|
||||
#### 1d
|
||||
|
||||
```bash
|
||||
|
||||
# download from yahoo finance
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
|
||||
# normalize
|
||||
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d
|
||||
|
||||
# dump data
|
||||
cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
|
||||
|
||||
# using
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
|
||||
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_us_1d", region="US")
|
||||
df = D.features(D.instruments("all"), ["$close"], freq="day")
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Help
|
||||
```bash
|
||||
python collector.py collector_data --help
|
||||
@@ -42,5 +100,5 @@ pythono collector.py collector_data --help
|
||||
|
||||
## Parameters
|
||||
|
||||
- interval: 1m or 1d
|
||||
- interval: 1min or 1d
|
||||
- region: CN or US
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,6 +14,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
from qlib.utils import fname_to_code, code_to_fname
|
||||
|
||||
|
||||
class DumpDataBase:
|
||||
@@ -27,7 +28,6 @@ class DumpDataBase:
|
||||
HIGH_FREQ_FORMAT = "%Y-%m-%d %H:%M:%S"
|
||||
INSTRUMENTS_SEP = "\t"
|
||||
INSTRUMENTS_FILE_NAME = "all.txt"
|
||||
SAVE_INST_FIELD = "save_inst"
|
||||
|
||||
UPDATE_MODE = "update"
|
||||
ALL_MODE = "all"
|
||||
@@ -45,7 +45,6 @@ class DumpDataBase:
|
||||
exclude_fields: str = "",
|
||||
include_fields: str = "",
|
||||
limit_nums: int = None,
|
||||
inst_prefix: str = "",
|
||||
):
|
||||
"""
|
||||
|
||||
@@ -73,9 +72,6 @@ class DumpDataBase:
|
||||
fields not dumped
|
||||
limit_nums: int
|
||||
Use when debugging, default None
|
||||
inst_prefix: str
|
||||
add a column to the instruments file and record the saved instrument name,
|
||||
the US stock code contains "PRN", and the directory cannot be created on Windows system, use the "_" prefix.
|
||||
"""
|
||||
csv_path = Path(csv_path).expanduser()
|
||||
if isinstance(exclude_fields, str):
|
||||
@@ -84,7 +80,6 @@ class DumpDataBase:
|
||||
include_fields = include_fields.split(",")
|
||||
self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields)))
|
||||
self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
|
||||
self._inst_prefix = inst_prefix.strip()
|
||||
self.file_suffix = file_suffix
|
||||
self.symbol_field_name = symbol_field_name
|
||||
self.csv_files = sorted(csv_path.glob(f"*{self.file_suffix}") if csv_path.is_dir() else [csv_path])
|
||||
@@ -145,7 +140,7 @@ class DumpDataBase:
|
||||
return df
|
||||
|
||||
def get_symbol_from_file(self, file_path: Path) -> str:
|
||||
return file_path.name[: -len(self.file_suffix)].strip().lower()
|
||||
return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower())
|
||||
|
||||
def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]:
|
||||
return (
|
||||
@@ -173,7 +168,6 @@ class DumpDataBase:
|
||||
self.symbol_field_name,
|
||||
self.INSTRUMENTS_START_FIELD,
|
||||
self.INSTRUMENTS_END_FIELD,
|
||||
self.SAVE_INST_FIELD,
|
||||
],
|
||||
)
|
||||
|
||||
@@ -190,13 +184,11 @@ class DumpDataBase:
|
||||
instruments_path = str(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME).resolve())
|
||||
if isinstance(instruments_data, pd.DataFrame):
|
||||
_df_fields = [self.symbol_field_name, self.INSTRUMENTS_START_FIELD, self.INSTRUMENTS_END_FIELD]
|
||||
if self._inst_prefix:
|
||||
_df_fields.append(self.SAVE_INST_FIELD)
|
||||
instruments_data[self.SAVE_INST_FIELD] = instruments_data[self.symbol_field_name].apply(
|
||||
lambda x: f"{self._inst_prefix}{x}"
|
||||
)
|
||||
instruments_data = instruments_data.loc[:, _df_fields]
|
||||
instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP)
|
||||
instruments_data[self.symbol_field_name] = instruments_data[self.symbol_field_name].apply(
|
||||
lambda x: fname_to_code(x.lower()).upper()
|
||||
)
|
||||
instruments_data.to_csv(instruments_path, header=False, sep=self.INSTRUMENTS_SEP, index=False)
|
||||
else:
|
||||
np.savetxt(instruments_path, instruments_data, fmt="%s", encoding="utf-8")
|
||||
|
||||
@@ -223,26 +215,26 @@ class DumpDataBase:
|
||||
logger.warning(f"{features_dir.name} data is None or empty")
|
||||
return
|
||||
# align index
|
||||
_df = self.data_merge_calendar(df, self._calendars_list)
|
||||
_df = self.data_merge_calendar(df, calendar_list)
|
||||
# used when creating a bin file
|
||||
date_index = self.get_datetime_index(_df, calendar_list)
|
||||
for field in self.get_dump_fields(_df.columns):
|
||||
bin_path = features_dir.joinpath(f"{field}.{self.freq}{self.DUMP_FILE_SUFFIX}")
|
||||
if field not in _df.columns:
|
||||
continue
|
||||
if self._mode == self.UPDATE_MODE:
|
||||
if bin_path.exists() and self._mode == self.UPDATE_MODE:
|
||||
# update
|
||||
with bin_path.open("ab") as fp:
|
||||
np.array(_df[field]).astype("<f").tofile(fp)
|
||||
elif self._mode == self.ALL_MODE:
|
||||
np.hstack([date_index, _df[field]]).astype("<f").tofile(str(bin_path.resolve()))
|
||||
else:
|
||||
raise ValueError(f"{self._mode} cannot support!")
|
||||
# append; self._mode == self.ALL_MODE or not bin_path.exists()
|
||||
np.hstack([date_index, _df[field]]).astype("<f").tofile(str(bin_path.resolve()))
|
||||
|
||||
def _dump_bin(self, file_or_data: [Path, pd.DataFrame], calendar_list: List[pd.Timestamp]):
|
||||
if isinstance(file_or_data, pd.DataFrame):
|
||||
if file_or_data.empty:
|
||||
return
|
||||
code = file_or_data.iloc[0][self.symbol_field_name].lower()
|
||||
code = fname_to_code(file_or_data.iloc[0][self.symbol_field_name].lower())
|
||||
df = file_or_data
|
||||
elif isinstance(file_or_data, Path):
|
||||
code = self.get_symbol_from_file(file_or_data)
|
||||
@@ -253,8 +245,7 @@ class DumpDataBase:
|
||||
logger.warning(f"{code} data is None or empty")
|
||||
return
|
||||
# features save dir
|
||||
code = self._inst_prefix + code if self._inst_prefix else code
|
||||
features_dir = self._features_dir.joinpath(code)
|
||||
features_dir = self._features_dir.joinpath(code_to_fname(code).lower())
|
||||
features_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._data_to_bin(df, calendar_list, features_dir)
|
||||
|
||||
@@ -283,8 +274,6 @@ class DumpDataAll(DumpDataBase):
|
||||
_end_time = self._format_datetime(_end_time)
|
||||
symbol = self.get_symbol_from_file(file_path)
|
||||
_inst_fields = [symbol.upper(), _begin_time, _end_time]
|
||||
if self._inst_prefix:
|
||||
_inst_fields.append(self._inst_prefix + symbol.upper())
|
||||
date_range_list.append(f"{self.INSTRUMENTS_SEP.join(_inst_fields)}")
|
||||
p_bar.update()
|
||||
self._kwargs["all_datetime_set"] = all_datetime
|
||||
@@ -323,12 +312,18 @@ class DumpDataFix(DumpDataAll):
|
||||
def _dump_instruments(self):
|
||||
logger.info("start dump instruments......")
|
||||
_fun = partial(self._get_date, is_begin_end=True)
|
||||
new_stock_files = sorted(filter(lambda x: x.name not in self._old_instruments, self.csv_files))
|
||||
new_stock_files = sorted(
|
||||
filter(
|
||||
lambda x: fname_to_code(x.name[: -len(self.file_suffix)].strip().lower()).upper()
|
||||
not in self._old_instruments,
|
||||
self.csv_files,
|
||||
)
|
||||
)
|
||||
with tqdm(total=len(new_stock_files)) as p_bar:
|
||||
with ProcessPoolExecutor(max_workers=self.works) as execute:
|
||||
for file_path, (_begin_time, _end_time) in zip(new_stock_files, execute.map(_fun, new_stock_files)):
|
||||
if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
|
||||
symbol = self.get_symbol_from_file(file_path).upper()
|
||||
symbol = fname_to_code(self.get_symbol_from_file(file_path).lower()).upper()
|
||||
_dt_map = self._old_instruments.setdefault(symbol, dict())
|
||||
_dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
|
||||
_dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
|
||||
@@ -406,10 +401,10 @@ class DumpDataUpdate(DumpDataBase):
|
||||
)
|
||||
self._mode = self.UPDATE_MODE
|
||||
self._old_calendar_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt"))
|
||||
self._update_instruments = self._read_instruments(
|
||||
self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME)
|
||||
).to_dict(
|
||||
orient="index"
|
||||
self._update_instruments = (
|
||||
self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME))
|
||||
.set_index([self.symbol_field_name])
|
||||
.to_dict(orient="index")
|
||||
) # type: dict
|
||||
|
||||
# load all csv files
|
||||
@@ -425,10 +420,7 @@ class DumpDataUpdate(DumpDataBase):
|
||||
all_df = []
|
||||
|
||||
def _read_csv(file_path: Path):
|
||||
if self._include_fields:
|
||||
_df = pd.read_csv(file_path, usecols=self._include_fields)
|
||||
else:
|
||||
_df = pd.read_csv(file_path)
|
||||
_df = pd.read_csv(file_path, parse_dates=[self.date_field_name])
|
||||
if self.symbol_field_name not in _df.columns:
|
||||
_df[self.symbol_field_name] = self.get_symbol_from_file(file_path)
|
||||
return _df
|
||||
@@ -436,7 +428,7 @@ class DumpDataUpdate(DumpDataBase):
|
||||
with tqdm(total=len(self.csv_files)) as p_bar:
|
||||
with ThreadPoolExecutor(max_workers=self.works) as executor:
|
||||
for df in executor.map(_read_csv, self.csv_files):
|
||||
if df:
|
||||
if not df.empty:
|
||||
all_df.append(df)
|
||||
p_bar.update()
|
||||
|
||||
@@ -455,25 +447,27 @@ class DumpDataUpdate(DumpDataBase):
|
||||
with ProcessPoolExecutor(max_workers=self.works) as executor:
|
||||
futures = {}
|
||||
for _code, _df in self._all_data.groupby(self.symbol_field_name):
|
||||
_code = str(_code).upper()
|
||||
_code = fname_to_code(str(_code).lower()).upper()
|
||||
_start, _end = self._get_date(_df, is_begin_end=True)
|
||||
if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
|
||||
continue
|
||||
if _code in self._update_instruments:
|
||||
self._update_instruments[_code]["end_time"] = _end
|
||||
self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
|
||||
futures[executor.submit(self._dump_bin, _df, self._update_calendars)] = _code
|
||||
else:
|
||||
# new stock
|
||||
_dt_range = self._update_instruments.setdefault(_code, dict())
|
||||
_dt_range["start_time"] = _start
|
||||
_dt_range["end_time"] = _end
|
||||
_dt_range[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_start)
|
||||
_dt_range[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
|
||||
futures[executor.submit(self._dump_bin, _df, self._new_calendar_list)] = _code
|
||||
|
||||
for _future in tqdm(as_completed(futures)):
|
||||
try:
|
||||
_future.result()
|
||||
except Exception:
|
||||
error_code[futures[_future]] = traceback.format_exc()
|
||||
with tqdm(total=len(futures)) as p_bar:
|
||||
for _future in as_completed(futures):
|
||||
try:
|
||||
_future.result()
|
||||
except Exception:
|
||||
error_code[futures[_future]] = traceback.format_exc()
|
||||
p_bar.update()
|
||||
logger.info(f"dump bin errors: {error_code}")
|
||||
|
||||
logger.info("end of features dump.\n")
|
||||
@@ -481,7 +475,9 @@ class DumpDataUpdate(DumpDataBase):
|
||||
def dump(self):
|
||||
self.save_calendars(self._new_calendar_list)
|
||||
self._dump_features()
|
||||
self.save_instruments(pd.DataFrame.from_dict(self._update_instruments, orient="index"))
|
||||
df = pd.DataFrame.from_dict(self._update_instruments, orient="index")
|
||||
df.index.names = [self.symbol_field_name]
|
||||
self.save_instruments(df.reset_index())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import fire
|
||||
from qlib.tests.data import GetData
|
||||
|
||||
|
||||
3
setup.py
3
setup.py
@@ -11,7 +11,7 @@ NAME = "pyqlib"
|
||||
DESCRIPTION = "A Quantitative-research Platform"
|
||||
REQUIRES_PYTHON = ">=3.5.0"
|
||||
|
||||
VERSION = "0.6.1"
|
||||
VERSION = "0.6.2"
|
||||
|
||||
# Detect Cython
|
||||
try:
|
||||
@@ -35,7 +35,6 @@ REQUIRED = [
|
||||
"scipy>=1.0.0",
|
||||
"requests>=2.18.0",
|
||||
"sacred>=0.7.4",
|
||||
"pymongo==3.7.2",
|
||||
"python-socketio==3.1.2",
|
||||
"redis>=3.0.1",
|
||||
"python-redis-lock>=3.3.1",
|
||||
|
||||
@@ -66,7 +66,7 @@ class TestDataset(TestAutoData):
|
||||
# Check the data
|
||||
# Get data from DataFrame Directly
|
||||
data_from_df = (
|
||||
tsdh._handler.fetch(data_key=DataHandlerLP.DK_L)
|
||||
tsdh.handler.fetch(data_key=DataHandlerLP.DK_L)
|
||||
.loc(axis=0)["2015-01-01":"2016-12-31", "SZ300315"]
|
||||
.iloc[-30:]
|
||||
.values
|
||||
|
||||
@@ -37,7 +37,7 @@ class TestGetData(unittest.TestCase):
|
||||
|
||||
def test_0_qlib_data(self):
|
||||
|
||||
GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", version="latest")
|
||||
GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False)
|
||||
df = D.features(D.instruments("csi300"), self.FIELDS)
|
||||
self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed")
|
||||
self.assertFalse(df.dropna().empty, "get qlib data failed")
|
||||
|
||||
69
tests/test_register_ops.py
Normal file
69
tests/test_register_ops.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
import unittest
|
||||
import numpy as np
|
||||
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
from qlib.data.ops import ElemOperator, PairOperator
|
||||
from qlib.config import REG_CN
|
||||
from qlib.utils import exists_qlib_data
|
||||
from qlib.tests import TestAutoData
|
||||
from qlib.tests.data import GetData
|
||||
|
||||
|
||||
class Diff(ElemOperator):
    """Element-wise operator yielding the first difference of a feature.

    Parameters
    ----------
    feature : Expression
        feature instance

    Returns
    -------
    Expression
        a feature instance with first difference
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load the wrapped feature for the given range and return its ``diff()``."""
        loaded = self.feature.load(instrument, start_index, end_index, freq)
        return loaded.diff()

    def get_extended_window_size(self):
        """Return the wrapped feature's window, widened by one on the left."""
        left_ext, right_ext = self.feature.get_extended_window_size()
        # One extra element on the left, matching the one-step look-back of diff().
        return left_ext + 1, right_ext
|
||||
|
||||
|
||||
class Distance(PairOperator):
    """Pair operator yielding the absolute difference of two features.

    The operands are read from ``self.feature_left`` and ``self.feature_right``.

    Returns
    -------
    Expression
        a feature instance with distance
    """

    def _load_internal(self, instrument, start_index, end_index, freq):
        """Load both operands for the given range and return ``|left - right|``."""
        left = self.feature_left.load(instrument, start_index, end_index, freq)
        right = self.feature_right.load(instrument, start_index, end_index, freq)
        return np.abs(left - right)
|
||||
|
||||
|
||||
class TestRegiterCustomOps(TestAutoData):
    """Smoke test: custom operators supplied via ``custom_ops`` are usable in expressions."""

    @classmethod
    def setUpClass(cls) -> None:
        # Add the custom operators to the setup kwargs before the base class
        # runs its own setup (presumably forwarded to qlib.init -- verify in TestAutoData).
        cls._setup_kwargs.update(custom_ops=[Diff, Distance])
        super().setUpClass()

    def test_regiter_custom_ops(self):
        """Evaluate expressions built from the custom operators and print the result."""
        instruments = ["SH600000"]
        fields = ["Diff($close)", "Distance($close, Ref($close, 1))"]
        result = D.features(
            instruments,
            fields,
            start_time="2010-01-01",
            end_time="2017-12-31",
            freq="day",
        )
        print(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user