1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 02:21:18 +08:00

Compare commits

..

10 Commits

Author SHA1 Message Date
SunsetWolf
ba123aa46c add dependencies for generate hdf5 files 2025-03-19 13:52:03 +08:00
Linlang
4d621bff99 fix pkl file not loading in StaticDataLoader (#1896)
* fix pkl file not loading in StaticDataLoader

* resolve hard code

* resolve hard code
2025-03-18 16:05:24 +08:00
Ben Heckmann
82f1ef2def DRAFT add Data Health Checker (#1574)
* #854 implement first data health checker draft

* #854 added support for qlib's data format, implemented factor check, reformatted summary

* adaptation current dataset

* format with black

* add data health check to docs

* fix sphinx error

* fix pylint error

* update code

* format with black

* format with pylint

---------

Co-authored-by: Linlang <Lv.Linlang@hotmail.com>
2025-01-09 21:35:59 +08:00
Linlang
186512f272 Fix csi300 constituents url (#1883)
* fix_csi300_constituents_url

* Fix issue in readme

* format with black
2025-01-03 16:57:17 +08:00
codecnotsupported
bda374180a Update links to chenditc/investment_data to always point to latest release (#1877)
* Update README.md

Link to latest release.
https://docs.github.com/en/repositories/releasing-projects-on-github/linking-to-releases#linking-to-the-latest-release

* Update README.md

Link to latest release.
https://docs.github.com/en/repositories/releasing-projects-on-github/linking-to-releases#linking-to-the-latest-release

* Update README.md

Link to latest release.
https://docs.github.com/en/repositories/releasing-projects-on-github/linking-to-releases#linking-to-the-latest-release

* Update README.md

Link to latest release.
https://docs.github.com/en/repositories/releasing-projects-on-github/linking-to-releases#linking-to-the-latest-release

* Update README.md

* Update README.md
2025-01-03 13:56:49 +08:00
Linlang
014ff7d3fe Fix broken URL for RL (#1881)
* fix_issue_1878

* fix_issue_1878
2025-01-02 14:41:54 +08:00
Chia-hung Tai
23d9d5a0a9 Fix the empty price_s case and self.instruments in SBBStrategyEMA. (#1677)
* Fix the empty price_s case and self.instruments in SBBStrategyEMA.

* Update qlib/contrib/strategy/rule_strategy.py

* Update qlib/contrib/strategy/rule_strategy.py

---------

Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>
Co-authored-by: Linlang <Lv.Linlang@hotmail.com>
2024-12-26 15:56:41 +08:00
Linlang
7ce97c9da5 Bump version (#1872)
* bump version

* bump version

* Update README.md

* fix_ci_error

* fix_ci_error

* fix_ci_error

* fix_ci_error

---------

Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>
2024-12-26 14:35:37 +08:00
Linlang
5a84aaf1dc Update version 2024-12-23 14:28:09 +08:00
Linlang
afbb178e24 Update publish (#1871)
* update publish

* reformat with black
2024-12-23 13:22:24 +08:00
26 changed files with 517 additions and 63 deletions

View File

@@ -3,22 +3,16 @@
name: Upload Python Package
# on:
# release:
# types: [published]
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
release:
types: [published]
jobs:
deploy_with_bdist_wheel:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest, macos-13, macos-latest, macos-15]
os: [windows-latest, macos-13, macos-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
exclude:
- os: macos-13
@@ -38,13 +32,13 @@ jobs:
- name: Build wheel on ${{ matrix.os }}
run: |
make build
- name: Build and publish
- name: Upload to PyPi
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.TESTPYPI_TOKEN }}
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
ls dist
twine check dist/*.whl
twine upload dist/*.whl --verbose
deploy_with_manylinux:
runs-on: ubuntu-latest
@@ -57,19 +51,15 @@ jobs:
- name: Build wheel on Linux
uses: RalfG/python-wheels-manylinux-build@v0.7.1-manylinux2014_x86_64
with:
# not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129
python-versions: 'cp38-cp38 cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312'
build-requirements: 'numpy cython'
- name: Install dependencies
run: |
python -m pip install twine
python -m pip list
- name: Build and publish
- name: Upload to PyPi
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.TESTPYPI_TOKEN }}
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
ls dist
twine check dist/*.whl
twine check dist/pyqlib-*-manylinux*.whl
twine upload dist/pyqlib-*-manylinux*.whl --verbose

View File

@@ -13,13 +13,10 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
# Since macos-latest changed from 12.7.4 to 14.4.1,
# the minimum python version that matches a 14.4.1 version of macos is 3.10,
# If you want to use python 3.7 in github action, then the latest macos system version is macos-13,
# after macos-13 python 3.7 is no longer supported.
# so we limit the macos version to macos-13.
os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-13, macos-14, macos-15]
# not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129
# In github action, using python 3.7, pip install will not match the latest version of the package.
# Also, python 3.7 is no longer supported from macos-14, and will be phased out from macos-13 in the near future.
# All things considered, we have removed python 3.7.
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
@@ -34,10 +31,16 @@ jobs:
- name: Update pip to the latest version
run: |
python -m pip install --upgrade pip
# Will cancel this step when the next qlib version is released. The current qlib version is: 0.9.6
- name: Installing pywinpt for windows
if: ${{ matrix.os == 'windows-latest' }}
run: |
python -m pip install pywinpty --only-binary=:all:
- name: Qlib installation test
run: |
python -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ pyqlib==0.9.5.80
python -m pip install pyqlib
- name: Install Lightgbm for MacOS
if: ${{ matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'macos-15' }}

View File

@@ -0,0 +1,113 @@
name: Test qlib from source

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    # `Unit tests with Pytest` may be retried up to 3 times, so the job-level
    # budget has to leave room for all attempts.
    timeout-minutes: 180
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - windows-latest
          - ubuntu-20.04
          - ubuntu-22.04
          - macos-13
          - macos-14
          - macos-15
        # Python 3.7 is deliberately absent: on GitHub Actions pip does not
        # resolve the latest package versions for it, it is unsupported from
        # macos-14 onwards, and it is being phased out of macos-13.
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
          - "3.12"

    steps:
      - name: Test qlib from source
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Update pip to the latest version
        run: |
          python -m pip install --upgrade pip

      - name: Installing pytorch for macos
        if: ${{ matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'macos-15' }}
        run: |
          python -m pip install torch torchvision torchaudio

      - name: Installing pytorch for ubuntu
        if: ${{ matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-22.04' }}
        run: |
          python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

      - name: Installing pytorch for windows
        if: ${{ matrix.os == 'windows-latest' }}
        run: |
          python -m pip install torch torchvision torchaudio

      - name: Set up Python tools
        run: |
          make dev

      - name: Lint with Black
        run: |
          make black

      # Read the Docs builds on ubuntu 22.04, so the documentation build only
      # needs to be verified on that runner.
      - name: Make html with sphinx
        if: ${{ matrix.os == 'ubuntu-22.04' }}
        run: |
          make docs-gen

      - name: Check Qlib with pylint
        run: |
          make pylint

      - name: Check Qlib with flake8
        run: |
          make flake8

      - name: Check Qlib with mypy
        run: |
          make mypy

      - name: Check Qlib ipynb with nbqa
        run: |
          make nbqa

      - name: Test data downloads
        run: |
          python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
          python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl

      - name: Install Lightgbm for MacOS
        if: ${{ matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'macos-15' }}
        run: |
          /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
          HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
          # FIX MacOS error: Segmentation fault
          # reference: https://github.com/microsoft/LightGBM/issues/4229
          wget https://raw.githubusercontent.com/Homebrew/homebrew-core/fb8323f2b170bd4ae97e1bac9bf3e2983af3fdb0/Formula/libomp.rb
          brew unlink libomp
          brew install libomp.rb

      - name: Check Qlib ipynb with nbconvert
        run: |
          make nbconvert

      - name: Test workflow by config (install from source)
        run: |
          python -m pip install numba
          python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml

      # Only tests not marked `slow` run here; each attempt is capped at 60 minutes.
      - name: Unit tests with Pytest
        uses: nick-fields/retry@v2
        with:
          timeout_minutes: 60
          max_attempts: 3
          command: |
            cd tests
            python -m pytest . -m "not slow" --durations=0

View File

@@ -0,0 +1,58 @@
name: Test qlib from source slow

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    # `Unit tests with Pytest` may be retried up to 3 times, so the job-level
    # budget has to leave room for all attempts.
    timeout-minutes: 720
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - windows-latest
          - ubuntu-20.04
          - ubuntu-22.04
          - macos-13
          - macos-14
          - macos-15
        # Python 3.7 is deliberately absent: on GitHub Actions pip does not
        # resolve the latest package versions for it, it is unsupported from
        # macos-14 onwards, and it is being phased out of macos-13.
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
          - "3.12"

    steps:
      - name: Test qlib from source slow
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up Python tools
        run: |
          make dev

      - name: Downloads dependencies data
        run: |
          python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn

      - name: Install Lightgbm for MacOS
        if: ${{ matrix.os == 'macos-13' || matrix.os == 'macos-14' || matrix.os == 'macos-15' }}
        run: |
          /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Microsoft/qlib/main/.github/brew_install.sh)"
          HOMEBREW_NO_AUTO_UPDATE=1 brew install lightgbm
          # FIX MacOS error: Segmentation fault
          # reference: https://github.com/microsoft/LightGBM/issues/4229
          wget https://raw.githubusercontent.com/Homebrew/homebrew-core/fb8323f2b170bd4ae97e1bac9bf3e2983af3fdb0/Formula/libomp.rb
          brew unlink libomp
          brew install libomp.rb

      # Only tests marked `slow` run here; each attempt is capped at 240 minutes.
      - name: Unit tests with Pytest
        uses: nick-fields/retry@v2
        with:
          timeout_minutes: 240
          max_attempts: 3
          command: |
            cd tests
            python -m pytest . -m "slow" --durations=0

View File

@@ -12,6 +12,12 @@ PUBLIC_DIR := $(shell [ "$$READTHEDOCS" = "True" ] && echo "$$READTHEDOCS_OUTPUT
SO_DIR := qlib/data/_libs
SO_FILES := $(wildcard $(SO_DIR)/*.so)
ifeq ($(OS),Windows_NT)
IS_WINDOWS = true
else
IS_WINDOWS = false
endif
########################################################################################
# Development Environment Management
########################################################################################
@@ -48,6 +54,10 @@ deepclean: clean
# What this code does is compile two Cython modules, rolling and expanding, using setuptools and Cython,
# and builds them as binary expansion modules that can be imported directly into Python.
# Since pyproject.toml can't do that, we compile it here.
# pywinpty as a dependency of jupyter on windows, if you use pip install pywinpty installation,
# will first download the tar.gz file, and then locally compiled and installed,
# this will lead to some unnecessary trouble, so we choose to install the compiled whl file, to avoid trouble.
prerequisite:
@if [ -n "$(SO_FILES)" ]; then \
echo "Shared library files exist, skipping build."; \
@@ -58,6 +68,10 @@ prerequisite:
python -c "from setuptools import setup, Extension; from Cython.Build import cythonize; import numpy; extensions = [Extension('qlib.data._libs.rolling', ['qlib/data/_libs/rolling.pyx'], language='c++', include_dirs=[numpy.get_include()]), Extension('qlib.data._libs.expanding', ['qlib/data/_libs/expanding.pyx'], language='c++', include_dirs=[numpy.get_include()])]; setup(ext_modules=cythonize(extensions, language_level='3'), script_args=['build_ext', '--inplace'])"; \
fi
@if [ "$(IS_WINDOWS)" = "true" ]; then \
python -m pip install pywinpty --only-binary=:all:; \
fi
# Install the package in editable mode.
dependencies:
python -m pip install -e .
@@ -87,7 +101,7 @@ analysis:
python -m pip install -e .[analysis]
all:
python -m pip install -e .[dev,lint,docs,package,test,analysis,rl]
python -m pip install -e .[pywinpty,dev,lint,docs,package,test,analysis,rl]
install: prerequisite dependencies

View File

@@ -155,15 +155,15 @@ Here is a quick **[demo](https://terminalizer.com/view/3f24561a4470)** shows how
This table demonstrates the supported Python version of `Qlib`:
| | install with pip | install from source | plot |
| ------------- |:---------------------:|:--------------------:|:------------------:|
| Python 3.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.8 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.9 | :x: | :heavy_check_mark: | :x: |
| Python 3.9 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.10 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.11 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Python 3.12 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
**Note**:
1. **Conda** is suggested for managing your Python environment. In some cases, using Python outside of a `conda` environment may result in missing header files, causing the installation failure of certain packages.
1. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source.
1. For Python 3.9, `Qlib` supports running workflows such as training models, doing backtest and plot most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting for the *model performance* is not supported for now and we will fix this when the dependent packages are upgraded in the future.
1. `Qlib` requires the `tables` package; `hdf5` in `tables` does not support Python 3.9.
2. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.8 or higher, or use `conda`'s Python to install ``Qlib`` from source.
### Install with pip
Users can easily install ``Qlib`` by pip according to the following command.
@@ -181,7 +181,7 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
```bash
pip install numpy
pip install --upgrade cython
pip install --upgrade cython
```
* Clone the repository and install ``Qlib`` as follows.
@@ -189,7 +189,6 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
git clone https://github.com/microsoft/qlib.git && cd qlib
pip install . # `pip install -e .[dev]` is recommended for development. check details in docs/developer/code_standard_and_dev_guide.rst
```
**Note**: You can install Qlib with `python setup.py install` as well. But it is not the recommended approach. It will skip `pip` and cause obscure problems. For example, **only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**.
**Tips**: If you fail to install `Qlib` or run the examples in your environment, comparing your steps and the [CI workflow](.github/workflows/test_qlib_from_source.yml) may help you find the problem.
@@ -197,11 +196,11 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
## Data Preparation
❗ Due to a more restrictive data security policy, the official dataset is temporarily disabled. You can try [this data source](https://github.com/chenditc/investment_data/releases) contributed by the community.
Here is an example to download the data updated on 20240809.
Here is an example to download the latest data.
```bash
wget https://github.com/chenditc/investment_data/releases/download/2024-08-09/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=1
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
rm -f qlib_bin.tar.gz
```
@@ -265,6 +264,16 @@ We recommend users to prepare their own data if they have a high-quality dataset
* *trading_date*: start of trading day
* *end_date*: end of trading day(not included)
### Checking the health of the data
* We provide a script to check the health of the data. You can run the following command to check whether the data is healthy or not.
```
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data
```
* Of course, you can also add some parameters to adjust the test results, such as this.
```
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
```
* If you want more information about `check_data_health`, please refer to the [documentation](https://qlib.readthedocs.io/en/latest/component/data.html#checking-the-health-of-the-data).
<!--
- Run the initialization code and get stock data:

View File

@@ -197,6 +197,57 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
If you want to use your own alpha factors that can't be calculated from OHLCV (such as PE or EPS), you can add them to the CSV files together with the OHLCV columns and then dump them to the Qlib data format.
Checking the health of the data
-------------------------------
``Qlib`` provides a script to check the health of the data.
- The main points to check are as follows
- Check if any data is missing in the DataFrame.
- Check if there are any large step changes above the threshold in the OHLCV columns.
- Check if any of the required columns (OHLCV) are missing in the DataFrame.
- Check if the 'factor' column is missing in the DataFrame.
- You can run the following commands to check whether the data is healthy or not.
for daily data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data
for 1min data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min
- Of course, you can also add some parameters to adjust the test results.
- The available parameters are these.
- freq: Frequency of data.
- large_step_threshold_price: Maximum permitted price change.
- large_step_threshold_volume: Maximum permitted volume change.
- missing_data_num: Maximum number of null values allowed in a column before the instrument is reported.
- You can run the following commands to check whether the data is healthy or not.
for daily data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
for 1min data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --freq 1min --missing_data_num 35806 --large_step_threshold_volume 3205452000000 --large_step_threshold_price 0.91
Stock Pool (Market)
-------------------

View File

@@ -25,7 +25,7 @@ The design of the framework is shown in the yellow part in the middle of the fig
The frequency of the trading algorithm, decision content and execution environment can be customized by users (e.g. intraday trading, daily-frequency trading, weekly-frequency trading), and the execution environment can be nested with finer-grained trading algorithm and execution environment inside (i.e. sub-workflow in the figure, e.g. daily-frequency orders can be turned into finer-grained decisions by splitting orders within the day). The flexibility of the nested decision execution framework makes it easy for users to explore the effects of combining different levels of trading strategies and break down the optimization barriers between different levels of the trading algorithm.
The optimization for the nested decision execution framework can be implemented with the support of `QlibRL <https://qlib.readthedocs.io/en/latest/component/rl.html>`_. To know more about how to use the QlibRL, go to API Reference: `RL API <../reference/api.html#rl>`_.
The optimization for the nested decision execution framework can be implemented with the support of `QlibRL <./rl/overall.html>`_. To know more about how to use the QlibRL, go to API Reference: `RL API <../reference/api.html#rl>`_.
Example
=======

View File

@@ -7,7 +7,7 @@ The table below shows the performances of different solutions on different forec
## Alpha158 Dataset
Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
rm -f qlib_bin.tar.gz

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
"""
from copy import deepcopy

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
"""
from copy import deepcopy

View File

@@ -1,10 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
NOTE:
- This scripts is a demo to import example data import Qlib
- !!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!:
- Its structure is not well designed and very ugly, your contribution is welcome to make importing dataset easier
NOTE:
- This scripts is a demo to import example data import Qlib
- !!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!:
- Its structure is not well designed and very ugly, your contribution is welcome to make importing dataset easier
"""
from datetime import date, datetime as dt
import os

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Qlib provides two kinds of interfaces.
Qlib provides two kinds of interfaces.
(1) Users could define the Quant research workflow by a simple configuration.
(2) Qlib is designed in a modularized way and supports creating research workflow by code just like building blocks.

View File

@@ -44,6 +44,8 @@ dependencies = [
"matplotlib",
"jupyter",
"nbconvert",
"pyarrow",
"tables",
]
[project.optional-dependencies]

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
from pathlib import Path
__version__ = "0.9.5.80"
__version__ = "0.9.6.99"
__version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version
import os
from typing import Union

View File

@@ -427,6 +427,10 @@ class Indicator:
# NOTE ~(price_s < 1e-08) is different from price_s >= 1e-8
# ~(np.nan < 1e-8) -> ~(False) -> True
# if price_s is empty
if price_s.empty:
return None, None
assert isinstance(price_s, idd.SingleData)
if agg == "vwap":
volume_s = trade_exchange.get_volume(inst, trade_start_time, trade_end_time, method=None)

View File

@@ -326,8 +326,10 @@ class SBBStrategyEMA(SBBStrategyBase):
if instruments is None:
warnings.warn("`instruments` is not set, will load all stocks")
self.instruments = "all"
if isinstance(instruments, str):
elif isinstance(instruments, str):
self.instruments = D.instruments(instruments)
elif isinstance(instruments, List):
self.instruments = instruments
self.freq = freq
super(SBBStrategyEMA, self).__init__(
outer_trade_decision, level_infra, common_infra, trade_exchange=trade_exchange, **kwargs

View File

@@ -1,9 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
This module is not a necessary part of Qlib.
They are just some tools for convenience
It is should not imported into the core part of qlib
This module is not a necessary part of Qlib.
They are just some tools for convenience
It is should not imported into the core part of qlib
"""
import torch
import numpy as np

View File

@@ -279,8 +279,11 @@ class StaticDataLoader(DataLoader, Serializable):
)
self._data.sort_index(inplace=True)
elif isinstance(self._config, (str, Path)):
with Path(self._config).open("rb") as f:
self._data = pickle.load(f)
if str(self._config).strip().endswith(".parquet"):
self._data = pd.read_parquet(self._config, engine="pyarrow")
else:
with Path(self._config).open("rb") as f:
self._data = pickle.load(f)
elif isinstance(self._config, pd.DataFrame):
self._data = self._config

View File

@@ -200,7 +200,7 @@ class Trainer:
if ckpt_path is not None:
_logger.info("Resuming states from %s", str(ckpt_path))
self.load_state_dict(torch.load(ckpt_path))
self.load_state_dict(torch.load(ckpt_path, weights_only=False))
else:
self.initialize()

View File

@@ -71,6 +71,6 @@ qlib.init(provider_uri=provider_uri, region=REG_CN)
## Use Crowd Sourced Data
There is also a [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
```

View File

@@ -0,0 +1,203 @@
from loguru import logger
import os
from typing import Optional
import fire
import pandas as pd
import qlib
from tqdm import tqdm
from qlib.data import D
class DataHealthChecker:
    """Checks a dataset for data completeness and correctness.

    Every instrument is held as a ``pd.DataFrame`` in ``self.data`` and checked
    for the following problems:

    - any of the columns ["open", "high", "low", "close", "volume"] are missing
    - any data is missing
    - any step change in the OHLCV columns is above a threshold
      (default: 0.5 for price, 3 for volume)
    - any factor is missing
    """

    # Columns every instrument is expected to provide.
    _OHLCV = ("open", "high", "low", "close", "volume")
    # Keys containing one of these codes are skipped by the factor check.
    _FACTOR_EXEMPT_CODES = ("000300", "000903", "000905")

    def __init__(
        self,
        csv_path=None,
        qlib_dir=None,
        freq="day",
        large_step_threshold_price=0.5,
        large_step_threshold_volume=3,
        missing_data_num=0,
    ):
        """Load the data to check from exactly one of ``csv_path`` / ``qlib_dir``.

        Parameters
        ----------
        csv_path : str, optional
            Directory whose ``*.csv`` files are read with ``pd.read_csv``; each
            frame is stored under its file name.
        qlib_dir : str, optional
            Passed to ``qlib.init(provider_uri=...)`` before the data is loaded
            through ``load_qlib_data``.
        freq : str
            Frequency forwarded to the qlib data API when ``qlib_dir`` is used.
        large_step_threshold_price : float
            Largest absolute ``pct_change`` tolerated in open/high/low/close.
        large_step_threshold_volume : float
            Largest absolute ``pct_change`` tolerated in volume.
        missing_data_num : int
            An instrument is reported by ``check_missing_data`` when any of its
            columns has more null values than this number.

        Raises
        ------
        AssertionError
            If neither or both of ``csv_path`` and ``qlib_dir`` are given, or if
            ``csv_path`` is not a directory.
        """
        assert csv_path or qlib_dir, "One of csv_path or qlib_dir should be provided."
        assert not (csv_path and qlib_dir), "Only one of csv_path or qlib_dir should be provided."

        self.data = {}
        self.problems = {}
        self.freq = freq
        self.large_step_threshold_price = large_step_threshold_price
        self.large_step_threshold_volume = large_step_threshold_volume
        self.missing_data_num = missing_data_num

        if csv_path:
            assert os.path.isdir(csv_path), f"{csv_path} should be a directory."
            files = [f for f in os.listdir(csv_path) if f.endswith(".csv")]
            for filename in tqdm(files, desc="Loading data"):
                df = pd.read_csv(os.path.join(csv_path, filename))
                self.data[filename] = df
        elif qlib_dir:
            qlib.init(provider_uri=qlib_dir)
            self.load_qlib_data()

    def load_qlib_data(self):
        """Fill ``self.data`` with one OHLCV + factor frame per qlib instrument.

        The ``$``-prefixed qlib field names are renamed to the plain column
        names used by the checks.
        """
        instruments = D.instruments(market="all")
        instrument_list = D.list_instruments(instruments=instruments, as_list=True, freq=self.freq)
        required_fields = ["$open", "$close", "$low", "$high", "$volume", "$factor"]
        for instrument in instrument_list:
            df = D.features([instrument], required_fields, freq=self.freq)
            df.rename(
                columns={
                    "$open": "open",
                    "$close": "close",
                    "$low": "low",
                    "$high": "high",
                    "$volume": "volume",
                    "$factor": "factor",
                },
                inplace=True,
            )
            self.data[instrument] = df

    @staticmethod
    def _format_step_label(label) -> str:
        """Render the index label of a large step for the report.

        Tuple labels (e.g. qlib's ``(instrument, datetime)`` index) are reduced
        to their last element. Labels offering ``strftime`` are formatted as
        ``YYYY-MM-DD``; anything else (e.g. the integer row number of a CSV
        frame) falls back to ``str``.
        """
        if isinstance(label, tuple):
            label = label[-1]
        if hasattr(label, "strftime"):
            return label.strftime("%Y-%m-%d")
        return str(label)

    def check_missing_data(self) -> Optional[pd.DataFrame]:
        """Check if any data is missing in the DataFrame.

        Returns
        -------
        Optional[pd.DataFrame]
            One row per instrument that has at least one column with more than
            ``missing_data_num`` null values, holding the null count of every
            OHLCV column; ``None`` when no instrument is affected.
        """
        result_dict = {
            "instruments": [],
            "open": [],
            "high": [],
            "low": [],
            "close": [],
            "volume": [],
        }
        for filename, df in self.data.items():
            # Count nulls once per frame instead of once per reported column.
            null_counts = df.isnull().sum()
            if not (null_counts > self.missing_data_num).any():
                continue
            result_dict["instruments"].append(filename)
            for col in self._OHLCV:
                # A column that does not exist at all is reported as entirely
                # missing rather than raising KeyError; its absence itself is
                # reported by check_required_columns.
                result_dict[col].append(null_counts[col] if col in null_counts.index else len(df))

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ There are no missing data.")
        return None

    def check_large_step_changes(self) -> Optional[pd.DataFrame]:
        """Check if there are any large step changes above the threshold in the OHLCV columns.

        Returns
        -------
        Optional[pd.DataFrame]
            One row per (instrument, column) whose absolute ``pct_change``
            exceeds the relevant threshold, with the label of the first
            offending row and the largest change seen; ``None`` otherwise.
        """
        result_dict = {
            "instruments": [],
            "col_name": [],
            "date": [],
            "pct_change": [],
        }
        for filename, df in self.data.items():
            for col in self._OHLCV:
                if col not in df.columns:
                    continue
                pct_change = df[col].pct_change(fill_method=None).abs()
                threshold = self.large_step_threshold_volume if col == "volume" else self.large_step_threshold_price
                large_steps = pct_change[pct_change > threshold]
                if large_steps.empty:
                    continue
                result_dict["instruments"].append(filename)
                result_dict["col_name"].append(col)
                # Works for qlib's MultiIndex as well as the plain index of CSV frames.
                result_dict["date"].append(self._format_step_label(large_steps.index[0]))
                result_dict["pct_change"].append(pct_change.max())

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ There are no large step changes in the OHLCV column above the threshold.")
        return None

    def check_required_columns(self) -> Optional[pd.DataFrame]:
        """Check if any of the required columns (OLHCV) are missing in the DataFrame.

        Returns
        -------
        Optional[pd.DataFrame]
            One row per (instrument, missing column); ``None`` when every
            instrument has all required columns.
        """
        result_dict = {
            "instruments": [],
            "missing_col": [],
        }
        for filename, df in self.data.items():
            for column in self._OHLCV:
                if column not in df.columns:
                    # Append in lock-step so both lists always have equal length.
                    result_dict["instruments"].append(filename)
                    result_dict["missing_col"].append(column)

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ The columns (OLHCV) are complete and not missing.")
        return None

    def check_missing_factor(self) -> Optional[pd.DataFrame]:
        """Check if the 'factor' column is missing in the DataFrame.

        Keys containing "000300", "000903" or "000905" are skipped.

        Returns
        -------
        Optional[pd.DataFrame]
            One row per instrument whose ``factor`` column is absent
            (``missing_factor_col``) or contains only nulls
            (``missing_factor_data``); ``None`` when no instrument is affected.
        """
        result_dict = {
            "instruments": [],
            "missing_factor_col": [],
            "missing_factor_data": [],
        }
        for filename, df in self.data.items():
            if any(code in filename for code in self._FACTOR_EXEMPT_CODES):
                continue
            column_missing = "factor" not in df.columns
            # Only touch df["factor"] when the column exists; an absent column
            # implies that its data is missing as well.
            data_missing = column_missing or bool(df["factor"].isnull().all())
            if data_missing:
                result_dict["instruments"].append(filename)
                result_dict["missing_factor_col"].append(column_missing)
                result_dict["missing_factor_data"].append(True)

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ The `factor` column already exists and is not empty.")
        return None

    def check_data(self):
        """Run every check and print a summary of the problems that were found."""
        check_missing_data_result = self.check_missing_data()
        check_large_step_changes_result = self.check_large_step_changes()
        check_required_columns_result = self.check_required_columns()
        check_missing_factor_result = self.check_missing_factor()

        all_results = (
            check_missing_data_result,
            check_large_step_changes_result,
            check_required_columns_result,
            check_missing_factor_result,
        )
        # The header is printed as soon as any single check reports a problem.
        if any(result is not None for result in all_results):
            print(f"\nSummary of data health check ({len(self.data)} files checked):")
            print("-------------------------------------------------")
            if isinstance(check_missing_data_result, pd.DataFrame):
                logger.warning("There is missing data.")
                print(check_missing_data_result)
            if isinstance(check_large_step_changes_result, pd.DataFrame):
                logger.warning("The OHLCV column has large step changes.")
                print(check_large_step_changes_result)
            if isinstance(check_required_columns_result, pd.DataFrame):
                logger.warning("Columns (OLHCV) are missing.")
                print(check_required_columns_result)
            if isinstance(check_missing_factor_result, pd.DataFrame):
                logger.warning("The factor column does not exist or is empty")
                print(check_missing_factor_result)


if __name__ == "__main__":
    fire.Fire(DataHealthChecker)

View File

@@ -23,7 +23,9 @@ from data_collector.utils import get_calendar_list, get_trading_date_by_shift, d
from data_collector.utils import get_instruments
NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
NEW_COMPANIES_URL = (
"https://oss-ch.csindex.com.cn/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
)
INDEX_CHANGES_URL = "https://www.csindex.com.cn/csindex-home/search/search-content?lang=cn&searchInput=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC&pageNum={page_num}&pageSize={page_size}&sortField=date&dateRange=all&contentType=announcement"

View File

@@ -16,9 +16,9 @@ The packaged docker runtime is hosted on dockerhub: https://hub.docker.com/repos
## How to use it in qlib
### Option 1: Download release bin data
User can download data in qlib bin format and use it directly: https://github.com/chenditc/investment_data/releases/tag/20220720
User can download data in qlib bin format and use it directly: https://github.com/chenditc/investment_data/releases/latest
```bash
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
```

View File

@@ -37,5 +37,5 @@ setup(
language="c++",
include_dirs=[NUMPY_INCLUDE],
),
]
],
)

View File

@@ -194,7 +194,7 @@ def test_trainer_checkpoint():
assert (output_dir / "002.pth").exists()
assert os.readlink(output_dir / "latest.pth") == str(output_dir / "002.pth")
trainer.load_state_dict(torch.load(output_dir / "001.pth"))
trainer.load_state_dict(torch.load(output_dir / "001.pth", weights_only=False))
assert trainer.current_iter == 1
assert trainer.current_episode == 100