1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-29 09:01:18 +08:00

Compare commits

..

2 Commits

Author SHA1 Message Date
Linlang
8ac25ff4bd bump version 2024-12-23 19:56:07 +08:00
Linlang
b101006750 bump version 2024-12-23 16:00:18 +08:00
22 changed files with 44 additions and 322 deletions

View File

@@ -31,12 +31,6 @@ jobs:
- name: Update pip to the latest version
run: |
python -m pip install --upgrade pip
# Will cancel this step when the next qlib version is released. The current qlib version is: 0.9.6
- name: Installing pywinpt for windows
if: ${{ matrix.os == 'windows-latest' }}
run: |
python -m pip install pywinpty --only-binary=:all:
- name: Qlib installation test
run: |

View File

@@ -12,12 +12,6 @@ PUBLIC_DIR := $(shell [ "$$READTHEDOCS" = "True" ] && echo "$$READTHEDOCS_OUTPUT
SO_DIR := qlib/data/_libs
SO_FILES := $(wildcard $(SO_DIR)/*.so)
ifeq ($(OS),Windows_NT)
IS_WINDOWS = true
else
IS_WINDOWS = false
endif
########################################################################################
# Development Environment Management
########################################################################################
@@ -54,10 +48,6 @@ deepclean: clean
# This step compiles two Cython modules, rolling and expanding, using setuptools and Cython,
# and builds them as binary extension modules that can be imported directly into Python.
# Since pyproject.toml can't do that, we compile it here.
# pywinpty as a dependency of jupyter on windows, if you use pip install pywinpty installation,
# will first download the tar.gz file, and then locally compiled and installed,
# this will lead to some unnecessary trouble, so we choose to install the compiled whl file, to avoid trouble.
prerequisite:
@if [ -n "$(SO_FILES)" ]; then \
echo "Shared library files exist, skipping build."; \
@@ -68,10 +58,6 @@ prerequisite:
python -c "from setuptools import setup, Extension; from Cython.Build import cythonize; import numpy; extensions = [Extension('qlib.data._libs.rolling', ['qlib/data/_libs/rolling.pyx'], language='c++', include_dirs=[numpy.get_include()]), Extension('qlib.data._libs.expanding', ['qlib/data/_libs/expanding.pyx'], language='c++', include_dirs=[numpy.get_include()])]; setup(ext_modules=cythonize(extensions, language_level='3'), script_args=['build_ext', '--inplace'])"; \
fi
@if [ "$(IS_WINDOWS)" = "true" ]; then \
python -m pip install pywinpty --only-binary=:all:; \
fi
# Install the package in editable mode.
dependencies:
python -m pip install -e .
@@ -101,7 +87,7 @@ analysis:
python -m pip install -e .[analysis]
all:
python -m pip install -e .[pywinpty,dev,lint,docs,package,test,analysis,rl]
python -m pip install -e .[dev,lint,docs,package,test,analysis,rl]
install: prerequisite dependencies

View File

@@ -164,6 +164,7 @@ This table demonstrates the supported Python version of `Qlib`:
**Note**:
1. **Conda** is suggested for managing your Python environment. In some cases, using Python outside of a `conda` environment may result in missing header files, causing the installation failure of certain packages.
2. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.8 or higher, or use `conda`'s Python to install ``Qlib`` from source.
3. For Python 3.9, `Qlib` supports running workflows such as training models, doing backtest and plot most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting for the *model performance* is not supported for now and we will fix this when the dependent packages are upgraded in the future.
### Install with pip
Users can easily install ``Qlib`` by pip according to the following command.
@@ -196,11 +197,11 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
## Data Preparation
❗ Due to a more restrictive data security policy, the official dataset is temporarily disabled. You can try [this data source](https://github.com/chenditc/investment_data/releases) contributed by the community.
Here is an example to download the latest data.
Here is an example to download the data updated on 20240809.
```bash
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/download/2024-08-09/qlib_bin.tar.gz
mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=1
rm -f qlib_bin.tar.gz
```
@@ -264,16 +265,6 @@ We recommend users to prepare their own data if they have a high-quality dataset
* *trading_date*: start of trading day
* *end_date*: end of trading day(not included)
### Checking the health of the data
* We provide a script to check the health of the data, you can run the following commands to check whether the data is healthy or not.
```
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data
```
* Of course, you can also add some parameters to adjust the test results, such as this.
```
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
```
* If you want more information about `check_data_health`, please refer to the [documentation](https://qlib.readthedocs.io/en/latest/component/data.html#checking-the-health-of-the-data).
<!--
- Run the initialization code and get stock data:

View File

@@ -197,57 +197,6 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
If you want to use your own alpha-factor which can't be calculated from OHLCV, like PE, EPS and so on, you can add it to the CSV files together with the OHLCV columns and then dump it to the Qlib format data.
Checking the health of the data
-------------------------------
``Qlib`` provides a script to check the health of the data.
- The main points to check are as follows
- Check if any data is missing in the DataFrame.
- Check if there are any large step changes above the threshold in the OHLCV columns.
- Check if any of the required columns (OLHCV) are missing in the DataFrame.
- Check if the 'factor' column is missing in the DataFrame.
- You can run the following commands to check whether the data is healthy or not.
for daily data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data
for 1min data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min
- Of course, you can also add some parameters to adjust the test results.
- The available parameters are these.
- freq: Frequency of data.
- large_step_threshold_price: Maximum permitted price change
- large_step_threshold_volume: Maximum permitted volume change.
- missing_data_num: Maximum value for which data is allowed to be null.
- You can run the following commands to check whether the data is healthy or not.
for daily data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
for 1min data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --freq 1min --missing_data_num 35806 --large_step_threshold_volume 3205452000000 --large_step_threshold_price 0.91
Stock Pool (Market)
-------------------

View File

@@ -25,7 +25,7 @@ The design of the framework is shown in the yellow part in the middle of the fig
The frequency of the trading algorithm, decision content and execution environment can be customized by users (e.g. intraday trading, daily-frequency trading, weekly-frequency trading), and the execution environment can be nested with finer-grained trading algorithm and execution environment inside (i.e. sub-workflow in the figure, e.g. daily-frequency orders can be turned into finer-grained decisions by splitting orders within the day). The flexibility of the nested decision execution framework makes it easy for users to explore the effects of combining different levels of trading strategies and break down the optimization barriers between different levels of the trading algorithm.
The optimization for the nested decision execution framework can be implemented with the support of `QlibRL <./rl/overall.html>`_. To know more about how to use the QlibRL, go to API Reference: `RL API <../reference/api.html#rl>`_.
The optimization for the nested decision execution framework can be implemented with the support of `QlibRL <https://qlib.readthedocs.io/en/latest/component/rl.html>`_. To know more about how to use the QlibRL, go to API Reference: `RL API <../reference/api.html#rl>`_.
Example
=======

View File

@@ -7,7 +7,7 @@ The table below shows the performances of different solutions on different forec
## Alpha158 Dataset
Here is the [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
mkdir -p ~/.qlib/qlib_data/cn_data
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
rm -f qlib_bin.tar.gz

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
"""
from copy import deepcopy

View File

@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
The motivation of this demo
- To show the data modules of Qlib is Serializable, users can dump processed data to disk to avoid duplicated data preprocessing
"""
from copy import deepcopy

View File

@@ -1,10 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
NOTE:
- This scripts is a demo to import example data import Qlib
- !!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!:
- Its structure is not well designed and very ugly, your contribution is welcome to make importing dataset easier
NOTE:
- This scripts is a demo to import example data import Qlib
- !!!!!!!!!!!!!!!TODO!!!!!!!!!!!!!!!!!!!:
- Its structure is not well designed and very ugly, your contribution is welcome to make importing dataset easier
"""
from datetime import date, datetime as dt
import os

View File

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Qlib provides two kinds of interfaces.
Qlib provides two kinds of interfaces.
(1) Users could define the Quant research workflow by a simple configuration.
(2) Qlib is designed in a modularized way and supports creating research workflow by code just like building blocks.

View File

@@ -79,7 +79,6 @@ package = [
test = [
"yahooquery",
"baostock",
"akshare",
]
analysis = [
"plotly",

View File

@@ -427,10 +427,6 @@ class Indicator:
# NOTE ~(price_s < 1e-08) is different from price_s >= 1e-8
# ~(np.nan < 1e-8) -> ~(False) -> True
# if price_s is empty
if price_s.empty:
return None, None
assert isinstance(price_s, idd.SingleData)
if agg == "vwap":
volume_s = trade_exchange.get_volume(inst, trade_start_time, trade_end_time, method=None)

View File

@@ -326,10 +326,8 @@ class SBBStrategyEMA(SBBStrategyBase):
if instruments is None:
warnings.warn("`instruments` is not set, will load all stocks")
self.instruments = "all"
elif isinstance(instruments, str):
if isinstance(instruments, str):
self.instruments = D.instruments(instruments)
elif isinstance(instruments, List):
self.instruments = instruments
self.freq = freq
super(SBBStrategyEMA, self).__init__(
outer_trade_decision, level_infra, common_infra, trade_exchange=trade_exchange, **kwargs

View File

@@ -1,9 +1,9 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
This module is not a necessary part of Qlib.
They are just some tools for convenience
It is should not imported into the core part of qlib
This module is not a necessary part of Qlib.
They are just some tools for convenience
It is should not imported into the core part of qlib
"""
import torch
import numpy as np

View File

@@ -200,7 +200,7 @@ class Trainer:
if ckpt_path is not None:
_logger.info("Resuming states from %s", str(ckpt_path))
self.load_state_dict(torch.load(ckpt_path, weights_only=False))
self.load_state_dict(torch.load(ckpt_path))
else:
self.initialize()

View File

@@ -71,6 +71,6 @@ qlib.init(provider_uri=provider_uri, region=REG_CN)
## Use Crowd Sourced Data
There is also a [crowd sourced version of qlib data](data_collector/crowd_source/README.md): https://github.com/chenditc/investment_data/releases
```bash
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
```

View File

@@ -1,203 +0,0 @@
from loguru import logger
import os
from typing import Optional
import fire
import pandas as pd
import qlib
from tqdm import tqdm
from qlib.data import D
class DataHealthChecker:
    """Checks a dataset for data completeness and correctness. The data will be converted to a pd.DataFrame and checked for the following problems:

    - any of the columns ["open", "high", "low", "close", "volume"] are missing
    - any data is missing
    - any step change in the OHLCV columns is above a threshold (default: 0.5 for price, 3 for volume)
    - any factor is missing
    """

    # Columns every instrument frame is expected to carry.
    _OHLCV_COLUMNS = ("open", "high", "low", "close", "volume")
    # Instruments whose name contains one of these codes are skipped by the factor check
    # (presumably index instruments that carry no adjustment factor -- TODO confirm).
    _FACTOR_CHECK_SKIP_CODES = ("000300", "000903", "000905")

    def __init__(
        self,
        csv_path=None,
        qlib_dir=None,
        freq="day",
        large_step_threshold_price=0.5,
        large_step_threshold_volume=3,
        missing_data_num=0,
    ):
        """Load the dataset to check from exactly one of ``csv_path`` or ``qlib_dir``.

        Parameters
        ----------
        csv_path :
            Directory whose ``*.csv`` files are each loaded with ``pd.read_csv`` and
            stored under their file name.
        qlib_dir :
            Passed to ``qlib.init(provider_uri=...)``; every instrument of the "all"
            market is then loaded through ``D.features``.
        freq :
            Frequency forwarded to the qlib data API when loading from ``qlib_dir``.
        large_step_threshold_price :
            Absolute relative change above which a step in open/high/low/close is reported.
        large_step_threshold_volume :
            Absolute relative change above which a step in volume is reported.
        missing_data_num :
            An instrument is reported as having missing data when the null count of
            any of its columns exceeds this number.
        """
        assert csv_path or qlib_dir, "One of csv_path or qlib_dir should be provided."
        assert not (csv_path and qlib_dir), "Only one of csv_path or qlib_dir should be provided."

        self.data = {}
        self.problems = {}
        self.freq = freq
        self.large_step_threshold_price = large_step_threshold_price
        self.large_step_threshold_volume = large_step_threshold_volume
        self.missing_data_num = missing_data_num

        if csv_path:
            assert os.path.isdir(csv_path), f"{csv_path} should be a directory."
            files = [f for f in os.listdir(csv_path) if f.endswith(".csv")]
            for filename in tqdm(files, desc="Loading data"):
                df = pd.read_csv(os.path.join(csv_path, filename))
                self.data[filename] = df
        elif qlib_dir:
            qlib.init(provider_uri=qlib_dir)
            self.load_qlib_data()

    def load_qlib_data(self):
        """Load OHLCV and factor data of every instrument in the "all" market into ``self.data``.

        The ``$``-prefixed qlib field names are renamed to plain column names so that
        qlib-backed and CSV-backed frames can be checked by the same code.
        """
        instruments = D.instruments(market="all")
        instrument_list = D.list_instruments(instruments=instruments, as_list=True, freq=self.freq)
        required_fields = ["$open", "$close", "$low", "$high", "$volume", "$factor"]
        for instrument in instrument_list:
            df = D.features([instrument], required_fields, freq=self.freq)
            df.rename(
                columns={
                    "$open": "open",
                    "$close": "close",
                    "$low": "low",
                    "$high": "high",
                    "$volume": "volume",
                    "$factor": "factor",
                },
                inplace=True,
            )
            self.data[instrument] = df

    @staticmethod
    def _format_index_label(label) -> str:
        """Render an index label of a data frame as text for the report.

        Tuple labels (rows of a MultiIndex) are reduced to their last element;
        labels that offer ``strftime`` are formatted as ``YYYY-MM-DD``; anything
        else (e.g. the integer position of a CSV row) is passed through ``str``.
        """
        if isinstance(label, tuple):
            label = label[-1]
        if hasattr(label, "strftime"):
            return label.strftime("%Y-%m-%d")
        return str(label)

    def check_missing_data(self) -> Optional[pd.DataFrame]:
        """Check if any data is missing in the DataFrame.

        Returns
        -------
        A frame indexed by instrument holding the null count of each OHLCV column
        (NaN when the column itself is absent) for every instrument where at least
        one column has more than ``missing_data_num`` nulls; ``None`` when there is
        no such instrument.
        """
        result_dict = {"instruments": []}
        for col in self._OHLCV_COLUMNS:
            result_dict[col] = []

        for filename, df in self.data.items():
            # Count the nulls once per frame instead of once per reported column.
            null_counts = df.isnull().sum()
            if (null_counts > self.missing_data_num).any():
                result_dict["instruments"].append(filename)
                for col in self._OHLCV_COLUMNS:
                    # An absent column is reported by check_required_columns; it must
                    # not abort this check with a KeyError.
                    result_dict[col].append(null_counts.get(col, float("nan")))

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ There are no missing data.")
        return None

    def check_large_step_changes(self) -> Optional[pd.DataFrame]:
        """Check if there are any large step changes above the threshold in the OHLCV columns.

        Returns
        -------
        A frame indexed by instrument with one row per affected column, giving the
        column name, the label of the first offending row and the largest absolute
        relative change; ``None`` when no column exceeds its threshold.
        """
        result_dict = {
            "instruments": [],
            "col_name": [],
            "date": [],
            "pct_change": [],
        }
        for filename, df in self.data.items():
            for col in self._OHLCV_COLUMNS:
                if col not in df.columns:
                    continue
                pct_change = df[col].pct_change(fill_method=None).abs()
                threshold = self.large_step_threshold_volume if col == "volume" else self.large_step_threshold_price
                largest_step = pct_change.max()
                # A NaN maximum (empty or all-null column) compares False and is skipped.
                if largest_step > threshold:
                    large_steps = pct_change[pct_change > threshold]
                    result_dict["instruments"].append(filename)
                    result_dict["col_name"].append(col)
                    # qlib frames carry tuple labels, CSV frames a plain index; both are handled.
                    result_dict["date"].append(self._format_index_label(large_steps.index[0]))
                    result_dict["pct_change"].append(largest_step)

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ There are no large step changes in the OHLCV column above the threshold.")
        return None

    def check_required_columns(self) -> Optional[pd.DataFrame]:
        """Check if any of the required columns (OHLCV) are missing in the DataFrame.

        Returns
        -------
        A frame indexed by instrument with one row per missing column, or ``None``
        when every instrument has all required columns.
        """
        result_dict = {
            "instruments": [],
            "missing_col": [],
        }
        for filename, df in self.data.items():
            for column in self._OHLCV_COLUMNS:
                if column not in df.columns:
                    # One row per missing column keeps both lists the same length.
                    result_dict["instruments"].append(filename)
                    result_dict["missing_col"].append(column)

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ The columns (OLHCV) are complete and not missing.")
        return None

    def check_missing_factor(self) -> Optional[pd.DataFrame]:
        """Check if the 'factor' column is missing in the DataFrame.

        Returns
        -------
        A frame indexed by instrument with the flags ``missing_factor_col`` (the
        column does not exist) and ``missing_factor_data`` (no usable factor value),
        or ``None`` when every checked instrument has factor data.
        """
        result_dict = {
            "instruments": [],
            "missing_factor_col": [],
            "missing_factor_data": [],
        }
        for filename, df in self.data.items():
            if any(code in filename for code in self._FACTOR_CHECK_SKIP_CODES):
                continue
            if "factor" not in df.columns:
                # Without the column there is no factor data either.
                missing_col = True
            elif df["factor"].isnull().all():
                missing_col = False
            else:
                continue
            result_dict["instruments"].append(filename)
            result_dict["missing_factor_col"].append(missing_col)
            result_dict["missing_factor_data"].append(True)

        result_df = pd.DataFrame(result_dict).set_index("instruments")
        if not result_df.empty:
            return result_df
        logger.info("✅ The `factor` column already exists and is not empty.")
        return None

    def check_data(self):
        """Run every check and print a summary of the ones that found a problem."""
        # The checks run in this order; each entry pairs a result with its warning text.
        reports = [
            (self.check_missing_data(), "There is missing data."),
            (self.check_large_step_changes(), "The OHLCV column has large step changes."),
            (self.check_required_columns(), "Columns (OLHCV) are missing."),
            (self.check_missing_factor(), "The factor column does not exist or is empty"),
        ]
        if all(result is None for result, _ in reports):
            return

        print(f"\nSummary of data health check ({len(self.data)} files checked):")
        print("-------------------------------------------------")
        for result, message in reports:
            if isinstance(result, pd.DataFrame):
                logger.warning(message)
                print(result)
if __name__ == "__main__":
fire.Fire(DataHealthChecker)

View File

@@ -23,9 +23,7 @@ from data_collector.utils import get_calendar_list, get_trading_date_by_shift, d
from data_collector.utils import get_instruments
NEW_COMPANIES_URL = (
"https://oss-ch.csindex.com.cn/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
)
NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
INDEX_CHANGES_URL = "https://www.csindex.com.cn/csindex-home/search/search-content?lang=cn&searchInput=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC&pageNum={page_num}&pageSize={page_size}&sortField=date&dateRange=all&contentType=announcement"

View File

@@ -16,9 +16,9 @@ The packaged docker runtime is hosted on dockerhub: https://hub.docker.com/repos
## How to use it in qlib
### Option 1: Download release bin data
User can download data in qlib bin format and use it directly: https://github.com/chenditc/investment_data/releases/latest
User can download data in qlib bin format and use it directly: https://github.com/chenditc/investment_data/releases/tag/20220720
```bash
wget https://github.com/chenditc/investment_data/releases/latest/download/qlib_bin.tar.gz
wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
```

View File

@@ -13,7 +13,6 @@ import functools
from pathlib import Path
from typing import Iterable, Tuple, List
import akshare as ak
import numpy as np
import pandas as pd
from loguru import logger
@@ -203,9 +202,18 @@ def get_hs_stock_symbols() -> list:
-------
{600000.ss, 600001.ss, 600002.ss, 600003.ss, ...}
"""
stock_info_a_code_name_df = ak.stock_info_a_code_name()
stock_codes = stock_info_a_code_name_df["code"].tolist()
_symbols = [code for code in stock_codes if code and code.strip()]
url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12"
try:
resp = requests.get(url, timeout=None)
resp.raise_for_status()
except requests.exceptions.HTTPError as e:
raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e
try:
_symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]]
except Exception as e:
logger.warning("An error occurred while extracting data from the response.")
raise
if len(_symbols) < 3900:
raise ValueError("The complete list of stocks is not available.")

View File

@@ -50,6 +50,12 @@ pip install -r requirements.txt
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
# us 1d
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data --region us --interval 1d
# us 1min
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data_1min --region us --interval 1min
# in 1d
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/in_data --region in --interval 1d
# in 1min
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/in_data_1min --region in --interval 1min
```
### Collector *YahooFinance* data to qlib

View File

@@ -194,7 +194,7 @@ def test_trainer_checkpoint():
assert (output_dir / "002.pth").exists()
assert os.readlink(output_dir / "latest.pth") == str(output_dir / "002.pth")
trainer.load_state_dict(torch.load(output_dir / "001.pth", weights_only=False))
trainer.load_state_dict(torch.load(output_dir / "001.pth"))
assert trainer.current_iter == 1
assert trainer.current_episode == 100