Merge pull request #17 from you-n-g/main
add test, mod doc , refine data
@@ -150,3 +150,11 @@ Version 0.4.6
|
||||
- Some bugs are fixed
|
||||
- The default config in `Version 0.4.5` is not friendly to daily frequency data.
|
||||
- Backtest error in TopkWeightStrategy when `WithInteract=True`.
|
||||
|
||||
|
||||
Version 0.5.0
|
||||
--------------------
|
||||
- First open-source version
|
||||
- Refine the docs, code
|
||||
- Add baselines
|
||||
- public data crawler
|
||||
|
||||
38
README.md
@@ -12,13 +12,14 @@ With Qlib, you can easily try your ideas to create better Quant investment strat
|
||||
|
||||
For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative Investment Platform"](https://arxiv.org/abs/2009.11189).
|
||||
|
||||
|
||||
- [Framework of Qlib](#framework-of-qlib)
|
||||
- [Quick Start](#quick-start)
|
||||
- [Installation](#installation)
|
||||
- [Data Preparation](#data-preparation)
|
||||
- [Auto Quant Research Workflow](#auto-quant-research-workflow)
|
||||
- [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code)
|
||||
- [Quant Model Zoo](#quant-model-zoo)
|
||||
- [Quant Dataset Zoo](#quant-dataset-zoo)
|
||||
- [More About Qlib](#more-about-qlib)
|
||||
- [Offline Mode and Online Mode](#offline-mode-and-online-mode)
|
||||
- [Performance of Qlib Data Server](#performance-of-qlib-data-server)
|
||||
@@ -124,16 +125,17 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
|
||||
```bash
|
||||
|
||||
risk
|
||||
excess_return_without_cost mean 0.000605
|
||||
std 0.005481
|
||||
annualized_return 0.152373
|
||||
information_ratio 1.751319
|
||||
max_drawdown -0.059055
|
||||
excess_return_with_cost mean 0.000410
|
||||
std 0.005478
|
||||
annualized_return 0.103265
|
||||
information_ratio 1.187411
|
||||
max_drawdown -0.075024
|
||||
excess_return_without_cost mean 0.000675
|
||||
std 0.005456
|
||||
annualized_return 0.170077
|
||||
information_ratio 1.963824
|
||||
max_drawdown -0.063646
|
||||
excess_return_with_cost mean 0.000479
|
||||
std 0.005453
|
||||
annualized_return 0.120776
|
||||
information_ratio 1.395116
|
||||
max_drawdown -0.071216
|
||||
|
||||
|
||||
|
||||
```
|
||||
@@ -171,6 +173,20 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
|
||||
The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface to allow researchers to build their own workflow by code. [Here](examples/train_backtest_analyze.ipynb) is a demo of a customized Quant research workflow by code.
|
||||
|
||||
|
||||
# Quant Model Zoo
|
||||
|
||||
Here is a list of models built on `Qlib`.
|
||||
- [GBDT based on lightgbm](qlib/contrib/model/gbdt.py)
|
||||
- [MLP based on PyTorch](qlib/contrib/model/pytorch_nn.py)
|
||||
|
||||
Your PR of new Quant models is highly welcomed.
|
||||
|
||||
# Quant Dataset Zoo
|
||||
Datasets play a very important role in Quant. Here is a list of the datasets built on `Qlib`.
|
||||
- [Alpha360](./qlib/contrib/estimator/handler.py)
|
||||
- [QLibDataHandlerClose](./qlib/contrib/estimator/handler.py)
|
||||
|
||||
Your PR to build new Quant dataset is highly welcomed.
|
||||
|
||||
# More About Qlib
|
||||
The detailed documents are organized in [docs](docs/).
|
||||
|
||||
BIN
docs/_static/img/analysis/analysis_model_IC.png
vendored
|
Before Width: | Height: | Size: 47 KiB After Width: | Height: | Size: 40 KiB |
BIN
docs/_static/img/analysis/analysis_model_NDQ.png
vendored
|
Before Width: | Height: | Size: 27 KiB After Width: | Height: | Size: 24 KiB |
|
Before Width: | Height: | Size: 52 KiB After Width: | Height: | Size: 52 KiB |
|
Before Width: | Height: | Size: 69 KiB After Width: | Height: | Size: 66 KiB |
|
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 17 KiB |
|
Before Width: | Height: | Size: 21 KiB After Width: | Height: | Size: 18 KiB |
BIN
docs/_static/img/analysis/report.png
vendored
|
Before Width: | Height: | Size: 164 KiB After Width: | Height: | Size: 163 KiB |
|
Before Width: | Height: | Size: 50 KiB After Width: | Height: | Size: 53 KiB |
BIN
docs/_static/img/analysis/risk_analysis_bar.png
vendored
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 54 KiB After Width: | Height: | Size: 56 KiB |
|
Before Width: | Height: | Size: 57 KiB After Width: | Height: | Size: 57 KiB |
BIN
docs/_static/img/analysis/risk_analysis_std.png
vendored
|
Before Width: | Height: | Size: 46 KiB After Width: | Height: | Size: 47 KiB |
BIN
docs/_static/img/analysis/score_ic.png
vendored
|
Before Width: | Height: | Size: 103 KiB After Width: | Height: | Size: 105 KiB |
@@ -65,7 +65,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
|
||||
|
||||
.. note::
|
||||
|
||||
The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` includes open,close,high,low,volume,factor.
|
||||
The arguments of `--include_fields` should correspond with the column names of the CSV files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume and factor.
|
||||
|
||||
- `open`
|
||||
The opening price
|
||||
@@ -80,6 +80,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
|
||||
- `factor`
|
||||
The Restoration factor
|
||||
|
||||
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
|
||||
|
||||
China-Stock Mode & US-Stock Mode
|
||||
--------------------------------
|
||||
|
||||
@@ -120,7 +120,7 @@ _default_client_config = {
|
||||
_default_region_config = {
|
||||
REG_CN: {
|
||||
"trade_unit": 100,
|
||||
"limit_threshold": 0.1,
|
||||
"limit_threshold": 0.099,
|
||||
"deal_price": "vwap",
|
||||
},
|
||||
REG_US: {
|
||||
|
||||
@@ -149,7 +149,7 @@ class Exchange:
|
||||
self.quote = quote_df.to_dict("index")
|
||||
|
||||
def _update_limit(self, buy_limit, sell_limit):
|
||||
self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit)
|
||||
self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit, inclusive=False)
|
||||
|
||||
def check_stock_limit(self, stock_id, trade_date):
|
||||
"""Parameter
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import requests
|
||||
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
# Collect Data From Yahoo Finance
|
||||
|
||||
> *Please pay **ATTENTION**: the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
|
||||
|
||||
> **Examples of abnormal data**
|
||||
|
||||
- [SZ000661](https://finance.yahoo.com/quote/000661.SZ/history?period1=1558310400&period2=1590796800&interval=1d&filter=history&frequency=1d)
|
||||
- [SZ300144](https://finance.yahoo.com/quote/300144.SZ/history?period1=1557446400&period2=1589932800&interval=1d&filter=history&frequency=1d)
|
||||
|
||||
We have considered **STOCK PRICE ADJUSTMENT**, but some price series still seem very abnormal.
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
@@ -35,4 +45,4 @@ python collector.py manual_adj_data --normalize_dir ~/.qlib/stock_data/normalize
|
||||
|
||||
```bash
|
||||
python collector.py dump_data --normalize_dir ~/.qlib/stock_data/normalize_dir --qlib_dir ~/.qlib/stock_data/qlib_data
|
||||
```
|
||||
```
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
@@ -19,17 +20,20 @@ from dump_bin import DumpData
|
||||
from data_collector.utils import get_hs_calendar_list as get_calendar_list, get_hs_stock_symbols
|
||||
|
||||
CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101"
|
||||
MIN_NUMBERS_TRADING = 252 / 4
|
||||
|
||||
|
||||
class YahooCollector:
|
||||
def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=True, max_collector_count=3):
|
||||
def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=False, max_collector_count=5, delay=0):
|
||||
|
||||
self.save_dir = Path(save_dir).expanduser().resolve()
|
||||
self.save_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._delay = delay
|
||||
self._stock_list = None
|
||||
self.max_workers = max_workers
|
||||
self._asynchronous = asynchronous
|
||||
self._max_collector_count = max_collector_count
|
||||
self._mini_symbol_map = {}
|
||||
|
||||
@property
|
||||
def stock_list(self):
|
||||
@@ -37,6 +41,9 @@ class YahooCollector:
|
||||
self._stock_list = get_hs_stock_symbols()
|
||||
return self._stock_list
|
||||
|
||||
def _sleep(self):
|
||||
time.sleep(self._delay)
|
||||
|
||||
def save_stock(self, symbol, df: pd.DataFrame):
|
||||
"""save stock data to file
|
||||
|
||||
@@ -56,6 +63,15 @@ class YahooCollector:
|
||||
df["symbol"] = symbol
|
||||
df.to_csv(stock_path, index=False)
|
||||
|
||||
def _temp_save_small_data(self, symbol, df):
|
||||
if len(df) <= MIN_NUMBERS_TRADING:
|
||||
logger.warning(f"the number of trading days of {symbol} is less than {MIN_NUMBERS_TRADING}!")
|
||||
_temp = self._mini_symbol_map.setdefault(symbol, [])
|
||||
_temp.append(df.copy())
|
||||
else:
|
||||
if symbol in self._mini_symbol_map:
|
||||
self._mini_symbol_map.pop(symbol)
|
||||
|
||||
def _collector(self, stock_list):
|
||||
|
||||
error_symbol = []
|
||||
@@ -63,12 +79,14 @@ class YahooCollector:
|
||||
futures = {}
|
||||
p_bar = tqdm(total=len(stock_list))
|
||||
for symbols in [stock_list[i : i + self.max_workers] for i in range(0, len(stock_list), self.max_workers)]:
|
||||
self._sleep()
|
||||
resp = Ticker(symbols, asynchronous=self._asynchronous, max_workers=self.max_workers).history(
|
||||
period="max"
|
||||
)
|
||||
if isinstance(resp, dict):
|
||||
for symbol, df in resp.items():
|
||||
if isinstance(df, pd.DataFrame):
|
||||
self._temp_save_small_data(self, df)
|
||||
futures[
|
||||
worker.submit(
|
||||
self.save_stock, symbol, df.reset_index().rename(columns={"index": "date"})
|
||||
@@ -78,6 +96,7 @@ class YahooCollector:
|
||||
error_symbol.append(symbol)
|
||||
else:
|
||||
for symbol, df in resp.reset_index().groupby("symbol"):
|
||||
self._temp_save_small_data(self, df)
|
||||
futures[worker.submit(self.save_stock, symbol, df)] = symbol
|
||||
p_bar.update(self.max_workers)
|
||||
p_bar.close()
|
||||
@@ -93,6 +112,7 @@ class YahooCollector:
|
||||
print(error_symbol)
|
||||
logger.info(f"error symbol nums: {len(error_symbol)}")
|
||||
logger.info(f"current get symbol nums: {len(stock_list)}")
|
||||
error_symbol.extend(self._mini_symbol_map.keys())
|
||||
return error_symbol
|
||||
|
||||
def collector_data(self):
|
||||
@@ -107,7 +127,14 @@ class YahooCollector:
|
||||
logger.info(f"getting data: {i+1}")
|
||||
stock_list = self._collector(stock_list)
|
||||
logger.info(f"{i+1} finish.")
|
||||
for _symbol, _df_list in self._mini_symbol_map.items():
|
||||
self.save_stock(_symbol, max(_df_list, key=len))
|
||||
|
||||
logger.warning(f"less than {MIN_NUMBERS_TRADING} stock list: {list(self._mini_symbol_map.keys())}")
|
||||
|
||||
self.download_csi300_data()
|
||||
|
||||
def download_csi300_data(self):
|
||||
# TODO: from MSN
|
||||
logger.info(f"get bench data: csi300(SH000300)......")
|
||||
df = pd.DataFrame(map(lambda x: x.split(","), requests.get(CSI300_BENCH_URL).json()["data"]["klines"]))
|
||||
@@ -164,6 +191,7 @@ class Run:
|
||||
df = pd.read_csv(file_path)
|
||||
df.set_index("date", inplace=True)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
df = df[~df.index.duplicated(keep="first")]
|
||||
|
||||
# using China stock market data calendar
|
||||
df = df.reindex(pd.Index(get_calendar_list()))
|
||||
@@ -232,7 +260,7 @@ class Run:
|
||||
include_fields="close,open,high,low,volume,change,factor"
|
||||
)
|
||||
|
||||
def download_data(self):
|
||||
def download_data(self, asynchronous=False, max_collector_count=5, delay=0):
|
||||
"""download data from Internet
|
||||
|
||||
Examples
|
||||
@@ -240,7 +268,20 @@ class Run:
|
||||
$ python collector.py download_data --source_dir ~/.qlib/stock_data/source
|
||||
|
||||
"""
|
||||
YahooCollector(self.source_dir, max_workers=self.max_workers).collector_data()
|
||||
YahooCollector(
|
||||
self.source_dir,
|
||||
max_workers=self.max_workers,
|
||||
asynchronous=asynchronous,
|
||||
max_collector_count=max_collector_count,
|
||||
delay=delay,
|
||||
).collector_data()
|
||||
|
||||
def download_csi300_data(self):
|
||||
YahooCollector(self.source_dir).download_csi300_data()
|
||||
|
||||
def download_bench_data(self):
|
||||
"""download bench stock data(SH000300)
|
||||
"""
|
||||
|
||||
def collector_data(self):
|
||||
"""download -> normalize -> dump data
|
||||
|
||||
@@ -53,7 +53,7 @@ class GetData:
|
||||
for _file in tqdm(zp.namelist()):
|
||||
zp.extract(_file, str(target_dir.resolve()))
|
||||
|
||||
def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="v1"):
|
||||
def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="latest"):
|
||||
"""download cn qlib data from remote
|
||||
|
||||
Parameters
|
||||
@@ -61,7 +61,7 @@ class GetData:
|
||||
target_dir: str
|
||||
data save directory
|
||||
version: str
|
||||
data version, value from [v0, v1], by default v1
|
||||
data version, value from [v0, v1, ..., latest], by default latest
|
||||
|
||||
Examples
|
||||
---------
|
||||
|
||||
2
tests/dataset_tests/README.md
Normal file
@@ -0,0 +1,2 @@
|
||||
# About dataset tests
|
||||
Tests in this folder are for testing the prepared dataset from Yahoo
|
||||
42
tests/dataset_tests/test_dataset.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
from qlib.config import REG_CN
|
||||
import unittest
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TestDataset(unittest.TestCase):
    """Sanity checks on the prepared CN dataset loaded through qlib."""

    def setUp(self):
        """Initialise qlib against the local CN data directory before each test."""
        provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
        qlib.init(provider_uri=provider_uri, region=REG_CN)

    def testCSI300(self):
        """Check that the number of CSI300 rows per date stays in a plausible range."""
        # NOTE(review): assumes D.features returns a DataFrame that can be
        # grouped by a 'datetime' level -- confirm against qlib.data.
        close_p = D.features(D.instruments('csi300'), ['$close'])
        # size() counts every row per date and yields a Series, while
        # count() counts only non-NaN values per column and yields a DataFrame.
        size = close_p.groupby('datetime').size()
        cnt = close_p.groupby('datetime').count()
        size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))

        print(size_desc)
        print(cnt_desc)

        # size_desc is a Series, so .loc[...] is already a scalar; indexing it
        # again with [0] would raise IndexError.
        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
        # "Insufficient" must fail when there are too FEW stocks, hence >=.
        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")

        # cnt_desc is a DataFrame: .loc[...] gives a row, and .iloc[0] picks the
        # first (only requested) column explicitly by position.
        self.assertLessEqual(cnt_desc.loc["max"].iloc[0], 305, "Excessive number of CSI300 constituent stocks")
        self.assertEqual(cnt_desc.loc["80%"].iloc[0], 300, "Insufficient number of CSI300 constituent stocks")

    def testClose(self):
        """Check that the close-price ratio expression stays within sane bounds."""
        close_p = D.features(D.instruments('csi300'), ['Ref($close, 1)/$close - 1'])
        close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
        print(close_desc)
        # Each .loc[...] row holds one value per requested expression; only one
        # expression is requested, so take it by position.
        self.assertLessEqual(abs(close_desc.loc["80%"].iloc[0]), 0.1, "Close value is abnormal")
        self.assertLessEqual(abs(close_desc.loc["max"].iloc[0]), 0.2, "Close value is abnormal")
        self.assertGreaterEqual(close_desc.loc["min"].iloc[0], -0.2, "Close value is abnormal")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||