1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

support collecting yahoo 1min data

This commit is contained in:
zhupr
2021-01-21 15:58:19 +08:00
committed by you-n-g
parent 36e5c601de
commit 1a8f1bfc57
4 changed files with 645 additions and 282 deletions

View File

@@ -136,7 +136,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
- `volume`
The adjusted trading volume
- `factor`
The Restoration factor. Normally, original_price = adj_price / factor
The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.

View File

@@ -5,6 +5,7 @@ import re
import time
import bisect
import pickle
import random
import requests
import functools
from pathlib import Path
@@ -17,6 +18,7 @@ from yahooquery import Ticker
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
CALENDAR_BENCH_URL_MAP = {
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
@@ -63,7 +65,29 @@ def get_calendar_list(bench_code="CSI300") -> list:
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
else:
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
if bench_code.upper() == "ALL":
@deco_retry
def _get_calendar(month):
_cal = []
try:
resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random)).json()
for _r in resp["data"]:
if int(_r["jybz"]):
_cal.append(pd.Timestamp(_r["jyrq"]))
except Exception as e:
raise ValueError(f"{month}-->{e}")
return _cal
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
calendar = []
for _m in month_range:
cal = _get_calendar(_m.strftime("%Y-%m"))
if cal:
calendar += cal
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
else:
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
_CALENDAR_MAP[bench_code] = calendar
logger.info(f"end of get calendar list: {bench_code}.")
return calendar

View File

@@ -18,23 +18,81 @@ pip install -r requirements.txt
## Collector Data
### Download data and Normalize data
```bash
python collector.py collector_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d --normalize_dir ~/.qlib/stock_data/normalize
```
### Download Data
### CN Data
#### 1d
```bash
python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1d", region="CN")
df = D.features(D.instruments("all"), ["$close"], freq="day")
```
### Normalize Data
#### 1min
```bash
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/stock_data/source/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_cn_1min", region="CN")
df = D.features(D.instruments("all"), ["$close"], freq="1min")
```
### US Data
#### 1d
```bash
# download from yahoo finance
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
# normalize
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
# using
import qlib
from qlib.data import D
qlib.init(provider_uri="~/.qlib/stock_data/source/qlib_us_1d", region="US")
df = D.features(D.instruments("all"), ["$close"], freq="day")
```
### Help
```bash
pythono collector.py collector_data --help
@@ -42,5 +100,5 @@ pythono collector.py collector_data --help
## Parameters
- interval: 1m or 1d
- interval: 1min or 1d
- region: CN or US

File diff suppressed because it is too large Load Diff