1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

add function to automatically update daily frequency data

This commit is contained in:
zhupr
2021-06-17 23:01:08 +08:00
parent a4f6e04199
commit b6c31540e8
6 changed files with 189 additions and 21 deletions

View File

@@ -295,7 +295,7 @@ def get_instruments(
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
"""
_cur_module = importlib.import_module("collector")
_cur_module = importlib.import_module("data_collector.cn_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
)

View File

@@ -271,7 +271,7 @@ def get_instruments(
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
"""
_cur_module = importlib.import_module("collector")
_cur_module = importlib.import_module("data_collector.us_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
)

View File

@@ -1,3 +1,19 @@
- [Collector Data](#collector-data)
- [Automatic update data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
- [CN Data](#cn-data)
- [1d from yahoo](#1d-from-yahoocn)
- [1d from qlib](#1d-from-qlibcn)
- [using data(1d)](#using-data1d-cn)
- [1min from yahoo](#1min-from-yahoocn)
- [1min from qlib](#1min-from-qlibcn)
- [using data(1min)](#using-data1min-cn)
- [US Data](#us-data)
- [1d from yahoo](#1d-from-yahoous)
- [1d from qlib](#1d-from-qlibus)
- [using data(1d)](#using-data1d-us)
# Collect Data From Yahoo Finance
> *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
@@ -18,10 +34,37 @@ pip install -r requirements.txt
## Collector Data
### Automatic update of daily frequency data(from yahoo finance)
> It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
* Automatic update of data to the "qlib" directory each trading day(Linux)
* use *crontab*: `crontab -e`
* set up timed tasks:
```
* * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
```
* **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
* Manual update of data
```
python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
```
* *trading_date*: start of trading day
* *end_date*: end of trading day(not included)
* qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin parameters:
* *source_dir*: The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
* *normalize_dir*: Directory for normalize data, default "Path(__file__).parent/normalize"
* *qlib_data_1d_dir*: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
* *trading_date*: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
* *end_date*: end datetime, default ``pd.Timestamp(trading_date) + pd.Timedelta(days=1)``; half-open interval (end excluded)
* *region*: region, value from ["CN", "US"], default "CN"
### CN Data
#### 1d from yahoo
#### 1d from yahoo(CN)
```bash
@@ -37,12 +80,12 @@ python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qli
```
### 1d from qlib
### 1d from qlib(CN)
```bash
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn
```
### using data
### using data(1d CN)
```python
import qlib
@@ -52,7 +95,7 @@ qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn")
df = D.features(D.instruments("all"), ["$close"], freq="day")
```
#### 1min from yahoo
#### 1min from yahoo(CN)
```bash
@@ -67,12 +110,12 @@ cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
```
### 1min from qlib
### 1min from qlib(CN)
```bash
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --interval 1min --region cn
```
### using data
### using data(1min CN)
```python
import qlib
@@ -85,7 +128,7 @@ df = D.features(D.instruments("all"), ["$close"], freq="1min")
### US Data
#### 1d from yahoo
#### 1d from yahoo(US)
```bash
@@ -100,13 +143,13 @@ cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
```
#### 1d from qlib
#### 1d from qlib(US)
```bash
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us
```
### using data
### using data(1d US)
```python
# using

View File

@@ -9,7 +9,7 @@ import datetime
import importlib
from abc import ABC
from pathlib import Path
from typing import Iterable, Type
from typing import Iterable
import fire
import requests
@@ -18,11 +18,15 @@ import pandas as pd
from loguru import logger
from yahooquery import Ticker
from dateutil.tz import tzlocal
from qlib.utils import code_to_fname, fname_to_code
from qlib.tests.data import GetData
from qlib.utils import code_to_fname, fname_to_code, exists_qlib_data
from qlib.config import REG_CN as REGION_CN
CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
from dump_bin import DumpDataUpdate
from data_collector.base import BaseCollector, BaseNormalize, BaseRun, Normalize
from data_collector.utils import (
deco_retry,
@@ -153,7 +157,10 @@ class YahooCollector(BaseCollector):
_result = None
if interval == self.INTERVAL_1d:
_result = _get_simple(start_datetime, end_datetime)
try:
_result = _get_simple(start_datetime, end_datetime)
except ValueError as e:
pass
elif interval == self.INTERVAL_1min:
_res = []
_start = self.start_datetime
@@ -184,7 +191,7 @@ class YahooCollector(BaseCollector):
class YahooCollectorCN(YahooCollector, ABC):
def get_instrument_list(self):
logger.info("get HS stock symbos......")
logger.info("get HS stock symbols......")
symbols = get_hs_stock_symbols()
logger.info(f"get {len(symbols)} symbols.")
return symbols
@@ -233,9 +240,9 @@ class YahooCollectorCN1d(YahooCollectorCN):
class YahooCollectorCN1min(YahooCollectorCN):
def download_index_data(self):
# TODO: 1m
logger.warning(f"{self.__class__.__name__} {self.interval} does not support: download_index_data")
def get_instrument_list(self):
    """Return the CN stock symbols plus the index symbols collected at 1min.

    The parent implementation supplies the stock symbols; three index
    symbols in Yahoo notation are appended so that index data is
    downloaded together with the stocks.

    Returns
    -------
    list
        stock symbols followed by the index symbols
    """
    symbols = super(YahooCollectorCN1min, self).get_instrument_list()
    # NOTE: the last code was previously "00903.ss" (five digits), which is
    # not a valid symbol; "000903.ss" is presumably the intended CSI 100 index.
    return symbols + ["000300.ss", "000905.ss", "000903.ss"]
class YahooCollectorUS(YahooCollector, ABC):
@@ -450,10 +457,12 @@ class YahooNormalize1dExtend(YahooNormalize1d):
_max_date = df.index.max()
df = df.reindex(self._calendar_list).loc[:_max_date].reset_index()
df = df[df[self._date_field_name] > _last_date]
if df.empty:
return pd.DataFrame()
_si = df["close"].first_valid_index()
if _si > df.index[0]:
logger.warning(
f"{df.iloc[0][self._symbol_field_name]} missing data: {df.loc[:_si][self._date_field_name]}"
f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si-1][self._date_field_name].to_list()}"
)
# normalize
df = self.normalize_yahoo(
@@ -661,7 +670,7 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
def symbol_to_yahoo(self, symbol):
if "." not in symbol:
_exchange = symbol[:2]
_exchange = symbol[:2].lower()
_exchange = "ss" if _exchange == "sh" else _exchange
symbol = symbol[2:] + "." + _exchange
return symbol
@@ -864,7 +873,7 @@ class Run(BaseRun):
yc.normalize()
def normalize_data_1min_cn_offline(
self, qlib_data_1d_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
self, qlib_data_1d_dir: str, date_field_name: str = "date", symbol_field_name: str = "symbol"
):
"""Normalised to 1min using local 1d data
@@ -942,6 +951,72 @@ class Run(BaseRun):
limit_nums,
)
def update_data_to_bin(self, qlib_data_1d_dir: str, trading_date: str = None, end_date: str = None):
    """Update an existing qlib 1d dataset with fresh yahoo data and dump it to bin

    Steps: make sure the qlib 1d data exists (download it otherwise), download
    the requested date range from yahoo, normalize it against the existing
    data, dump the result into ``qlib_data_1d_dir`` and finally refresh the
    index components for the region.

    Parameters
    ----------
    qlib_data_1d_dir: str
        the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
    trading_date: str
        trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
    end_date: str
        end datetime, default ``pd.Timestamp(trading_date) + pd.Timedelta(days=1)``; half-open interval (end excluded)

    Notes
    -----
        If the data in qlib_data_dir is incomplete, np.nan will be populated to trading_date for the previous trading day

        Only ``--interval 1d`` is supported: for any other interval a warning is
        logged and nothing is downloaded or written.

    Examples
    --------
        $ python collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
    """
    if self.interval.lower() != "1d":
        logger.warning("currently supports 1d data updates: --interval 1d")
        # The steps below normalize and dump *daily* data; running them with
        # another interval would write non-daily data into the 1d directory.
        return

    # start/end date
    if trading_date is None:
        trading_date = datetime.datetime.now().strftime("%Y-%m-%d")
        logger.warning(f"trading_date is None, use the current date: {trading_date}")

    if end_date is None:
        end_date = (pd.Timestamp(trading_date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    # download qlib 1d data when the target directory holds no qlib data yet
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
    if not exists_qlib_data(qlib_data_1d_dir):
        GetData().qlib_data(target_dir=qlib_data_1d_dir, interval=self.interval, region=self.region)

    # download data from yahoo
    self.download_data(delay=1, start=trading_date, end=end_date, check_data_length=1)

    # normalize data
    self.normalize_data_1d_extend(str(qlib_data_1d_dir))

    # dump bin
    _dump = DumpDataUpdate(
        csv_path=self.normalize_dir,
        qlib_dir=qlib_data_1d_dir,
        exclude_fields="symbol,date",
        max_workers=self.max_workers,
    )
    _dump.dump()

    # parse index
    _region = self.region.lower()
    if _region not in ["cn", "us"]:
        logger.warning(f"Unsupported region: region={_region}, component downloads will be ignored")
        return
    index_list = ["CSI100", "CSI300"] if _region == "cn" else ["SP500", "NASDAQ100", "DJIA", "SP400"]
    # the index collector module is chosen by region, hence the dynamic import
    get_instruments = getattr(
        importlib.import_module(f"data_collector.{_region}_index.collector"), "get_instruments"
    )
    for _index in index_list:
        get_instruments(str(qlib_data_1d_dir), _index)
if __name__ == "__main__":
fire.Fire(Run)