mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
add function to automatically update daily frequency data
This commit is contained in:
22
README.md
22
README.md
@@ -159,6 +159,28 @@ Users could create the same dataset with it.
|
||||
*Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup), and the data might not be perfect.
|
||||
We recommend users to prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*.
|
||||
|
||||
### Automatic update of daily frequency data(from yahoo finance)
|
||||
> It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
|
||||
|
||||
> For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
|
||||
|
||||
* Automatic update of data to the "qlib" directory each trading day(Linux)
|
||||
* use *crontab*: `crontab -e`
|
||||
* set up timed tasks:
|
||||
|
||||
```
|
||||
0 18 * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
|
||||
```
|
||||
* **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
|
||||
|
||||
* Manual update of data
|
||||
```
|
||||
python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
|
||||
```
|
||||
* *trading_date*: start date of the range to update
|
||||
* *end_date*: end date of the range to update (not included)
|
||||
|
||||
|
||||
<!--
|
||||
- Run the initialization code and get stock data:
|
||||
|
||||
|
||||
@@ -67,6 +67,34 @@ After running the above command, users can find china-stock and us-stock data in
|
||||
|
||||
When ``Qlib`` is initialized with this dataset, users could build and evaluate their own models with it. Please refer to `Initialization <../start/initialization.html>`_ for more details.
|
||||
|
||||
Automatic update of daily frequency data
|
||||
----------------------------------------
|
||||
|
||||
**It is recommended that users update the data manually once (\-\-trading_date 2021-05-25) and then set it to update automatically.**
|
||||
|
||||
For more information refer to: `yahoo collector <https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance>`_
|
||||
|
||||
- Automatic update of data to the "qlib" directory each trading day(Linux)
|
||||
- use *crontab*: ``crontab -e``
|
||||
- set up timed tasks:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
0 18 * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
|
||||
|
||||
- **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
|
||||
|
||||
- Manual update of data
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
|
||||
|
||||
- *trading_date*: start date of the range to update
|
||||
- *end_date*: end date of the range to update (not included)
|
||||
|
||||
|
||||
|
||||
Converting CSV Format into Qlib Format
|
||||
-------------------------------------------
|
||||
|
||||
|
||||
@@ -295,7 +295,7 @@ def get_instruments(
|
||||
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
|
||||
|
||||
"""
|
||||
_cur_module = importlib.import_module("collector")
|
||||
_cur_module = importlib.import_module("data_collector.cn_index.collector")
|
||||
obj = getattr(_cur_module, f"{index_name.upper()}")(
|
||||
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
|
||||
@@ -271,7 +271,7 @@ def get_instruments(
|
||||
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
|
||||
|
||||
"""
|
||||
_cur_module = importlib.import_module("collector")
|
||||
_cur_module = importlib.import_module("data_collector.us_index.collector")
|
||||
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
|
||||
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
|
||||
@@ -1,3 +1,19 @@
|
||||
|
||||
- [Collector Data](#collector-data)
|
||||
- [Automatic update data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
|
||||
- [CN Data](#cn-data)
|
||||
- [1d from yahoo](#1d-from-yahoocn)
|
||||
- [1d from qlib](#1d-from-qlibcn)
|
||||
- [using data(1d)](#using-data1d-cn)
|
||||
- [1min from yahoo](#1min-from-yahoocn)
|
||||
- [1min from qlib](#1min-from-qlibcn)
|
||||
- [using data(1min)](#using-data1min-cn)
|
||||
- [US Data](#us-data)
|
||||
- [1d from yahoo](#1d-from-yahoous)
|
||||
- [1d from qlib](#1d-from-qlibus)
|
||||
- [using data(1d)](#using-data1d-us)
|
||||
|
||||
|
||||
# Collect Data From Yahoo Finance
|
||||
|
||||
> *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and the data might not be perfect. We recommend users to prepare their own data if they have high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
@@ -18,10 +34,37 @@ pip install -r requirements.txt
|
||||
|
||||
## Collector Data
|
||||
|
||||
### Automatic update of daily frequency data(from yahoo finance)
|
||||
> It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
|
||||
|
||||
* Automatic update of data to the "qlib" directory each trading day(Linux)
|
||||
* use *crontab*: `crontab -e`
|
||||
* set up timed tasks:
|
||||
|
||||
```
|
||||
0 18 * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
|
||||
```
|
||||
* **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
|
||||
|
||||
* Manual update of data
|
||||
```
|
||||
python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
|
||||
```
|
||||
* *trading_date*: start date of the range to update
|
||||
* *end_date*: end date of the range to update (not included)
|
||||
|
||||
* qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin parameters:
|
||||
* *source_dir*: The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
|
||||
* *normalize_dir*: Directory for normalize data, default "Path(__file__).parent/normalize"
|
||||
* *qlib_data_1d_dir*: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
|
||||
* *trading_date*: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
|
||||
* *end_date*: end datetime, default ``pd.Timestamp(trading_date) + pd.Timedelta(days=1)``; half-open interval (end excluded)
|
||||
* *region*: region, value from ["CN", "US"], default "CN"
|
||||
|
||||
|
||||
### CN Data
|
||||
|
||||
#### 1d from yahoo
|
||||
#### 1d from yahoo(CN)
|
||||
|
||||
```bash
|
||||
|
||||
@@ -37,12 +80,12 @@ python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qli
|
||||
|
||||
```
|
||||
|
||||
### 1d from qlib
|
||||
#### 1d from qlib(CN)
|
||||
```bash
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn
|
||||
```
|
||||
|
||||
### using data
|
||||
#### using data(1d CN)
|
||||
|
||||
```python
|
||||
import qlib
|
||||
@@ -52,7 +95,7 @@ qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn")
|
||||
df = D.features(D.instruments("all"), ["$close"], freq="day")
|
||||
```
|
||||
|
||||
#### 1min from yahoo
|
||||
#### 1min from yahoo(CN)
|
||||
|
||||
```bash
|
||||
|
||||
@@ -67,12 +110,12 @@ cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
|
||||
```
|
||||
|
||||
### 1min from qlib
|
||||
#### 1min from qlib(CN)
|
||||
```bash
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --interval 1min --region cn
|
||||
```
|
||||
|
||||
### using data
|
||||
#### using data(1min CN)
|
||||
|
||||
```python
|
||||
import qlib
|
||||
@@ -85,7 +128,7 @@ df = D.features(D.instruments("all"), ["$close"], freq="1min")
|
||||
|
||||
### US Data
|
||||
|
||||
#### 1d from yahoo
|
||||
#### 1d from yahoo(US)
|
||||
|
||||
```bash
|
||||
|
||||
@@ -100,13 +143,13 @@ cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
|
||||
```
|
||||
|
||||
#### 1d from qlib
|
||||
#### 1d from qlib(US)
|
||||
|
||||
```bash
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us
|
||||
```
|
||||
|
||||
### using data
|
||||
#### using data(1d US)
|
||||
|
||||
```python
|
||||
# using
|
||||
|
||||
@@ -9,7 +9,7 @@ import datetime
|
||||
import importlib
|
||||
from abc import ABC
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Type
|
||||
from typing import Iterable
|
||||
|
||||
import fire
|
||||
import requests
|
||||
@@ -18,11 +18,15 @@ import pandas as pd
|
||||
from loguru import logger
|
||||
from yahooquery import Ticker
|
||||
from dateutil.tz import tzlocal
|
||||
from qlib.utils import code_to_fname, fname_to_code
|
||||
|
||||
from qlib.tests.data import GetData
|
||||
from qlib.utils import code_to_fname, fname_to_code, exists_qlib_data
|
||||
from qlib.config import REG_CN as REGION_CN
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent))
|
||||
|
||||
from dump_bin import DumpDataUpdate
|
||||
from data_collector.base import BaseCollector, BaseNormalize, BaseRun, Normalize
|
||||
from data_collector.utils import (
|
||||
deco_retry,
|
||||
@@ -153,7 +157,10 @@ class YahooCollector(BaseCollector):
|
||||
|
||||
_result = None
|
||||
if interval == self.INTERVAL_1d:
|
||||
_result = _get_simple(start_datetime, end_datetime)
|
||||
try:
|
||||
_result = _get_simple(start_datetime, end_datetime)
|
||||
except ValueError as e:
|
||||
pass
|
||||
elif interval == self.INTERVAL_1min:
|
||||
_res = []
|
||||
_start = self.start_datetime
|
||||
@@ -184,7 +191,7 @@ class YahooCollector(BaseCollector):
|
||||
|
||||
class YahooCollectorCN(YahooCollector, ABC):
|
||||
def get_instrument_list(self):
    """Return the HS stock symbols, logging before the lookup and the count after it."""
    logger.info("get HS stock symbols......")
    hs_symbols = get_hs_stock_symbols()
    symbol_count = len(hs_symbols)
    logger.info(f"get {symbol_count} symbols.")
    return hs_symbols
|
||||
@@ -233,9 +240,9 @@ class YahooCollectorCN1d(YahooCollectorCN):
|
||||
|
||||
|
||||
class YahooCollectorCN1min(YahooCollectorCN):
|
||||
def download_index_data(self):
|
||||
# TODO: 1m
|
||||
logger.warning(f"{self.__class__.__name__} {self.interval} does not support: download_index_data")
|
||||
def get_instrument_list(self):
    """Return the parent collector's symbols plus three extra ``.ss`` tickers.

    The extra tickers are appended after the stock symbols returned by the
    parent class.
    """
    symbols = super(YahooCollectorCN1min, self).get_instrument_list()
    # NOTE(review): presumably the CSI300 / CSI500 / CSI100 index tickers - confirm.
    # The third code was previously the five-digit "00903.ss"; the sibling codes
    # are six digits, so it is corrected to "000903.ss".
    return symbols + ["000300.ss", "000905.ss", "000903.ss"]
|
||||
|
||||
|
||||
class YahooCollectorUS(YahooCollector, ABC):
|
||||
@@ -450,10 +457,12 @@ class YahooNormalize1dExtend(YahooNormalize1d):
|
||||
_max_date = df.index.max()
|
||||
df = df.reindex(self._calendar_list).loc[:_max_date].reset_index()
|
||||
df = df[df[self._date_field_name] > _last_date]
|
||||
if df.empty:
|
||||
return pd.DataFrame()
|
||||
_si = df["close"].first_valid_index()
|
||||
if _si > df.index[0]:
|
||||
logger.warning(
|
||||
f"{df.iloc[0][self._symbol_field_name]} missing data: {df.loc[:_si][self._date_field_name]}"
|
||||
f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si-1][self._date_field_name].to_list()}"
|
||||
)
|
||||
# normalize
|
||||
df = self.normalize_yahoo(
|
||||
@@ -661,7 +670,7 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
|
||||
|
||||
def symbol_to_yahoo(self, symbol):
    """Convert an exchange-prefixed symbol into ``<code>.<exchange>`` form.

    A symbol that already contains a "." is returned unchanged. Otherwise the
    first two characters are lower-cased and used as the exchange suffix
    ("sh" becomes "ss"), and the remaining characters become the code.
    """
    if "." in symbol:
        return symbol
    prefix = symbol[:2].lower()
    code = symbol[2:]
    suffix = "ss" if prefix == "sh" else prefix
    return code + "." + suffix
|
||||
@@ -864,7 +873,7 @@ class Run(BaseRun):
|
||||
yc.normalize()
|
||||
|
||||
def normalize_data_1min_cn_offline(
|
||||
self, qlib_data_1d_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
|
||||
self, qlib_data_1d_dir: str, date_field_name: str = "date", symbol_field_name: str = "symbol"
|
||||
):
|
||||
"""Normalised to 1min using local 1d data
|
||||
|
||||
@@ -942,6 +951,72 @@ class Run(BaseRun):
|
||||
limit_nums,
|
||||
)
|
||||
|
||||
def update_data_to_bin(self, qlib_data_1d_dir: str, trading_date: str = None, end_date: str = None):
    """Download Yahoo data for a date range and dump it into a qlib data directory.

    Parameters
    ----------
    qlib_data_1d_dir: str
        the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data;
        when no qlib data exists in this directory it is downloaded first
    trading_date: str
        start date to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
    end_date: str
        end date, by default ``pd.Timestamp(trading_date) + pd.Timedelta(days=1)``; half-open interval (end excluded)

    Notes
    -----
        If the data in qlib_data_dir is incomplete, np.nan will be populated to trading_date for the previous trading day

    Examples
    --------
        $ python collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
    """
    date_fmt = "%Y-%m-%d"

    # A non-1d interval only triggers a warning; the steps below still run.
    if self.interval.lower() != "1d":
        logger.warning("currently supports 1d data updates: --interval 1d")

    # Fill in the missing bounds of the date range.
    if trading_date is None:
        trading_date = datetime.datetime.now().strftime(date_fmt)
        logger.warning(f"trading_date is None, use the current date: {trading_date}")
    if end_date is None:
        next_day = pd.Timestamp(trading_date) + pd.Timedelta(days=1)
        end_date = next_day.strftime(date_fmt)

    # Download the qlib 1d data when the target directory holds none.
    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
    if not exists_qlib_data(qlib_data_1d_dir):
        GetData().qlib_data(target_dir=qlib_data_1d_dir, interval=self.interval, region=self.region)

    # Download the data from yahoo for the requested range.
    self.download_data(delay=1, start=trading_date, end=end_date, check_data_length=1)

    # Normalize the downloaded data, passing the existing qlib directory.
    self.normalize_data_1d_extend(str(qlib_data_1d_dir))

    # Dump the normalized files into the qlib directory.
    DumpDataUpdate(
        csv_path=self.normalize_dir,
        qlib_dir=qlib_data_1d_dir,
        exclude_fields="symbol,date",
        max_workers=self.max_workers,
    ).dump()

    # Parse the index components; only the regions listed here are supported.
    indexes_by_region = {
        "cn": ["CSI100", "CSI300"],
        "us": ["SP500", "NASDAQ100", "DJIA", "SP400"],
    }
    region_key = self.region.lower()
    if region_key not in indexes_by_region:
        logger.warning(f"Unsupported region: region={region_key}, component downloads will be ignored")
        return
    index_module = importlib.import_module(f"data_collector.{region_key}_index.collector")
    get_instruments = index_module.get_instruments
    for index_name in indexes_by_region[region_key]:
        get_instruments(str(qlib_data_1d_dir), index_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(Run)
|
||||
|
||||
Reference in New Issue
Block a user