From bab50e883721afe658e1535eb78062b83a1d69a0 Mon Sep 17 00:00:00 2001 From: zhupr Date: Wed, 23 Jun 2021 16:13:26 +0800 Subject: [PATCH] fix YahooNormalize1min && update docs --- README.md | 2 +- examples/benchmarks/README.md | 4 + scripts/data_collector/yahoo/README.md | 250 ++++++++++++---------- scripts/data_collector/yahoo/collector.py | 16 +- 4 files changed, 149 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index 635b143f4..d60e3e2a7 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ We recommend users to prepare their own data if they have a high-quality dataset ### Automatic update of daily frequency data(from yahoo finance) > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically. - > For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#Automatic-update-of-daily-frequency-data) + > For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance) * Automatic update of data to the "qlib" directory each trading day(Linux) * use *crontab*: `crontab -e` diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index c3d965d85..133380fe0 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -4,6 +4,10 @@ Here are the results of each benchmark model running on Qlib's `Alpha360` and `A The numbers shown below demonstrate the performance of the entire `workflow` of each model. We will update the `workflow` as well as models in the near future for better results. +> If you need to reproduce the results below, please use the **v1** dataset: `python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn --version v1` +> +> In the new version of qlib, the default dataset is **v2**. 
Since the data is collected from the YahooFinance API (which is not very stable), the results of *v2* and *v1* may differ + ## Alpha360 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |---|---|---|---|---|---|---|---|---| diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md index 6fa4b1937..68124e132 100644 --- a/scripts/data_collector/yahoo/README.md +++ b/scripts/data_collector/yahoo/README.md @@ -1,17 +1,9 @@ - [Collector Data](#collector-data) - - [Automatic update data](#automatic-update-of-daily-frequency-data(from-yahoo-finance)) - - [CN Data](#CN-Data) - - [1d from yahoo](#1d-from-yahoocn) - - [1d from qlib](#1d-from-qlibcn) - - [using data(1d)](#using-data1d-cn) - - [1min from yahoo](#1min-from-yahoocn) - - [1min from qlib](#1min-from-qlibcn) - - [using data(1min)](#using-data1min-cn) - - [US Data](#CN-Data) - - [1d from yahoo](#1d-from-yahoous) - - [1d from qlib](#1d-from-qlibus) - - [using data(1d)](#using-data1d-us) + - [Get Qlib data](#get-qlib-databin-file) + - [Collector *YahooFinance* data to qlib](#collector-yahoofinance-data-to-qlib) + - [Automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance) +- [Using qlib data](#using-qlib-data) # Collect Data From Yahoo Finance @@ -34,6 +26,110 @@ pip install -r requirements.txt ## Collector Data +### Get Qlib data(`bin file`) + > `qlib-data` from *YahooFinance*, is the data that has been dumped and can be used directly in `qlib` + + - get data: `python scripts/get_data.py qlib_data` + - parameters: + - `target_dir`: save dir, by default *~/.qlib/qlib_data/cn_data* + - `version`: dataset version, value from [`v1`, `v2`], by default `v1` + - `v2` end date is *2021-06*, `v1` end date is *2020-09* + - user can append data to `v2`: [automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance) + - **the 
[benchmarks](https://github.com/microsoft/qlib/tree/main/examples/benchmarks) for qlib use `v1`**, *due to the unstable access to historical data by YahooFinance, there are some differences between `v2` and `v1`* + - `interval`: `1d` or `1min`, by default `1d` + - `region`: `cn` or `us`, by default `cn` + - `delete_old`: delete existing data from `target_dir`(*features, calendars, instruments, dataset_cache, features_cache*), value from [`True`, `False`], by default `True` + - `exists_skip`: if `target_dir` data already exists, skip `get_data`, value from [`True`, `False`], by default `False` + - examples: + ```bash + # cn 1d + python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn + # cn 1min + python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --region cn --interval 1min + # us 1d + python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us --interval 1d + # us 1min + python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1min --region us --interval 1min + ``` + +### Collector *YahooFinance* data to qlib +> collect *YahooFinance* data and *dump* it into `qlib` format + 1. 
download data to csv: `python scripts/data_collector/yahoo/collector.py download_data` + + - parameters: + - `source_dir`: directory in which the downloaded csv files are saved + - `interval`: `1d` or `1min`, by default `1d` + > **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`** + - `region`: `CN` or `US`, by default `CN` + - `delay`: `time.sleep(delay)`, by default *0.5* + - `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)* + - `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)* + - `max_workers`: number of symbols fetched concurrently, it is not recommended to change this parameter in order to maintain the integrity of the symbol data, by default *1* + - `check_data_length`: check the number of rows per *symbol*, by default `None` + > if `len(symbol_df) < check_data_length`, it will be re-fetched, with the number of re-fetches coming from the `max_collector_count` parameter + - `max_collector_count`: number of *"failed"* symbol retries, by default 2 + - examples: + ```bash + # cn 1d data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region CN + # cn 1min data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --delay 1 --interval 1min --region CN + # us 1d data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US + # us 1min data + python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1min --delay 1 --interval 1min --region US + ``` + 2. 
normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data` + + - parameters: + - `source_dir`: csv directory + - `normalize_dir`: result directory + - `max_workers`: number of concurrent workers, by default *1* + - `interval`: `1d` or `1min`, by default `1d` + > if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None` + - `region`: `CN` or `US`, by default `CN` + - `date_field_name`: column *name* identifying time in csv files, by default `date` + - `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol` + - `end_date`: if not `None`, normalize the last date saved (*including end_date*); if `None`, it will ignore this parameter; by default `None` + - `qlib_data_1d_dir`: qlib directory(1d data) + ``` + if interval==1min, qlib_data_1d_dir cannot be None, normalize 1min needs to use 1d data; + + qlib_data_1d can be obtained like this: + $ python scripts/get_data.py qlib_data --target_dir <qlib_data_1d_dir> --interval 1d + $ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01 + or: + download 1d data from YahooFinance + + ``` + - examples: + ```bash + # normalize 1d cn + python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d + # normalize 1min cn + python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/qlib_cn_1d --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min + ``` + 3. 
dump data: `python scripts/dump_bin.py dump_all` + + - parameters: + - `csv_path`: stock data path or directory, **normalize result(normalize_dir)** + - `qlib_dir`: qlib(dump) data directory + - `freq`: transaction frequency, by default `day` + > `freq_map = {1d:day, 1min: 1min}` + - `max_workers`: number of threads, by default *16* + - `include_fields`: dump fields, by default `""` + - `exclude_fields`: fields not dumped, by default `""` + > dump_fields = `include_fields if include_fields else set(symbol_df.columns) - set(exclude_fields) if exclude_fields else symbol_df.columns` + - `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol` + - `date_field_name`: column *name* identifying time in csv files, by default `date` + - examples: + ```bash + # dump 1d cn + python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1d --freq day --exclude_fields date,symbol + # dump 1min cn + python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,symbol + ``` + ### Automatic update of daily frequency data(from yahoo finance) > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically. 
@@ -62,112 +158,36 @@ pip install -r requirements.txt * *region*: region, value from ["CN", "US"], default "CN" -### CN Data +## Using qlib data -#### 1d from yahoo(CN) + ```python + import qlib + from qlib.data import D -```bash + # 1d data cn + # freq=day, freq default day + qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn") + df = D.features(D.instruments("all"), ["$close"], freq="day") -# download from yahoo finance -python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d + # 1min data cn + # freq=1min + qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1min", region="cn") + inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True) + # get 100 symbols + df = D.features(inst[:100], ["$close"], freq="1min") + # get all symbol data + # df = D.features(D.instruments("all"), ["$close"], freq="1min") -# normalize -python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d + # 1d data us + qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1d", region="us") + df = D.features(D.instruments("all"), ["$close"], freq="day") -# dump data -cd qlib/scripts -python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol + # 1min data us + qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1min", region="us") + inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True) + # get 100 symbols + df = D.features(inst[:100], ["$close"], freq="1min") + # get all symbol data + # df = D.features(D.instruments("all"), ["$close"], freq="1min") + ``` -``` - -### 1d from qlib(CN) -```bash -python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn -``` - -### using data(1d CN) - -```python -import qlib 
-from qlib.data import D - -qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn") -df = D.features(D.instruments("all"), ["$close"], freq="day") -``` - -#### 1min from yahoo(CN) - -```bash - -# download from yahoo finance -python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min - -# normalize -python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min - -# dump data -cd qlib/scripts -python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol -``` - -### 1min from qlib(CN) -```bash -python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --interval 1min --region cn -``` - -### using data(1min CN) - -```python -import qlib -from qlib.data import D - -qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1min", region="cn") -df = D.features(D.instruments("all"), ["$close"], freq="1min") - -``` - -### US Data - -#### 1d from yahoo(US) - -```bash - -# download from yahoo finance -python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d - -# normalize -python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d - -# dump data -cd qlib/scripts -python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol -``` - -#### 1d from qlib(US) - -```bash -python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us -``` - -### using data(1d US) - -```python -# using -import 
qlib -from qlib.data import D - -qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1d", region="us") -df = D.features(D.instruments("all"), ["$close"], freq="day") - -``` - - -### Help -```bash -pythono collector.py collector_data --help -``` - -## Parameters - -- interval: 1min or 1d -- region: CN or US diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index b474d3924..06545bda1 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -242,7 +242,10 @@ class YahooCollectorCN1d(YahooCollectorCN): class YahooCollectorCN1min(YahooCollectorCN): def get_instrument_list(self): symbols = super(YahooCollectorCN1min, self).get_instrument_list() - return symbols + ["000300.ss", "000905.ss", "00903.ss"] + return symbols + ["000300.ss", "000905.ss", "000903.ss"] + + def download_index_data(self): + pass class YahooCollectorUS(YahooCollector, ABC): @@ -461,7 +464,7 @@ class YahooNormalize1dExtend(YahooNormalize1d): _si = df["close"].first_valid_index() if _si > df.index[0]: logger.warning( - f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si-1][self._date_field_name].to_list()}" + f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si - 1][self._date_field_name].to_list()}" ) # normalize df = self.normalize_yahoo( @@ -524,7 +527,7 @@ class YahooNormalize1min(YahooNormalize, ABC): data_1d: pd.DataFrame = self.get_1d_data(symbol, _start, _end) data_1d = data_1d.copy() if data_1d is None or data_1d.empty: - df["factor"] = 1 / df.loc[df["close"].first_valid_index()] + df["factor"] = 1 / df.loc[df["close"].first_valid_index()]["close"] # TODO: np.nan or 1 or 0 df["paused"] = np.nan else: @@ -770,7 +773,7 @@ class Run(BaseRun): def download_data( self, max_collector_count=2, - delay=0, + delay=0.5, start=None, end=None, check_data_length=None, @@ -783,7 +786,7 @@ class Run(BaseRun): max_collector_count: int default 2 delay: float - time.sleep(delay), 
default 0 + time.sleep(delay), default 0.5 start: str start datetime, default "2000-01-01"; closed interval(including start) end: str @@ -844,9 +847,8 @@ class Run(BaseRun): """ if self.interval.lower() == "1min": if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists(): - # TODO: add reference url raise ValueError( - "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir , Reference: " + "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir , Reference: https://github.com/zhupr/qlib/tree/support_extend_data/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance" ) super(Run, self).normalize_data( date_field_name, symbol_field_name, end_date=end_date, qlib_data_1d_dir=qlib_data_1d_dir