Merge pull request #17 from you-n-g/main

add test, mod doc , refine data
2026-07-04 11:30:57 +08:00 · 2020-09-27 15:44:38 +08:00
parent d02ccd0340 9eb3d3a416
commit 621b6058b9
24 changed files with 143 additions and 20 deletions
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -150,3 +150,11 @@ Version 0.4.6
 - Some bugs are fixed
    - The default config in `Version 0.4.5` is not friendly to daily frequency data.
    - Backtest error in TopkWeightStrategy when `WithInteract=True`.
+
+
+Version 0.5.0
+--------------------
+- First open-source version
+    - Refine the docs and code
+    - Add baselines
+    - Add public data crawler
--- a/README.md
+++ b/README.md
@@ -12,13 +12,14 @@ With Qlib, you can easily try your ideas to create better Quant investment strat

 For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative Investment Platform"](https://arxiv.org/abs/2009.11189).

-
 - [Framework of Qlib](#framework-of-qlib)
 - [Quick Start](#quick-start)
  - [Installation](#installation)
  - [Data Preparation](#data-preparation)
  - [Auto Quant Research Workflow](#auto-quant-research-workflow)
  - [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code)
+- [Quant Model Zoo](#quant-model-zoo)
+- [Quant Dataset Zoo](#quant-dataset-zoo)
 - [More About Qlib](#more-about-qlib)
 - [Offline Mode and Online Mode](#offline-mode-and-online-mode)
  - [Performance of Qlib Data Server](#performance-of-qlib-data-server)
@@ -124,16 +125,17 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
    ```bash

                                                      risk
-    excess_return_without_cost mean               0.000605
-                               std                0.005481
-                               annualized_return  0.152373
-                               information_ratio  1.751319
-                               max_drawdown      -0.059055
-    excess_return_with_cost    mean               0.000410
-                               std                0.005478
-                               annualized_return  0.103265
-                               information_ratio  1.187411
-                               max_drawdown      -0.075024
+    excess_return_without_cost mean               0.000675
+                               std                0.005456
+                               annualized_return  0.170077
+                               information_ratio  1.963824
+                               max_drawdown      -0.063646
+    excess_return_with_cost    mean               0.000479
+                               std                0.005453
+                               annualized_return  0.120776
+                               information_ratio  1.395116
+                               max_drawdown      -0.071216
+


    ```
@@ -171,6 +173,20 @@ Qlib provides a tool named `Estimator` to run the whole workflow automatically (
 The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface to allow researchers to build their own workflow by code. [Here](examples/train_backtest_analyze.ipynb) is a demo of a customized Quant research workflow built by code.


+# Quant Model Zoo
+
+Here is a list of models built on `Qlib`.
+- [GBDT based on LightGBM](qlib/contrib/model/gbdt.py)
+- [MLP based on PyTorch](qlib/contrib/model/pytorch_nn.py)
+
+Your PRs of new Quant models are highly welcome.
+
+# Quant Dataset Zoo
+Datasets play a very important role in Quant. Here is a list of the datasets built on `Qlib`.
+- [Alpha360](./qlib/contrib/estimator/handler.py)
+- [QLibDataHandlerClose](./qlib/contrib/estimator/handler.py)
+
+Your PRs to build new Quant datasets are highly welcome.

 # More About Qlib
 The detailed documents are organized in [docs](docs/).
--- a/docs/_static/img/analysis/analysis_model_IC.png
+++ b/docs/_static/img/analysis/analysis_model_IC.png
--- a/docs/_static/img/analysis/analysis_model_NDQ.png
+++ b/docs/_static/img/analysis/analysis_model_NDQ.png
--- a/docs/_static/img/analysis/analysis_model_auto_correlation.png
+++ b/docs/_static/img/analysis/analysis_model_auto_correlation.png
--- a/docs/_static/img/analysis/analysis_model_cumulative_return.png
+++ b/docs/_static/img/analysis/analysis_model_cumulative_return.png
--- a/docs/_static/img/analysis/analysis_model_long_short.png
+++ b/docs/_static/img/analysis/analysis_model_long_short.png
--- a/docs/_static/img/analysis/analysis_model_monthly_IC.png
+++ b/docs/_static/img/analysis/analysis_model_monthly_IC.png
--- a/docs/_static/img/analysis/report.png
+++ b/docs/_static/img/analysis/report.png
--- a/docs/_static/img/analysis/risk_analysis_annualized_return.png
+++ b/docs/_static/img/analysis/risk_analysis_annualized_return.png
--- a/docs/_static/img/analysis/risk_analysis_bar.png
+++ b/docs/_static/img/analysis/risk_analysis_bar.png
--- a/docs/_static/img/analysis/risk_analysis_information_ratio.png
+++ b/docs/_static/img/analysis/risk_analysis_information_ratio.png
--- a/docs/_static/img/analysis/risk_analysis_max_drawdown.png
+++ b/docs/_static/img/analysis/risk_analysis_max_drawdown.png
--- a/docs/_static/img/analysis/risk_analysis_std.png
+++ b/docs/_static/img/analysis/risk_analysis_std.png
--- a/docs/_static/img/analysis/score_ic.png
+++ b/docs/_static/img/analysis/score_ic.png
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -65,7 +65,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli

 .. note::

-    The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` includes open,close,high,low,volume,factor.
+    The arguments of `--include_fields` should correspond with the column names of CSV files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume and factor.
    
    - `open`
        The opening price
@@ -80,6 +80,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
    - `factor`
        The Restoration factor

+    In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended. 

 China-Stock Mode & US-Stock Mode
 --------------------------------
--- a/qlib/config.py
+++ b/qlib/config.py
@@ -120,7 +120,7 @@ _default_client_config = {
 _default_region_config = {
    REG_CN: {
        "trade_unit": 100,
-        "limit_threshold": 0.1,
+        "limit_threshold": 0.099,
        "deal_price": "vwap",
    },
    REG_US: {
--- a/qlib/contrib/backtest/exchange.py
+++ b/qlib/contrib/backtest/exchange.py
@@ -149,7 +149,7 @@ class Exchange:
        self.quote = quote_df.to_dict("index")

    def _update_limit(self, buy_limit, sell_limit):
-        self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit)
+        self.quote["limit"] = ~self.quote["$change"].between(-sell_limit, buy_limit, inclusive=False)

    def check_stock_limit(self, stock_id, trade_date):
        """Parameter
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -1,3 +1,6 @@
+#  Copyright (c) Microsoft Corporation.
+#  Licensed under the MIT License.
+
 import re
 import requests

--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -1,5 +1,15 @@
 # Collect Data From Yahoo Finance

+> *Please **NOTE** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format).*
+
+
+>  **Examples of abnormal data**
+
+- [SZ000661](https://finance.yahoo.com/quote/000661.SZ/history?period1=1558310400&period2=1590796800&interval=1d&filter=history&frequency=1d)
+- [SZ300144](https://finance.yahoo.com/quote/300144.SZ/history?period1=1557446400&period2=1589932800&interval=1d&filter=history&frequency=1d)
+
+We have considered **STOCK PRICE ADJUSTMENT**, but some price series still seem very abnormal.
+
 ## Requirements

 ```bash
@@ -35,4 +45,4 @@ python collector.py manual_adj_data --normalize_dir ~/.qlib/stock_data/normalize

 ```bash
 python collector.py dump_data --normalize_dir ~/.qlib/stock_data/normalize_dir --qlib_dir ~/.qlib/stock_data/qlib_data
-```
+```
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT License.

 import sys
+import time
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -19,17 +20,20 @@ from dump_bin import DumpData
 from data_collector.utils import get_hs_calendar_list as get_calendar_list, get_hs_stock_symbols

 CSI300_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.000300&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20220101"
+MIN_NUMBERS_TRADING = 252 / 4


 class YahooCollector:
-    def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=True, max_collector_count=3):
+    def __init__(self, save_dir: [str, Path], max_workers=4, asynchronous=False, max_collector_count=5, delay=0):

        self.save_dir = Path(save_dir).expanduser().resolve()
        self.save_dir.mkdir(parents=True, exist_ok=True)
+        self._delay = delay
        self._stock_list = None
        self.max_workers = max_workers
        self._asynchronous = asynchronous
        self._max_collector_count = max_collector_count
+        self._mini_symbol_map = {}

    @property
    def stock_list(self):
@@ -37,6 +41,9 @@ class YahooCollector:
            self._stock_list = get_hs_stock_symbols()
        return self._stock_list

+    def _sleep(self):
+        """Pause for ``self._delay`` seconds (the value is passed straight to ``time.sleep``)."""
+        time.sleep(self._delay)
+
    def save_stock(self, symbol, df: pd.DataFrame):
        """save stock data to file

@@ -56,6 +63,15 @@ class YahooCollector:
        df["symbol"] = symbol
        df.to_csv(stock_path, index=False)

+    def _temp_save_small_data(self, symbol, df):
+        """Remember symbols whose downloaded history is suspiciously short.
+
+        Parameters
+        ----------
+        symbol:
+            key under which the frame is tracked in ``self._mini_symbol_map``
+        df:
+            the frame just downloaded for ``symbol``; only ``len(df)`` and ``df.copy()`` are used
+
+        A frame with at most ``MIN_NUMBERS_TRADING`` rows is copied and appended to the
+        list kept for ``symbol``. Any longer frame drops ``symbol`` from the map.
+        """
+        if len(df) > MIN_NUMBERS_TRADING:
+            # enough rows this time: forget any short attempts recorded earlier
+            self._mini_symbol_map.pop(symbol, None)
+            return
+
+        logger.warning(f"the number of trading days of {symbol} is less than {MIN_NUMBERS_TRADING}!")
+        # keep a copy so later changes to `df` by the caller do not alter the stored attempt
+        self._mini_symbol_map.setdefault(symbol, []).append(df.copy())
+
    def _collector(self, stock_list):

        error_symbol = []
@@ -63,12 +79,14 @@ class YahooCollector:
            futures = {}
            p_bar = tqdm(total=len(stock_list))
            for symbols in [stock_list[i : i + self.max_workers] for i in range(0, len(stock_list), self.max_workers)]:
+                self._sleep()
                resp = Ticker(symbols, asynchronous=self._asynchronous, max_workers=self.max_workers).history(
                    period="max"
                )
                if isinstance(resp, dict):
                    for symbol, df in resp.items():
                        if isinstance(df, pd.DataFrame):
+                            self._temp_save_small_data(self, df)
                            futures[
                                worker.submit(
                                    self.save_stock, symbol, df.reset_index().rename(columns={"index": "date"})
@@ -78,6 +96,7 @@ class YahooCollector:
                            error_symbol.append(symbol)
                else:
                    for symbol, df in resp.reset_index().groupby("symbol"):
+                        self._temp_save_small_data(self, df)
                        futures[worker.submit(self.save_stock, symbol, df)] = symbol
                p_bar.update(self.max_workers)
            p_bar.close()
@@ -93,6 +112,7 @@ class YahooCollector:
        print(error_symbol)
        logger.info(f"error symbol nums: {len(error_symbol)}")
        logger.info(f"current get symbol nums: {len(stock_list)}")
+        error_symbol.extend(self._mini_symbol_map.keys())
        return error_symbol

    def collector_data(self):
@@ -107,7 +127,14 @@ class YahooCollector:
            logger.info(f"getting data: {i+1}")
            stock_list = self._collector(stock_list)
            logger.info(f"{i+1} finish.")
+        for _symbol, _df_list in self._mini_symbol_map.items():
+            self.save_stock(_symbol, max(_df_list, key=len))

+        logger.warning(f"less than {MIN_NUMBERS_TRADING} stock list: {list(self._mini_symbol_map.keys())}")
+        
+        self.download_csi300_data()
+
+    def download_csi300_data(self):
        # TODO: from MSN
        logger.info(f"get bench data: csi300(SH000300)......")
        df = pd.DataFrame(map(lambda x: x.split(","), requests.get(CSI300_BENCH_URL).json()["data"]["klines"]))
@@ -164,6 +191,7 @@ class Run:
            df = pd.read_csv(file_path)
            df.set_index("date", inplace=True)
            df.index = pd.to_datetime(df.index)
+            df = df[~df.index.duplicated(keep="first")]

            # using China stock market data calendar
            df = df.reindex(pd.Index(get_calendar_list()))
@@ -232,7 +260,7 @@ class Run:
            include_fields="close,open,high,low,volume,change,factor"
        )

-    def download_data(self):
+    def download_data(self, asynchronous=False, max_collector_count=5, delay=0):
        """download data from Internet

        Examples
@@ -240,7 +268,20 @@ class Run:
            $ python collector.py download_data --source_dir ~/.qlib/stock_data/source

        """
-        YahooCollector(self.source_dir, max_workers=self.max_workers).collector_data()
+        YahooCollector(
+            self.source_dir,
+            max_workers=self.max_workers,
+            asynchronous=asynchronous,
+            max_collector_count=max_collector_count,
+            delay=delay,
+        ).collector_data()
+
+    def download_csi300_data(self):
+        """download csi300 bench data
+
+        Builds a ``YahooCollector`` on ``self.source_dir`` with its default settings
+        and calls its ``download_csi300_data``.
+        """
+        collector = YahooCollector(self.source_dir)
+        collector.download_csi300_data()
+
+    def download_bench_data(self):
+        """download bench stock data(SH000300)
+
+        Alias of ``download_csi300_data``. Without the call below this method
+        consisted of its docstring only, so invoking it did nothing.
+        """
+        self.download_csi300_data()

    def collector_data(self):
        """download -> normalize -> dump data
--- a/scripts/get_data.py
+++ b/scripts/get_data.py
@@ -53,7 +53,7 @@ class GetData:
            for _file in tqdm(zp.namelist()):
                zp.extract(_file, str(target_dir.resolve()))

-    def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="v1"):
+    def qlib_data_cn(self, target_dir="~/.qlib/qlib_data/cn_data", version="latest"):
        """download cn qlib data from remote

        Parameters
@@ -61,7 +61,7 @@ class GetData:
        target_dir: str
            data save directory
        version: str
-            data version, value from [v0, v1], by default v1
+            data version, value from [v0, v1, ..., latest], by default latest

        Examples
        ---------
--- a/tests/dataset_tests/README.md
+++ b/tests/dataset_tests/README.md
@@ -0,0 +1,2 @@
+# About dataset tests
+Tests in this folder are for testing the prepared dataset from Yahoo
--- a/tests/dataset_tests/test_dataset.py
+++ b/tests/dataset_tests/test_dataset.py
@@ -0,0 +1,42 @@
+
+import qlib
+from qlib.data import D
+from qlib.config import REG_CN
+import unittest
+import numpy as np
+
+
+class TestDataset(unittest.TestCase):
+    """Sanity checks on the data loaded from the ``provider_uri`` set in ``setUp``."""
+
+    def setUp(self):
+        provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+        qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    def testCSI300(self):
+        """Check that the per-day number of csi300 rows stays in a plausible range."""
+        # NOTE(review): assumes D.features returns a DataFrame indexed by a level named 'datetime' - confirm
+        close_p = D.features(D.instruments('csi300'), ['$close'])
+        # size() counts rows per day and yields a Series;
+        # count() counts non-NaN values per column and yields a DataFrame.
+        size = close_p.groupby('datetime').size()
+        cnt = close_p.groupby('datetime').count()
+        size_desc = size.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+        cnt_desc = cnt.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+
+        print(size_desc)
+        print(cnt_desc)
+
+        # size_desc is a Series, so .loc[...] is already a scalar; indexing it again with [0] raises.
+        self.assertLessEqual(size_desc.loc["max"], 305, "Excessive number of CSI300 constituent stocks")
+        # "Insufficient" means too few rows, so this has to be a lower bound.
+        self.assertGreaterEqual(size_desc.loc["80%"], 290, "Insufficient number of CSI300 constituent stocks")
+
+        # cnt_desc is a DataFrame: .loc[...] is a row, and .iloc[0] takes its first column by position.
+        self.assertLessEqual(cnt_desc.loc["max"].iloc[0], 305, "Excessive number of CSI300 constituent stocks")
+        self.assertEqual(cnt_desc.loc["80%"].iloc[0], 300, "Insufficient number of CSI300 constituent stocks")
+
+    def testClose(self):
+        """Check that the day-over-day close ratio stays within loose bounds."""
+        close_p = D.features(D.instruments('csi300'), ['Ref($close, 1)/$close - 1'])
+        close_desc = close_p.describe(percentiles=np.arange(0.1, 0.9, 0.1))
+        print(close_desc)
+        self.assertLessEqual(abs(close_desc.loc["80%"].iloc[0]), 0.1, "Close value is abnormal")
+        self.assertLessEqual(abs(close_desc.loc["max"].iloc[0]), 0.2, "Close value is abnormal")
+        self.assertGreaterEqual(close_desc.loc["min"].iloc[0], -0.2, "Close value is abnormal")
+
+
+if __name__ == '__main__':
+    unittest.main()
+