Merge branch 'microsoft:main' into stale

2026-07-01 10:01:19 +08:00 · 2021-05-09 17:59:46 +08:00
parent 143c257fa2 aa1f9b464b
commit 81bd2ca8fb
13 changed files with 236 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ Your feedbacks about the features are very important.
 | Planning-based portfolio optimization | Under review:  https://github.com/microsoft/qlib/pull/280 | 
 | Fund data supporting and analysis  |  Under review: https://github.com/microsoft/qlib/pull/292 |
 | Point-in-Time database | Under review: https://github.com/microsoft/qlib/pull/343 |
-| High-frequency trading | Initial opensource version under development | 
+| High-frequency trading | Under review: https://github.com/microsoft/qlib/pull/408 | 
 | Meta-Learning-based data selection | Initial opensource version under development |

 Recent released features
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -182,6 +182,11 @@ The `trade unit` defines the unit number of stocks can be used in a trade, and t
            qlib.init(provider_uri='~/.qlib/qlib_data/us_data', region=REG_US)
        

+.. note::
+
+    PRs for new data sources are highly welcome! Users can submit their data-crawling code as a PR, like `the examples here <https://github.com/microsoft/qlib/tree/main/scripts>`_. We will then use that code to create a data cache on our server, which other users can use directly.
+
+
 Data API
 ========================

--- a/qlib/data/data.py
+++ b/qlib/data/data.py
@@ -522,6 +522,9 @@ class LocalCalendarProvider(CalendarProvider):
            # if future calendar not exists, return current calendar
            if not os.path.exists(fname):
                get_module_logger("data").warning(f"{freq}_future.txt not exists, return current calendar!")
+                get_module_logger("data").warning(
+                    "You can get future calendar by referring to the following document: https://github.com/microsoft/qlib/blob/main/scripts/data_collector/contrib/README.md"
+                )
                fname = self._uri_cal.format(freq)
        else:
            fname = self._uri_cal.format(freq)
--- a/qlib/log.py
+++ b/qlib/log.py
@@ -15,9 +15,10 @@ from .config import C
 class MetaLogger(type):
    def __new__(cls, name, bases, dict):
        wrapper_dict = logging.Logger.__dict__.copy()
-        wrapper_dict.update(dict)
-        wrapper_dict["__doc__"] = logging.Logger.__doc__
-        return type.__new__(cls, name, bases, wrapper_dict)
+        for key, value in wrapper_dict.items():  # fill in Logger attributes the class body did not define
+            if key != "__reduce__" and key not in dict:  # Logger's __reduce__ is deliberately not copied
+                dict[key] = value
+        return type.__new__(cls, name, bases, dict)


 class QlibLogger(metaclass=MetaLogger):
@@ -39,6 +40,9 @@ class QlibLogger(metaclass=MetaLogger):
        self.level = level

    def __getattr__(self, name):
+        # Unpickling looks up `__setstate__` through __getattr__; fail fast here to avoid a maximum recursion error.
+        if name in {"__setstate__"}:
+            raise AttributeError
        return self.logger.__getattribute__(name)


@@ -159,3 +163,10 @@ class LogFilter(logging.Filter):
        elif isinstance(self.param, list):
            allow = not any([self.match_msg(p, record.msg) for p in self.param])
        return allow
+
+
+def set_global_logger_level(level: int):
+    """Set ``level`` on every handler attached to the "qlib" logger; do nothing if it has none."""
+    qlib_logger = logging.root.manager.loggerDict.get("qlib", None)
+    for _handler in getattr(qlib_logger, "handlers", ()):  # None / logging.PlaceHolder carry no handlers
+        _handler.setLevel(level)
--- a/qlib/workflow/init.py
+++ b/qlib/workflow/init.py
@@ -23,7 +23,10 @@ class QlibRecorder:
    @contextmanager
    def start(
        self,
+        *,
+        experiment_id: Optional[Text] = None,
        experiment_name: Optional[Text] = None,
+        recorder_id: Optional[Text] = None,
        recorder_name: Optional[Text] = None,
        uri: Optional[Text] = None,
        resume: bool = False,
@@ -45,8 +48,12 @@ class QlibRecorder:

        Parameters
        ----------
+        experiment_id : str
+            id of the experiment one wants to start.
        experiment_name : str
            name of the experiment one wants to start.
+        recorder_id : str
+            id of the recorder under the experiment one wants to start.
        recorder_name : str
            name of the recorder under the experiment one wants to start.
        uri : str
@@ -57,7 +64,14 @@ class QlibRecorder:
        resume : bool
            whether to resume the specific recorder with given name under the given experiment.
        """
-        run = self.start_exp(experiment_name, recorder_name, uri, resume)
+        run = self.start_exp(
+            experiment_id=experiment_id,
+            experiment_name=experiment_name,
+            recorder_id=recorder_id,
+            recorder_name=recorder_name,
+            uri=uri,
+            resume=resume,
+        )
        try:
            yield run
        except Exception as e:
@@ -65,7 +79,9 @@ class QlibRecorder:
            raise e
        self.end_exp(Recorder.STATUS_FI)

-    def start_exp(self, experiment_name=None, recorder_name=None, uri=None, resume=False):
+    def start_exp(
+        self, *, experiment_id=None, experiment_name=None, recorder_id=None, recorder_name=None, uri=None, resume=False
+    ):
        """
        Lower level method for starting an experiment. When use this method, one should end the experiment manually
        and the status of the recorder may not be handled properly. Here is the example code:
@@ -79,8 +95,12 @@ class QlibRecorder:

        Parameters
        ----------
+        experiment_id : str
+            id of the experiment one wants to start.
        experiment_name : str
            the name of the experiment to be started
+        recorder_id : str
+            id of the recorder under the experiment one wants to start.
        recorder_name : str
            name of the recorder under the experiment one wants to start.
        uri : str
@@ -93,7 +113,14 @@ class QlibRecorder:
        -------
        An experiment instance being started.
        """
-        return self.exp_manager.start_exp(experiment_name, recorder_name, uri, resume)
+        return self.exp_manager.start_exp(
+            experiment_id=experiment_id,
+            experiment_name=experiment_name,
+            recorder_id=recorder_id,
+            recorder_name=recorder_name,
+            uri=uri,
+            resume=resume,
+        )

    def end_exp(self, recorder_status=Recorder.STATUS_FI):
        """
--- a/qlib/workflow/exp.py
+++ b/qlib/workflow/exp.py
@@ -39,12 +39,14 @@ class Experiment:
        output["recorders"] = list(recorders.keys())
        return output

-    def start(self, recorder_name=None, resume=False):
+    def start(self, *, recorder_id=None, recorder_name=None, resume=False):
        """
        Start the experiment and set it to be active. This method will also start a new recorder.

        Parameters
        ----------
+        recorder_id : str
+            the id of the recorder to be created.
        recorder_name : str
            the name of the recorder to be created.
        resume : bool
@@ -238,14 +240,14 @@ class MLflowExperiment(Experiment):
    def __repr__(self):
        return "{name}(id={id}, info={info})".format(name=self.__class__.__name__, id=self.id, info=self.info)

-    def start(self, recorder_name=None, resume=False):
+    def start(self, *, recorder_id=None, recorder_name=None, resume=False):
        logger.info(f"Experiment {self.id} starts running ...")
        # Get or create recorder
        if recorder_name is None:
            recorder_name = self._default_rec_name
        # resume the recorder
        if resume:
-            recorder, _ = self._get_or_create_rec(recorder_name=recorder_name)
+            recorder, _ = self._get_or_create_rec(recorder_id=recorder_id, recorder_name=recorder_name)
        # create a new recorder
        else:
            recorder = self.create_recorder(recorder_name)
--- a/qlib/workflow/expm.py
+++ b/qlib/workflow/expm.py
@@ -33,7 +33,10 @@ class ExpManager:

    def start_exp(
        self,
+        *,
+        experiment_id: Optional[Text] = None,
        experiment_name: Optional[Text] = None,
+        recorder_id: Optional[Text] = None,
        recorder_name: Optional[Text] = None,
        uri: Optional[Text] = None,
        resume: bool = False,
@@ -45,8 +48,12 @@ class ExpManager:

        Parameters
        ----------
+        experiment_id : str
+            id of the active experiment.
        experiment_name : str
            name of the active experiment.
+        recorder_id : str
+            id of the recorder to be started.
        recorder_name : str
            name of the recorder to be started.
        uri : str
@@ -298,7 +305,10 @@ class MLflowExpManager(ExpManager):

    def start_exp(
        self,
+        *,
+        experiment_id: Optional[Text] = None,
        experiment_name: Optional[Text] = None,
+        recorder_id: Optional[Text] = None,
        recorder_name: Optional[Text] = None,
        uri: Optional[Text] = None,
        resume: bool = False,
@@ -308,11 +318,11 @@ class MLflowExpManager(ExpManager):
        # Create experiment
        if experiment_name is None:
            experiment_name = self._default_exp_name
-        experiment, _ = self._get_or_create_exp(experiment_name=experiment_name)
+        experiment, _ = self._get_or_create_exp(experiment_id=experiment_id, experiment_name=experiment_name)
        # Set up active experiment
        self.active_experiment = experiment
        # Start the experiment
-        self.active_experiment.start(recorder_name, resume)
+        self.active_experiment.start(recorder_id=recorder_id, recorder_name=recorder_name, resume=resume)

        return self.active_experiment

--- a/scripts/README.md
+++ b/scripts/README.md
@@ -15,7 +15,11 @@
 ### Download CN Data

 ```bash
+# daily data
 python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
+
+# 1min data (optional for running non-high-frequency strategies)
+python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --region cn --interval 1min
 ```

 ### Downlaod US Data
--- a/scripts/data_collector/contrib/README.md
+++ b/scripts/data_collector/contrib/README.md
@@ -0,0 +1,24 @@
+# Get future trading days
+
+> Generates the `{freq}_future.txt` calendar that `D.calendar(future=True)` reads.
+
+## Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+## Collect Data
+
+```bash
+# collect future trading dates and write them to <qlib_dir>/calendars/<freq>_future.txt
+python future_trading_date_collector.py --qlib_dir ~/.qlib/qlib_data/cn_data --freq day
+```
+
+## Parameters
+
+- qlib_dir: qlib data directory
+- freq: value from [`day`, `1min`], default `day`
+
+
+
--- a/scripts/data_collector/contrib/future_trading_date_collector.py
+++ b/scripts/data_collector/contrib/future_trading_date_collector.py
@@ -0,0 +1,87 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import sys
+from typing import List, Union
+from pathlib import Path
+
+import fire
+import numpy as np
+import pandas as pd
+from loguru import logger
+
+# get data from baostock
+import baostock as bs
+
+CUR_DIR = Path(__file__).resolve().parent
+sys.path.append(str(CUR_DIR.parent.parent))
+
+
+from data_collector.utils import generate_minutes_calendar_from_daily
+
+
+def read_calendar_from_qlib(qlib_dir: Path) -> pd.DataFrame:
+    """Load ``calendars/day.txt`` under ``qlib_dir``; return an empty DataFrame when the file is missing."""
+    day_file = qlib_dir / "calendars" / "day.txt"
+    # the file is read with header=None, so its first line is treated as data, not as column names
+    return pd.read_csv(day_file, header=None) if day_file.exists() else pd.DataFrame()
+
+
+def write_calendar_to_qlib(qlib_dir: Path, date_list: List[str], freq: str = "day"):
+    """Save ``date_list`` as text to ``calendars/{freq}_future.txt`` under ``qlib_dir`` and log the path."""
+    future_file = str(qlib_dir / "calendars" / f"{freq}_future.txt")
+    np.savetxt(future_file, date_list, fmt="%s", encoding="utf-8")
+    logger.info(f"write future calendars success: {future_file}")
+
+
+def generate_qlib_calendar(date_list: List[str], freq: str) -> List[str]:
+    """Return ``date_list`` unchanged for "day" or expanded to "%Y-%m-%d %H:%M:%S" minute strings for "1min"."""
+    if freq == "day":
+        return date_list
+    elif freq == "1min":
+        date_list = generate_minutes_calendar_from_daily(date_list, freq=freq).tolist()
+        return list(map(lambda x: pd.Timestamp(x).strftime("%Y-%m-%d %H:%M:%S"), date_list))
+    else:
+        raise ValueError(f"Unsupported freq: {freq}")
+
+
+def future_calendar_collector(qlib_dir: Union[str, Path], freq: str = "day"):
+    """Fetch trading dates from baostock and write ``calendars/{freq}_future.txt`` under ``qlib_dir``.
+
+    Parameters
+    ----------
+    qlib_dir: str or Path
+        qlib data directory
+    freq: str
+        value from ["day", "1min"], by default day
+    """
+    qlib_dir = Path(qlib_dir).expanduser().resolve()
+    if not qlib_dir.exists():
+        raise FileNotFoundError(str(qlib_dir))
+    lg = bs.login()
+    if lg.error_code != "0":
+        logger.error(f"login error: {lg.error_msg}")
+        return
+    try:  # always log out, even if the query or the write below raises
+        # start from the year of the last date in the existing daily calendar (current year if none)
+        daily_calendar = read_calendar_from_qlib(qlib_dir)
+        end_year = pd.Timestamp.now().year
+        start_year = end_year if daily_calendar.empty else pd.Timestamp(daily_calendar.iloc[-1, 0]).year
+        rs = bs.query_trade_dates(start_date=f"{start_year}-01-01", end_date=f"{end_year}-12-31")
+        data_list = []
+        while rs.error_code == "0" and rs.next():
+            _row_data = rs.get_row_data()
+            if int(_row_data[1]) == 1:  # second field is presumably baostock's is_trading_day flag
+                data_list.append(_row_data[0])
+        if rs.error_code != "0":  # do not write a partial/empty calendar when the query failed
+            logger.error(f"query trade dates error: {rs.error_msg}")
+            return
+        date_list = generate_qlib_calendar(sorted(data_list), freq=freq)
+        write_calendar_to_qlib(qlib_dir, date_list, freq=freq)
+    finally:
+        bs.logout()
+    logger.info(f"get trading dates success: {start_year}-01-01 to {end_year}-12-31")
+
+
+if __name__ == "__main__":
+    fire.Fire(future_calendar_collector)
--- a/scripts/data_collector/contrib/requirements.txt
+++ b/scripts/data_collector/contrib/requirements.txt
@@ -0,0 +1,5 @@
+baostock
+fire
+numpy
+pandas
+loguru
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -10,7 +10,9 @@ import random
 import requests
 import functools
 from pathlib import Path
+from typing import Iterable, Tuple

+import numpy as np
 import pandas as pd
 from lxml import etree
 from loguru import logger
@@ -418,5 +420,40 @@ def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, sh
    return res


+def generate_minutes_calendar_from_daily(
+    calendars: Iterable,
+    freq: str = "1min",
+    am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
+    pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
+) -> pd.Index:
+    """generate minutes calendar
+
+    Parameters
+    ----------
+    calendars: Iterable
+        daily calendar
+    freq: str
+        by default 1min
+    am_range: Tuple[str, str]
+        AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
+    pm_range: Tuple[str, str]
+        PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")
+
+    Returns
+    -------
+    pd.Index
+        sorted, de-duplicated timestamps covering both ranges of every day; empty if ``calendars`` is empty
+    """
+    daily_format: str = "%Y-%m-%d"
+    res = []
+    for _day in calendars:
+        _day_str = pd.Timestamp(_day).strftime(daily_format)
+        for _range in [am_range, pm_range]:
+            res.append(pd.date_range(f"{_day_str} {_range[0]}", f"{_day_str} {_range[1]}", freq=freq))
+    if not res:  # np.hstack raises ValueError on an empty list, so short-circuit an empty daily calendar
+        return pd.DatetimeIndex([])
+    return pd.Index(sorted(set(np.hstack(res))))
+
+
 if __name__ == "__main__":
    assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -24,7 +24,12 @@ from qlib.config import REG_CN as REGION_CN
 CUR_DIR = Path(__file__).resolve().parent
 sys.path.append(str(CUR_DIR.parent.parent))
 from data_collector.base import BaseCollector, BaseNormalize, BaseRun
-from data_collector.utils import get_calendar_list, get_hs_stock_symbols, get_us_stock_symbols
+from data_collector.utils import (
+    get_calendar_list,
+    get_hs_stock_symbols,
+    get_us_stock_symbols,
+    generate_minutes_calendar_from_daily,
+)

 INDEX_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.{index_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg={begin}&end={end}"

@@ -418,21 +423,9 @@ class YahooNormalize1min(YahooNormalize, ABC):
        return calendar_list_1d

    def generate_1min_from_daily(self, calendars: Iterable) -> pd.Index:
-        res = []
-        daily_format = self.DAILY_FORMAT
-        am_range = self.AM_RANGE
-        pm_range = self.PM_RANGE
-        for _day in calendars:
-            for _range in [am_range, pm_range]:
-                res.append(
-                    pd.date_range(
-                        f"{_day.strftime(daily_format)} {_range[0]}",
-                        f"{_day.strftime(daily_format)} {_range[1]}",
-                        freq="1min",
-                    )
-                )
-
-        return pd.Index(sorted(set(np.hstack(res))))
+        return generate_minutes_calendar_from_daily(
+            calendars, freq="1min", am_range=self.AM_RANGE, pm_range=self.PM_RANGE
+        )

    def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
        # TODO: using daily data factor