diff --git a/README.md b/README.md index e78ffe751..84f17ccda 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Your feedbacks about the features are very important. | Planning-based portfolio optimization | Under review: https://github.com/microsoft/qlib/pull/280 | | Fund data supporting and analysis | Under review: https://github.com/microsoft/qlib/pull/292 | | Point-in-Time database | Under review: https://github.com/microsoft/qlib/pull/343 | -| High-frequency trading | Initial opensource version under development | +| High-frequency trading | Under review: https://github.com/microsoft/qlib/pull/408 | | Meta-Learning-based data selection | Initial opensource version under development | Recent released features diff --git a/docs/component/data.rst b/docs/component/data.rst index 26f44a076..3cee803e6 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -182,6 +182,11 @@ The `trade unit` defines the unit number of stocks can be used in a trade, and t qlib.init(provider_uri='~/.qlib/qlib_data/us_data', region=REG_US) +.. note:: + + PRs for new data sources are highly welcome! Users could commit the code to crawl data as a PR like `the examples here <https://github.com/microsoft/qlib/tree/main/scripts>`_. And then we will use the code to create a data cache on our server which other users could use directly. 
class MetaLogger(type):
    """Metaclass that grafts ``logging.Logger``'s class attributes onto a new class.

    Every entry of ``logging.Logger.__dict__`` that the class body does not
    define itself is copied into the new class, so anything declared in the
    class body takes precedence over the ``Logger`` version.

    ``__reduce__`` is never copied.
    NOTE(review): presumably this keeps the default pickling protocol for the
    wrapper class instead of ``Logger``'s own ``__reduce__`` -- confirm against
    the pickling behaviour expected of ``QlibLogger``.
    """

    def __new__(cls, name, bases, namespace):
        """Create the class from ``namespace`` completed with ``Logger`` attributes.

        Parameters
        ----------
        name : str
            name of the class being created.
        bases : tuple
            base classes of the class being created.
        namespace : dict
            attributes defined in the class body; it is left unmodified.
        """
        # Work on a copy so the mapping handed in by the caller is not polluted
        # with the Logger attributes.
        merged = dict(namespace)
        for key, value in logging.Logger.__dict__.items():
            if key != "__reduce__" and key not in merged:
                merged[key] = value
        return super().__new__(cls, name, bases, merged)
def set_global_logger_level(level: int):
    """Set the level of every handler attached to the ``"qlib"`` logger.

    Only the handlers are updated; the level of the logger object itself is
    left untouched. Nothing happens when no ``"qlib"`` logger is registered.

    Parameters
    ----------
    level : int
        logging level to apply to the handlers, e.g. ``logging.INFO``.
    """
    qlib_logger = logging.root.manager.loggerDict.get("qlib")
    # The manager stores a ``logging.PlaceHolder`` (which has no ``handlers``)
    # under "qlib" when only child loggers such as "qlib.data" were created.
    if not isinstance(qlib_logger, logging.Logger):
        return
    for handler in qlib_logger.handlers:
        # Use the public setter so the level is validated by ``logging``.
        handler.setLevel(level)
""" - run = self.start_exp(experiment_name, recorder_name, uri, resume) + run = self.start_exp( + experiment_id=experiment_id, + experiment_name=experiment_name, + recorder_id=recorder_id, + recorder_name=recorder_name, + uri=uri, + resume=resume, + ) try: yield run except Exception as e: @@ -65,7 +79,9 @@ class QlibRecorder: raise e self.end_exp(Recorder.STATUS_FI) - def start_exp(self, experiment_name=None, recorder_name=None, uri=None, resume=False): + def start_exp( + self, *, experiment_id=None, experiment_name=None, recorder_id=None, recorder_name=None, uri=None, resume=False + ): """ Lower level method for starting an experiment. When use this method, one should end the experiment manually and the status of the recorder may not be handled properly. Here is the example code: @@ -79,8 +95,12 @@ class QlibRecorder: Parameters ---------- + experiment_id : str + id of the experiment one wants to start. experiment_name : str the name of the experiment to be started + recorder_id : str + id of the recorder under the experiment one wants to start. recorder_name : str name of the recorder under the experiment one wants to start. uri : str @@ -93,7 +113,14 @@ class QlibRecorder: ------- An experiment instance being started. """ - return self.exp_manager.start_exp(experiment_name, recorder_name, uri, resume) + return self.exp_manager.start_exp( + experiment_id=experiment_id, + experiment_name=experiment_name, + recorder_id=recorder_id, + recorder_name=recorder_name, + uri=uri, + resume=resume, + ) def end_exp(self, recorder_status=Recorder.STATUS_FI): """ diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py index 7b3d1f507..467c7c3f4 100644 --- a/qlib/workflow/exp.py +++ b/qlib/workflow/exp.py @@ -39,12 +39,14 @@ class Experiment: output["recorders"] = list(recorders.keys()) return output - def start(self, recorder_name=None, resume=False): + def start(self, *, recorder_id=None, recorder_name=None, resume=False): """ Start the experiment and set it to be active. 
This method will also start a new recorder. Parameters ---------- + recorder_id : str + the id of the recorder to be created. recorder_name : str the name of the recorder to be created. resume : bool @@ -238,14 +240,14 @@ class MLflowExperiment(Experiment): def __repr__(self): return "{name}(id={id}, info={info})".format(name=self.__class__.__name__, id=self.id, info=self.info) - def start(self, recorder_name=None, resume=False): + def start(self, *, recorder_id=None, recorder_name=None, resume=False): logger.info(f"Experiment {self.id} starts running ...") # Get or create recorder if recorder_name is None: recorder_name = self._default_rec_name # resume the recorder if resume: - recorder, _ = self._get_or_create_rec(recorder_name=recorder_name) + recorder, _ = self._get_or_create_rec(recorder_id=recorder_id, recorder_name=recorder_name) # create a new recorder else: recorder = self.create_recorder(recorder_name) diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index 590790c9e..04cc3bcb7 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -33,7 +33,10 @@ class ExpManager: def start_exp( self, + *, + experiment_id: Optional[Text] = None, experiment_name: Optional[Text] = None, + recorder_id: Optional[Text] = None, recorder_name: Optional[Text] = None, uri: Optional[Text] = None, resume: bool = False, @@ -45,8 +48,12 @@ class ExpManager: Parameters ---------- + experiment_id : str + id of the active experiment. experiment_name : str name of the active experiment. + recorder_id : str + id of the recorder to be started. recorder_name : str name of the recorder to be started. 
uri : str @@ -298,7 +305,10 @@ class MLflowExpManager(ExpManager): def start_exp( self, + *, + experiment_id: Optional[Text] = None, experiment_name: Optional[Text] = None, + recorder_id: Optional[Text] = None, recorder_name: Optional[Text] = None, uri: Optional[Text] = None, resume: bool = False, @@ -308,11 +318,11 @@ class MLflowExpManager(ExpManager): # Create experiment if experiment_name is None: experiment_name = self._default_exp_name - experiment, _ = self._get_or_create_exp(experiment_name=experiment_name) + experiment, _ = self._get_or_create_exp(experiment_id=experiment_id, experiment_name=experiment_name) # Set up active experiment self.active_experiment = experiment # Start the experiment - self.active_experiment.start(recorder_name, resume) + self.active_experiment.start(recorder_id=recorder_id, recorder_name=recorder_name, resume=resume) return self.active_experiment diff --git a/scripts/README.md b/scripts/README.md index b4eac4998..ff7ba8015 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -15,7 +15,11 @@ ### Download CN Data ```bash +# daily data python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn + +# 1min data (Optional for running non-high-frequency strategies) +python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --region cn --interval 1min ``` ### Downlaod US Data diff --git a/scripts/data_collector/contrib/README.md b/scripts/data_collector/contrib/README.md new file mode 100644 index 000000000..011ff56e6 --- /dev/null +++ b/scripts/data_collector/contrib/README.md @@ -0,0 +1,24 @@ +# Get future trading days + +> `D.calendar(future=True)` will be used + +## Requirements + +```bash +pip install -r requirements.txt +``` + +## Collector Data + +```bash +# parse instruments, using in qlib/instruments. 
def generate_qlib_calendar(date_list: List[str], freq: str) -> List[str]:
    """Convert a list of daily trading dates into a calendar of the given frequency.

    Parameters
    ----------
    date_list : List[str]
        daily trading dates.
    freq : str
        value from ["day", "1min"].

    Returns
    -------
    List[str]
        ``date_list`` itself when ``freq`` is "day"; for "1min", the minute
        timestamps produced by ``generate_minutes_calendar_from_daily``
        formatted as "%Y-%m-%d %H:%M:%S".

    Raises
    ------
    ValueError
        if ``freq`` is neither "day" nor "1min".
    """
    if freq == "day":
        return date_list
    if freq == "1min":
        minutes = generate_minutes_calendar_from_daily(date_list, freq=freq).tolist()
        return [pd.Timestamp(ts).strftime("%Y-%m-%d %H:%M:%S") for ts in minutes]
    raise ValueError(f"Unsupported freq: {freq}")
def generate_minutes_calendar_from_daily(
    calendars: Iterable,
    freq: str = "1min",
    am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
    pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
) -> pd.Index:
    """generate minutes calendar

    For every day in ``calendars`` the timestamps of the AM and PM sessions
    are generated with ``pd.date_range`` (both endpoints included); the result
    is de-duplicated and sorted in ascending order.

    Parameters
    ----------
    calendars: Iterable
        daily calendar; each item must be accepted by ``pd.Timestamp``
    freq: str
        by default 1min
    am_range: Tuple[str, str]
        AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
    pm_range: Tuple[str, str]
        PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")

    Returns
    -------
    pd.Index
        sorted, unique timestamps; empty when ``calendars`` is empty
    """
    daily_format: str = "%Y-%m-%d"
    sessions = []
    for _day in calendars:
        # Format the day once and reuse it for both session boundaries.
        _day_str = pd.Timestamp(_day).strftime(daily_format)
        for _start, _end in (am_range, pm_range):
            sessions.append(pd.date_range(f"{_day_str} {_start}", f"{_day_str} {_end}", freq=freq))

    # np.hstack raises ValueError on an empty sequence, so an empty daily
    # calendar is answered directly with an empty index.
    if not sessions:
        return pd.DatetimeIndex([])

    return pd.Index(sorted(set(np.hstack(sessions))))
daily_format = self.DAILY_FORMAT - am_range = self.AM_RANGE - pm_range = self.PM_RANGE - for _day in calendars: - for _range in [am_range, pm_range]: - res.append( - pd.date_range( - f"{_day.strftime(daily_format)} {_range[0]}", - f"{_day.strftime(daily_format)} {_range[1]}", - freq="1min", - ) - ) - - return pd.Index(sorted(set(np.hstack(res)))) + return generate_minutes_calendar_from_daily( + calendars, freq="1min", am_range=self.AM_RANGE, pm_range=self.PM_RANGE + ) def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame: # TODO: using daily data factor