diff --git a/qlib/data/data.py b/qlib/data/data.py index 978fe6186..116861e78 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -15,6 +15,7 @@ import bisect import logging import importlib import traceback +from typing import List, Union import numpy as np import pandas as pd from multiprocessing import Pool @@ -212,19 +213,22 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin): self.backend = kwargs.get("backend", {}) @staticmethod - def instruments(market="all", filter_pipe=None): + def instruments(market: Union[List, str]="all", filter_pipe: Union[List, None]=None): """Get the general config dictionary for a base market adding several dynamic filters. Parameters ---------- - market : str - market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500. + market : Union[List, str] + str: + market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500. + list: + ["ID1", "ID2"]. A list of stocks filter_pipe : list the list of dynamic filters. Returns ---------- - dict + dict: if isinstance(market, str) dict of stockpool config. {`market`=>base market name, `filter_pipe`=>list of filters} @@ -242,7 +246,13 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin): 'name_rule_re': 'SH[0-9]{4}55', 'filter_start_time': None, 'filter_end_time': None}]} + + list: if isinstance(market, list) + just return the original list directly. + NOTE: this will make the instruments compatible with more cases. The user code will be simpler. 
""" + if isinstance(market, list): + return market if filter_pipe is None: filter_pipe = [] config = {"market": market, "filter_pipe": []} diff --git a/qlib/log.py b/qlib/log.py index 379544392..f0b04bcaa 100644 --- a/qlib/log.py +++ b/qlib/log.py @@ -68,7 +68,7 @@ def get_module_logger(module_name, level: Optional[int] = None) -> logging.Logge class TimeInspector: - timer_logger = get_module_logger("timer", level=logging.WARNING) + timer_logger = get_module_logger("timer", level=logging.INFO) time_marks = [] diff --git a/qlib/utils/resam.py b/qlib/utils/resam.py index ae0cdf9d1..76d97e1bc 100644 --- a/qlib/utils/resam.py +++ b/qlib/utils/resam.py @@ -7,7 +7,7 @@ from typing import Tuple, List, Union, Optional, Callable from . import lazy_sort_index from ..config import C -from .time import Freq +from .time import Freq, cal_sam_minute def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np.ndarray: """ @@ -36,38 +36,6 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np # if freq_sam is xminute, divide each trading day into several bars evenly if freq_sam == Freq.NORM_FREQ_MINUTE: - - def cal_sam_minute(x, sam_minutes): - """ - Sample raw calendar into calendar with sam_minutes freq, shift represents the shift minute the market time - - open time of stock market is [9:30 - shift*pd.Timedelta(minutes=1)] - - mid close time of stock market is [11:29 - shift*pd.Timedelta(minutes=1)] - - mid open time of stock market is [13:00 - shift*pd.Timedelta(minutes=1)] - - close time of stock market is [14:59 - shift*pd.Timedelta(minutes=1)] - """ - day_time = pd.Timestamp(x.date()) - shift = C.min_data_shift - - open_time = day_time + pd.Timedelta(hours=9, minutes=30) - shift * pd.Timedelta(minutes=1) - mid_close_time = day_time + pd.Timedelta(hours=11, minutes=29) - shift * pd.Timedelta(minutes=1) - mid_open_time = day_time + pd.Timedelta(hours=13, minutes=00) - shift * pd.Timedelta(minutes=1) - close_time = day_time + 
pd.Timedelta(hours=14, minutes=59) - shift * pd.Timedelta(minutes=1) - - if open_time <= x <= mid_close_time: - minute_index = (x - open_time).seconds // 60 - elif mid_open_time <= x <= close_time: - minute_index = (x - mid_open_time).seconds // 60 + 120 - else: - raise ValueError("datetime of calendar is out of range") - minute_index = minute_index // sam_minutes * sam_minutes - - if 0 <= minute_index < 120: - return open_time + minute_index * pd.Timedelta(minutes=1) - elif 120 <= minute_index < 240: - return mid_open_time + (minute_index - 120) * pd.Timedelta(minutes=1) - else: - raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C") - if freq_raw != Freq.NORM_FREQ_MINUTE: raise ValueError("when sampling minute calendar, freq of raw calendar must be minute or min") else: diff --git a/qlib/utils/time.py b/qlib/utils/time.py index 6e3bd71a3..fb37fd0a4 100644 --- a/qlib/utils/time.py +++ b/qlib/utils/time.py @@ -4,24 +4,34 @@ Time related utils are compiled in this script """ import bisect -from datetime import time +from datetime import datetime, time from typing import List, Tuple import re from numpy import append import pandas as pd +from qlib.config import C +import functools -def get_min_cal() -> List[time]: +@functools.lru_cache(maxsize=240) +def get_min_cal(shift: int=0) -> List[time]: """ get the minute level calendar in day period + Parameters + ---------- + shift : int + the shift direction would be like pandas shift. 
+ series.shift(1) will replace the value at `i`-th with the one at `i-1`-th + Returns ------- List[time]: """ cal = [] - for ts in list(pd.date_range("9:30", "11:29", freq="1min")) + list(pd.date_range("13:00", "14:59", freq="1min")): + for ts in list(pd.date_range("9:30", "11:29", freq="1min") - pd.Timedelta(minutes=shift)) +\ + list(pd.date_range("13:00", "14:59", freq="1min") - pd.Timedelta(minutes=shift)): cal.append(ts.time()) return cal @@ -111,5 +121,35 @@ def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]: return left_idx, right_idx +def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp: + """ + align the minute-level data to a down sampled calendar + + e.g. align 10:38 to 10:35 in 5 minute-level(10:30 in 10 minute-level) + + Parameters + ---------- + x : pd.Timestamp + datetime to be aligned + sam_minutes : int + align to `sam_minutes` minute-level calendar + + Returns + ------- + pd.Timestamp: + the datetime after aligned + """ + cal = get_min_cal(C.min_data_shift)[::sam_minutes] + idx = bisect.bisect_right(cal, x.time()) - 1 + date, new_time = x.date(), cal[idx] + return pd.Timestamp( + datetime(date.year, + month=date.month, + day=date.day, + hour=new_time.hour, + minute=new_time.minute, + second=new_time.second, + microsecond=new_time.microsecond)) + if __name__ == "__main__": print(get_day_min_idx_range("8:30", "14:59", "10min")) diff --git a/tests/misc/test_utils.py b/tests/misc/test_utils.py new file mode 100644 index 000000000..4dabf5ed8 --- /dev/null +++ b/tests/misc/test_utils.py @@ -0,0 +1,89 @@ +from unittest.case import TestCase +import unittest +import pandas as pd +import numpy as np +from datetime import datetime +from qlib import init +from qlib.config import C +from qlib.log import TimeInspector +from qlib.utils.time import cal_sam_minute as cal_sam_minute_new, get_min_cal + + +def cal_sam_minute(x, sam_minutes): + """ + Sample raw calendar into calendar with sam_minutes freq, shift represents 
the shift minute the market time + - open time of stock market is [9:30 - shift*pd.Timedelta(minutes=1)] + - mid close time of stock market is [11:29 - shift*pd.Timedelta(minutes=1)] + - mid open time of stock market is [13:00 - shift*pd.Timedelta(minutes=1)] + - close time of stock market is [14:59 - shift*pd.Timedelta(minutes=1)] + """ + # TODO: actually, this version is much faster when no cache or optimization + day_time = pd.Timestamp(x.date()) + shift = C.min_data_shift + + open_time = day_time + pd.Timedelta(hours=9, minutes=30) - shift * pd.Timedelta(minutes=1) + mid_close_time = day_time + pd.Timedelta(hours=11, minutes=29) - shift * pd.Timedelta(minutes=1) + mid_open_time = day_time + pd.Timedelta(hours=13, minutes=00) - shift * pd.Timedelta(minutes=1) + close_time = day_time + pd.Timedelta(hours=14, minutes=59) - shift * pd.Timedelta(minutes=1) + + if open_time <= x <= mid_close_time: + minute_index = (x - open_time).seconds // 60 + elif mid_open_time <= x <= close_time: + minute_index = (x - mid_open_time).seconds // 60 + 120 + else: + raise ValueError("datetime of calendar is out of range") + minute_index = minute_index // sam_minutes * sam_minutes + + if 0 <= minute_index < 120: + return open_time + minute_index * pd.Timedelta(minutes=1) + elif 120 <= minute_index < 240: + return mid_open_time + (minute_index - 120) * pd.Timedelta(minutes=1) + else: + raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C") + + +class TimeUtils(TestCase): + @classmethod + def setUpClass(cls): + init() + + def test_cal_sam_minute(self): + # test the correctness of the code + random_n = 1000 + cal = get_min_cal() + + def gen_args(): + for time in np.random.choice(cal, size=random_n, replace=True): + sam_minutes = np.random.choice([1, 2, 3, 4, 5, 6]) + dt = pd.Timestamp( + datetime( + 2021, + month=3, + day=3, + hour=time.hour, + minute=time.minute, + second=time.second, + microsecond=time.microsecond, + ) + ) + args = dt, sam_minutes + 
yield args + + for args in gen_args(): + assert cal_sam_minute(*args) == cal_sam_minute_new(*args) + + # test the performance of the code + + args_l = list(gen_args()) + + with TimeInspector.logt(): + for args in args_l: + cal_sam_minute(*args) + + with TimeInspector.logt(): + for args in args_l: + cal_sam_minute_new(*args) + + +if __name__ == "__main__": + unittest.main()