1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-02 02:21:18 +08:00

performance optimization for cal_sam_minute

This commit is contained in:
Young
2021-06-27 09:24:55 +00:00
committed by you-n-g
parent b41267fa59
commit 9b91758aed
5 changed files with 148 additions and 41 deletions

View File

@@ -15,6 +15,7 @@ import bisect
import logging
import importlib
import traceback
from typing import List, Union
import numpy as np
import pandas as pd
from multiprocessing import Pool
@@ -212,19 +213,22 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin):
self.backend = kwargs.get("backend", {})
@staticmethod
def instruments(market="all", filter_pipe=None):
def instruments(market: Union[List, str]="all", filter_pipe: Union[List, None]=None):
"""Get the general config dictionary for a base market adding several dynamic filters.
Parameters
----------
market : str
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500.
market : Union[List, str]
str:
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500.
list:
["ID1", "ID2"]. A list of stocks
filter_pipe : list
the list of dynamic filters.
Returns
----------
dict
dict: if isinstance(market, str)
dict of stockpool config.
{`market`=>base market name, `filter_pipe`=>list of filters}
@@ -242,7 +246,13 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin):
'name_rule_re': 'SH[0-9]{4}55',
'filter_start_time': None,
'filter_end_time': None}]}
list: if isinstance(market, list)
just return the original list directly.
NOTE: this will make the instruments compatible with more cases. The user code will be simpler.
"""
if isinstance(market, list):
return market
if filter_pipe is None:
filter_pipe = []
config = {"market": market, "filter_pipe": []}

View File

@@ -68,7 +68,7 @@ def get_module_logger(module_name, level: Optional[int] = None) -> logging.Logge
class TimeInspector:
timer_logger = get_module_logger("timer", level=logging.WARNING)
timer_logger = get_module_logger("timer", level=logging.INFO)
time_marks = []

View File

@@ -7,7 +7,7 @@ from typing import Tuple, List, Union, Optional, Callable
from . import lazy_sort_index
from ..config import C
from .time import Freq
from .time import Freq, cal_sam_minute
def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np.ndarray:
"""
@@ -36,38 +36,6 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np
# if freq_sam is xminute, divide each trading day into several bars evenly
if freq_sam == Freq.NORM_FREQ_MINUTE:
def cal_sam_minute(x, sam_minutes):
    """
    Floor a minute-level timestamp onto the `sam_minutes` sampling grid of its trading day.

    Every session boundary below is moved earlier by `C.min_data_shift` minutes:
    - morning session opens at 9:30 and closes at 11:29
    - afternoon session opens at 13:00 and closes at 14:59

    The minutes of both sessions are numbered consecutively (0-119 for the morning,
    120-239 for the afternoon), the number is floored to a multiple of `sam_minutes`
    and then converted back to a timestamp.

    Raises ValueError when `x` lies outside both sessions or the floored index
    falls outside 0-239.
    """
    midnight = pd.Timestamp(x.date())
    one_minute = pd.Timedelta(minutes=1)
    offset = C.min_data_shift * one_minute

    am_start = midnight + pd.Timedelta(hours=9, minutes=30) - offset
    am_end = midnight + pd.Timedelta(hours=11, minutes=29) - offset
    pm_start = midnight + pd.Timedelta(hours=13, minutes=00) - offset
    pm_end = midnight + pd.Timedelta(hours=14, minutes=59) - offset

    # position of `x` in the 240-minute trading day
    if am_start <= x <= am_end:
        slot = (x - am_start).seconds // 60
    elif pm_start <= x <= pm_end:
        slot = 120 + (x - pm_start).seconds // 60
    else:
        raise ValueError("datetime of calendar is out of range")

    # drop the remainder so that the slot is a multiple of `sam_minutes`
    slot -= slot % sam_minutes

    if 0 <= slot < 120:
        return am_start + slot * one_minute
    if 120 <= slot < 240:
        return pm_start + (slot - 120) * one_minute
    raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C")
if freq_raw != Freq.NORM_FREQ_MINUTE:
raise ValueError("when sampling minute calendar, freq of raw calendar must be minute or min")
else:

View File

@@ -4,24 +4,34 @@
Time related utils are compiled in this script
"""
import bisect
from datetime import time
from datetime import datetime, time
from typing import List, Tuple
import re
from numpy import append
import pandas as pd
from qlib.config import C
import functools
def get_min_cal() -> List[time]:
@functools.lru_cache(maxsize=240)
def get_min_cal(shift: int=0) -> List[time]:
"""
get the minute level calendar in day period
Parameters
----------
shift : int
the shift direction would be like pandas shift.
series.shift(1) will replace the value at `i`-th with the one at `i-1`-th
Returns
-------
List[time]:
"""
cal = []
for ts in list(pd.date_range("9:30", "11:29", freq="1min")) + list(pd.date_range("13:00", "14:59", freq="1min")):
for ts in list(pd.date_range("9:30", "11:29", freq="1min") - pd.Timedelta(minutes=shift)) +\
list(pd.date_range("13:00", "14:59", freq="1min") - pd.Timedelta(minutes=shift)):
cal.append(ts.time())
return cal
@@ -111,5 +121,35 @@ def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
return left_idx, right_idx
def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
    """
    Align a minute-level timestamp to a down-sampled intraday calendar.

    e.g. align 10:38 to 10:35 on a 5-minute calendar (10:30 on a 10-minute calendar)

    Parameters
    ----------
    x : pd.Timestamp
        datetime to be aligned
    sam_minutes : int
        align to the `sam_minutes` minute-level calendar

    Returns
    -------
    pd.Timestamp:
        the aligned datetime: the date of `x` combined with the latest sampled
        calendar slot that is not later than the time of `x`

    Raises
    ------
    ValueError
        if the time of `x` is earlier than the first slot of the calendar
    """
    # every `sam_minutes`-th entry of the minute calendar (shifted by `C.min_data_shift`)
    # is the start of a sampled bar
    cal = get_min_cal(C.min_data_shift)[::sam_minutes]
    idx = bisect.bisect_right(cal, x.time()) - 1
    if idx < 0:
        # without this guard `cal[-1]` would silently wrap around to the last slot of the day
        raise ValueError("datetime of calendar is out of range")
    return pd.Timestamp(datetime.combine(x.date(), cal[idx]))
if __name__ == "__main__":
print(get_day_min_idx_range("8:30", "14:59", "10min"))

89
tests/misc/test_utils.py Normal file
View File

@@ -0,0 +1,89 @@
from unittest.case import TestCase
import unittest
import pandas as pd
import numpy as np
from datetime import datetime
from qlib import init
from qlib.config import C
from qlib.log import TimeInspector
from qlib.utils.time import cal_sam_minute as cal_sam_minute_new, get_min_cal
def cal_sam_minute(x, sam_minutes):
    """
    Reference implementation: sample a raw minute timestamp down to `sam_minutes` frequency.

    All market times are moved earlier by `C.min_data_shift` minutes:
    - open time of stock market is 9:30
    - mid close time of stock market is 11:29
    - mid open time of stock market is 13:00
    - close time of stock market is 14:59

    Raises ValueError when `x` is outside both sessions or the floored minute
    index is outside 0-239.
    """
    # TODO: actually, this version is much faster when no cache or optimization
    day_start = pd.Timestamp(x.date())
    shift_delta = C.min_data_shift * pd.Timedelta(minutes=1)

    def _session_time(hours, minutes):
        # wall-clock time on the day of `x`, moved earlier by the configured shift
        return day_start + pd.Timedelta(hours=hours, minutes=minutes) - shift_delta

    am_open, am_close = _session_time(9, 30), _session_time(11, 29)
    pm_open, pm_close = _session_time(13, 0), _session_time(14, 59)

    if am_open <= x <= am_close:
        minute_no = (x - am_open).seconds // 60
    elif pm_open <= x <= pm_close:
        # afternoon minutes continue the numbering after the 120 morning minutes
        minute_no = (x - pm_open).seconds // 60 + 120
    else:
        raise ValueError("datetime of calendar is out of range")

    floored = (minute_no // sam_minutes) * sam_minutes
    if 0 <= floored < 120:
        return am_open + floored * pd.Timedelta(minutes=1)
    if 120 <= floored < 240:
        return pm_open + (floored - 120) * pd.Timedelta(minutes=1)
    raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C")
class TimeUtils(TestCase):
    """Tests for the time helpers imported from `qlib.utils.time`."""

    @classmethod
    def setUpClass(cls):
        # initialise qlib once for the whole test case
        init()

    def test_cal_sam_minute(self):
        """Compare the bisect-based `cal_sam_minute` with the reference version, then time both."""
        # test the correctness of the code
        random_n = 1000
        # draw the inputs from the calendar that both implementations use, i.e. the
        # one shifted by `C.min_data_shift`, so every generated timestamp is in range
        cal = get_min_cal(C.min_data_shift)

        def gen_args():
            """Yield `random_n` random (timestamp, sam_minutes) argument pairs."""
            for t in np.random.choice(cal, size=random_n, replace=True):
                sam_minutes = np.random.choice([1, 2, 3, 4, 5, 6])
                # only the date part of the first argument is used by `combine`
                dt = pd.Timestamp(datetime.combine(datetime(2021, 3, 3), t))
                yield dt, sam_minutes

        for args in gen_args():
            # assertEqual reports both values on failure and is not stripped under `python -O`
            self.assertEqual(
                cal_sam_minute(*args),
                cal_sam_minute_new(*args),
                msg=f"implementations disagree for args={args}",
            )

        # test the performance of the code
        args_l = list(gen_args())

        with TimeInspector.logt():
            for args in args_l:
                cal_sam_minute(*args)

        with TimeInspector.logt():
            for args in args_l:
                cal_sam_minute_new(*args)
if __name__ == "__main__":
unittest.main()