mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-02 02:21:18 +08:00
performance optimization for cal_sam_minute
This commit is contained in:
@@ -15,6 +15,7 @@ import bisect
|
||||
import logging
|
||||
import importlib
|
||||
import traceback
|
||||
from typing import List, Union
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from multiprocessing import Pool
|
||||
@@ -212,19 +213,22 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin):
|
||||
self.backend = kwargs.get("backend", {})
|
||||
|
||||
@staticmethod
|
||||
def instruments(market="all", filter_pipe=None):
|
||||
def instruments(market: Union[List, str]="all", filter_pipe: Union[List, None]=None):
|
||||
"""Get the general config dictionary for a base market adding several dynamic filters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
market : str
|
||||
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500.
|
||||
market : Union[List, str]
|
||||
str:
|
||||
market/industry/index shortname, e.g. all/sse/szse/sse50/csi300/csi500.
|
||||
list:
|
||||
["ID1", "ID2"]. A list of stocks
|
||||
filter_pipe : list
|
||||
the list of dynamic filters.
|
||||
|
||||
Returns
|
||||
----------
|
||||
dict
|
||||
dict: if isinstance(market, str)
|
||||
dict of stockpool config.
|
||||
{`market`=>base market name, `filter_pipe`=>list of filters}
|
||||
|
||||
@@ -242,7 +246,13 @@ class InstrumentProvider(abc.ABC, ProviderBackendMixin):
|
||||
'name_rule_re': 'SH[0-9]{4}55',
|
||||
'filter_start_time': None,
|
||||
'filter_end_time': None}]}
|
||||
|
||||
list: if isinstance(market, list)
|
||||
just return the original list directly.
|
||||
NOTE: this will make the instruments compatible with more cases. The user code will be simpler.
|
||||
"""
|
||||
if isinstance(market, list):
|
||||
return market
|
||||
if filter_pipe is None:
|
||||
filter_pipe = []
|
||||
config = {"market": market, "filter_pipe": []}
|
||||
|
||||
@@ -68,7 +68,7 @@ def get_module_logger(module_name, level: Optional[int] = None) -> logging.Logge
|
||||
|
||||
class TimeInspector:
|
||||
|
||||
timer_logger = get_module_logger("timer", level=logging.WARNING)
|
||||
timer_logger = get_module_logger("timer", level=logging.INFO)
|
||||
|
||||
time_marks = []
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from typing import Tuple, List, Union, Optional, Callable
|
||||
|
||||
from . import lazy_sort_index
|
||||
from ..config import C
|
||||
from .time import Freq
|
||||
from .time import Freq, cal_sam_minute
|
||||
|
||||
def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np.ndarray:
|
||||
"""
|
||||
@@ -36,38 +36,6 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: str, freq_sam: str) -> np
|
||||
|
||||
# if freq_sam is xminute, divide each trading day into several bars evenly
|
||||
if freq_sam == Freq.NORM_FREQ_MINUTE:
|
||||
|
||||
def cal_sam_minute(x, sam_minutes):
|
||||
"""
|
||||
Sample raw calendar into calendar with sam_minutes freq, shift represents the shift minute the market time
|
||||
- open time of stock market is [9:30 - shift*pd.Timedelta(minutes=1)]
|
||||
- mid close time of stock market is [11:29 - shift*pd.Timedelta(minutes=1)]
|
||||
- mid open time of stock market is [13:00 - shift*pd.Timedelta(minutes=1)]
|
||||
- close time of stock market is [14:59 - shift*pd.Timedelta(minutes=1)]
|
||||
"""
|
||||
day_time = pd.Timestamp(x.date())
|
||||
shift = C.min_data_shift
|
||||
|
||||
open_time = day_time + pd.Timedelta(hours=9, minutes=30) - shift * pd.Timedelta(minutes=1)
|
||||
mid_close_time = day_time + pd.Timedelta(hours=11, minutes=29) - shift * pd.Timedelta(minutes=1)
|
||||
mid_open_time = day_time + pd.Timedelta(hours=13, minutes=00) - shift * pd.Timedelta(minutes=1)
|
||||
close_time = day_time + pd.Timedelta(hours=14, minutes=59) - shift * pd.Timedelta(minutes=1)
|
||||
|
||||
if open_time <= x <= mid_close_time:
|
||||
minute_index = (x - open_time).seconds // 60
|
||||
elif mid_open_time <= x <= close_time:
|
||||
minute_index = (x - mid_open_time).seconds // 60 + 120
|
||||
else:
|
||||
raise ValueError("datetime of calendar is out of range")
|
||||
minute_index = minute_index // sam_minutes * sam_minutes
|
||||
|
||||
if 0 <= minute_index < 120:
|
||||
return open_time + minute_index * pd.Timedelta(minutes=1)
|
||||
elif 120 <= minute_index < 240:
|
||||
return mid_open_time + (minute_index - 120) * pd.Timedelta(minutes=1)
|
||||
else:
|
||||
raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C")
|
||||
|
||||
if freq_raw != Freq.NORM_FREQ_MINUTE:
|
||||
raise ValueError("when sampling minute calendar, freq of raw calendar must be minute or min")
|
||||
else:
|
||||
|
||||
@@ -4,24 +4,34 @@
|
||||
Time related utils are compiled in this script
|
||||
"""
|
||||
import bisect
|
||||
from datetime import time
|
||||
from datetime import datetime, time
|
||||
from typing import List, Tuple
|
||||
import re
|
||||
from numpy import append
|
||||
import pandas as pd
|
||||
from qlib.config import C
|
||||
import functools
|
||||
|
||||
|
||||
def get_min_cal() -> List[time]:
|
||||
@functools.lru_cache(maxsize=240)
def _get_min_cal_cached(shift: int = 0) -> Tuple[time, ...]:
    """Build and memoize the immutable minute-level intraday calendar for `shift`."""
    offset = pd.Timedelta(minutes=shift)
    sessions = (
        pd.date_range("9:30", "11:29", freq="1min"),
        pd.date_range("13:00", "14:59", freq="1min"),
    )
    return tuple(ts.time() for session in sessions for ts in session - offset)


def get_min_cal(shift: int = 0) -> List[time]:
    """
    get the minute level calendar in day period

    Parameters
    ----------
    shift : int
        the shift direction would be like pandas shift.
        series.shift(1) will replace the value at `i`-th with the one at `i-1`-th

    Returns
    -------
    List[time]:
        the 240 minute bars of a trading day (9:30-11:29 and 13:00-14:59),
        each moved `shift` minutes earlier.
        A new list is returned on every call, so callers may modify it freely
        without affecting the memoized calendar.
    """
    # The expensive pandas work is cached as a tuple; handing out the cached
    # object itself would let one caller's in-place edit leak into every
    # later call.
    return list(_get_min_cal_cached(shift))
|
||||
|
||||
@@ -111,5 +121,35 @@ def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
|
||||
return left_idx, right_idx
|
||||
|
||||
|
||||
def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
    """
    align the minute-level data to a down sampled calendar

    e.g. align 10:38 to 10:35 in 5 minute-level(10:30 in 10 minute-level)

    Parameters
    ----------
    x : pd.Timestamp
        datetime to be aligned
    sam_minutes : int
        align to `sam_minutes` minute-level calendar

    Returns
    -------
    pd.Timestamp:
        the aligned datetime: the latest bar of the down sampled calendar
        that is not later than `x`, on the same date as `x`.
        A time that falls after a bar but before the next one (including
        times after the last bar) is aligned to the preceding bar.

    Raises
    ------
    ValueError
        if `sam_minutes` is not positive, or if `x` is earlier than the first
        bar of the calendar.
    """
    if sam_minutes <= 0:
        # a negative step would reverse the calendar and break the sorted
        # order that bisect relies on
        raise ValueError("sam_minutes must be a positive integer")

    cal = get_min_cal(C.min_data_shift)[::sam_minutes]
    idx = bisect.bisect_right(cal, x.time()) - 1
    if idx < 0:
        # without this guard cal[-1] would silently align a pre-open
        # timestamp to the last bar of the day
        raise ValueError("datetime of calendar is out of range")

    return pd.Timestamp(datetime.combine(x.date(), cal[idx]))
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(get_day_min_idx_range("8:30", "14:59", "10min"))
|
||||
|
||||
89
tests/misc/test_utils.py
Normal file
89
tests/misc/test_utils.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from unittest.case import TestCase
|
||||
import unittest
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
from qlib import init
|
||||
from qlib.config import C
|
||||
from qlib.log import TimeInspector
|
||||
from qlib.utils.time import cal_sam_minute as cal_sam_minute_new, get_min_cal
|
||||
|
||||
|
||||
def cal_sam_minute(x, sam_minutes):
    """
    Reference implementation: sample the raw calendar into a calendar with `sam_minutes` freq.

    `shift` is the number of minutes the market time is shifted by:
    - open time of stock market is [9:30 - shift*pd.Timedelta(minutes=1)]
    - mid close time of stock market is [11:29 - shift*pd.Timedelta(minutes=1)]
    - mid open time of stock market is [13:00 - shift*pd.Timedelta(minutes=1)]
    - close time of stock market is [14:59 - shift*pd.Timedelta(minutes=1)]
    """
    # TODO: actually, this version is much faster when no cache or optimization
    one_minute = pd.Timedelta(minutes=1)
    midnight = pd.Timestamp(x.date())
    offset = C.min_data_shift * one_minute

    # boundaries of the two trading sessions on the day of `x`
    open_time = midnight + pd.Timedelta(hours=9, minutes=30) - offset
    mid_close_time = midnight + pd.Timedelta(hours=11, minutes=29) - offset
    mid_open_time = midnight + pd.Timedelta(hours=13, minutes=00) - offset
    close_time = midnight + pd.Timedelta(hours=14, minutes=59) - offset

    # position of `x` among the 240 trading minutes (afternoon starts at 120)
    if open_time <= x <= mid_close_time:
        position = (x - open_time).seconds // 60
    elif mid_open_time <= x <= close_time:
        position = (x - mid_open_time).seconds // 60 + 120
    else:
        raise ValueError("datetime of calendar is out of range")

    # floor to the start of the enclosing `sam_minutes` bucket
    position = position // sam_minutes * sam_minutes

    if not 0 <= position < 240:
        raise ValueError("calendar minute_index error, check `min_data_shift` in qlib.config.C")
    if position < 120:
        return open_time + position * one_minute
    return mid_open_time + (position - 120) * one_minute
|
||||
|
||||
|
||||
class TimeUtils(TestCase):
    """Checks the optimized `cal_sam_minute` against the reference implementation in this file."""

    @classmethod
    def setUpClass(cls):
        init()

    def test_cal_sam_minute(self):
        """Both implementations must agree on randomly drawn in-session timestamps."""
        # test the correctness of the code
        random_n = 1000
        cal = get_min_cal()
        # a local, seeded generator keeps failures reproducible and leaves
        # numpy's global random state untouched
        rng = np.random.RandomState(0)

        def gen_args():
            for bar_time in rng.choice(cal, size=random_n, replace=True):
                sam_minutes = rng.choice([1, 2, 3, 4, 5, 6])
                dt = pd.Timestamp(
                    datetime(
                        2021,
                        month=3,
                        day=3,
                        hour=bar_time.hour,
                        minute=bar_time.minute,
                        second=bar_time.second,
                        microsecond=bar_time.microsecond,
                    )
                )
                yield dt, sam_minutes

        for dt, sam_minutes in gen_args():
            # assertEqual is not stripped under `python -O` and reports the inputs on failure
            self.assertEqual(
                cal_sam_minute(dt, sam_minutes),
                cal_sam_minute_new(dt, sam_minutes),
                msg=f"mismatch for x={dt}, sam_minutes={sam_minutes}",
            )

        # test the performance of the code
        args_l = list(gen_args())

        with TimeInspector.logt():
            for args in args_l:
                cal_sam_minute(*args)

        with TimeInspector.logt():
            for args in args_l:
                cal_sam_minute_new(*args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user