1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Fix high-freq data (#702)

* fix the collector.py yahoo 1min factor calculation

* fix HFSignalRecord
This commit is contained in:
Pengrong Zhu
2021-11-20 15:03:53 +08:00
committed by GitHub
parent 9265b66e09
commit d224ea447e
6 changed files with 112 additions and 22 deletions

View File

@@ -30,6 +30,7 @@ Run the example by running the following command:
## Benchmarks Performance
### Signal Test
Here are the results of the signal test for the benchmark models. We will keep updating the benchmark models in the future.
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Long Precision | Short Precision | Long-Short Average Return | Long-Short Average Sharpe |
|---|---|---|---|---|---|---|---|---|---|
| LightGBM | Alpha158 | 0.3042±0.00 | 1.5372±0.00| 0.3117±0.00 | 1.6258±0.00 | 0.6720±0.00 | 0.6870±0.00 | 0.000769±0.00 | 1.0190±0.00 |
| LightGBM | Alpha158 | 0.0349±0.00 | 0.3805±0.00| 0.0435±0.00 | 0.4724±0.00 | 0.5111±0.00 | 0.5428±0.00 | 0.000074±0.00 | 0.2677±0.00 |

View File

@@ -1,27 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from qlib.backtest import executor
import re
import logging
import warnings
import pandas as pd
from pathlib import Path
from pprint import pprint
from typing import Union, List, Optional
from collections import defaultdict
from qlib.utils.exceptions import LoadObjectError
from ..contrib.evaluate import indicator_analysis, risk_analysis, indicator_analysis
from ..contrib.evaluate import risk_analysis, indicator_analysis
from ..data.dataset import DatasetH
from ..data.dataset.handler import DataHandlerLP
from ..backtest import backtest as normal_backtest
from ..utils import init_instance_by_config, get_module_by_module_path
from ..log import get_module_logger
from ..utils import flatten_dict, class_casting
from ..utils.time import Freq
from ..strategy.base import BaseStrategy
from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec
@@ -215,6 +209,7 @@ class HFSignalRecord(SignalRecord):
"""
artifact_path = "hg_sig_analysis"
depend_cls = SignalRecord
def __init__(self, recorder, **kwargs):
super().__init__(recorder=recorder)

View File

@@ -96,6 +96,27 @@ class CSIIndex(IndexBase):
"""
raise NotImplementedError()
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
"""Format the start/end datetimes of an instruments table.

When ``self.freq`` is anything other than ``"day"``, every value in the
start-date column has 9 hours 30 minutes added and every value in the
end-date column has 15 hours added; both columns are then stored as
``"%Y-%m-%d %H:%M:%S"`` strings.

Parameters
----------
inst_df: pd.DataFrame
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]

Returns
-------
pd.DataFrame
the ``inst_df`` object that was passed in; the two date columns are
overwritten on it rather than on a copy
"""
if self.freq != "day":
# NOTE(review): 09:30 and 15:00 are presumably the session open/close
# times for this market -- confirm before reusing for other exchanges.
# Each value is parsed with pd.Timestamp, shifted, and re-serialized as text.
inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=9, minutes=30)).strftime("%Y-%m-%d %H:%M:%S")
)
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=15, minutes=0)).strftime("%Y-%m-%d %H:%M:%S")
)
# NOTE(review): presumably at function level, so "day" data is handed back
# untouched -- confirm against the properly indented file.
return inst_df
def get_changes(self) -> pd.DataFrame:
"""get companies changes
@@ -284,7 +305,12 @@ class CSI100(CSIIndex):
def get_instruments(
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
qlib_dir: str,
index_name: str,
method: str = "parse_instruments",
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
@@ -296,6 +322,8 @@ def get_instruments(
index name, value from ["csi100", "csi300"]
method: str
method, value from ["parse_instruments", "save_new_companies"]
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
@@ -312,7 +340,7 @@ def get_instruments(
"""
_cur_module = importlib.import_module("data_collector.cn_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
getattr(obj, method)()

View File

@@ -26,7 +26,14 @@ class IndexBase:
ADD = "add"
INST_PREFIX = ""
def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
def __init__(
self,
index_name: str,
qlib_dir: [str, Path] = None,
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
Parameters
@@ -35,6 +42,8 @@ class IndexBase:
index name
qlib_dir: str
qlib directory, by default Path(__file__).resolve().parent.joinpath("qlib_data")
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
@@ -49,6 +58,7 @@ class IndexBase:
self.cache_dir.mkdir(exist_ok=True, parents=True)
self._request_retry = request_retry
self._retry_sleep = retry_sleep
self.freq = freq
@property
@abc.abstractmethod
@@ -106,6 +116,21 @@ class IndexBase:
"""
raise NotImplementedError("rewrite get_changes")
@abc.abstractmethod
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
"""Format the datetimes of an instruments table (abstract hook).

Declared abstract: concrete index classes are expected to provide their
own implementation. This base version only raises.

Parameters
----------
inst_df: pd.DataFrame
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]

Returns
-------
pd.DataFrame
the instruments table with its date columns formatted

Raises
------
NotImplementedError
always, when this base implementation is invoked directly
"""
raise NotImplementedError("rewrite format_datetime")
def save_new_companies(self):
"""save new companies
@@ -206,6 +231,7 @@ class IndexBase:
_inst_prefix = self.INST_PREFIX.strip()
if _inst_prefix:
inst_df["save_inst"] = inst_df[self.SYMBOL_FIELD_NAME].apply(lambda x: f"{_inst_prefix}{x}")
inst_df = self.format_datetime(inst_df)
inst_df.to_csv(
self.instruments_dir.joinpath(f"{self.index_name.lower()}.txt"), sep="\t", index=False, header=None
)

View File

@@ -37,9 +37,16 @@ class WIKIIndex(IndexBase):
# https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows
INST_PREFIX = ""
def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
def __init__(
self,
index_name: str,
qlib_dir: [str, Path] = None,
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
super(WIKIIndex, self).__init__(
index_name=index_name, qlib_dir=qlib_dir, request_retry=request_retry, retry_sleep=retry_sleep
index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
@@ -71,6 +78,24 @@ class WIKIIndex(IndexBase):
"""
raise NotImplementedError("rewrite get_changes")
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
"""Format the end datetimes of an instruments table.

When ``self.freq`` is anything other than ``"day"``, every value in the
end-date column has 23 hours 59 minutes added and is stored as a
``"%Y-%m-%d %H:%M:%S"`` string. The start-date column is not modified here.

Parameters
----------
inst_df: pd.DataFrame
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]

Returns
-------
pd.DataFrame
the ``inst_df`` object that was passed in; the end-date column is
overwritten on it rather than on a copy
"""
if self.freq != "day":
# Push the end value to 23:59 of the same date; presumably so that
# intraday bars on the final day are still inside the range -- confirm.
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
)
# NOTE(review): presumably at function level, so "day" data is handed back
# untouched -- confirm against the properly indented file.
return inst_df
@property
def calendar_list(self) -> List[pd.Timestamp]:
"""get history trading date
@@ -245,7 +270,12 @@ class SP400Index(WIKIIndex):
def get_instruments(
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
qlib_dir: str,
index_name: str,
method: str = "parse_instruments",
freq: str = "day",
request_retry: int = 5,
retry_sleep: int = 3,
):
"""
@@ -257,6 +287,8 @@ def get_instruments(
index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"]
method: str
method, value from ["parse_instruments", "save_new_companies"]
freq: str
freq, value from ["day", "1min"]
request_retry: int
request retry, by default 5
retry_sleep: int
@@ -265,15 +297,15 @@ def get_instruments(
Examples
-------
# parse instruments
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
# parse new companies
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
"""
_cur_module = importlib.import_module("data_collector.us_index.collector")
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
)
getattr(obj, method)()

View File

@@ -601,11 +601,19 @@ class YahooNormalize1min(YahooNormalize, ABC):
# - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits.
# - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits.
# - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)`
df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
df.set_index("date_tmp", inplace=True)
df.loc[:, "factor"] = data_1d["close"] / df["close"]
df.loc[:, "paused"] = data_1d["paused"]
df.reset_index("date_tmp", drop=True, inplace=True)
def _calc_factor(df_1d: pd.DataFrame):
"""Fill the ``factor`` and ``paused`` columns for one group of rows.

The date is taken from the first row's ``self._date_field_name`` value and
used to look up a row in ``data_1d`` (a variable of the enclosing scope).
``factor`` is that row's ``close`` divided by the ``close`` of the last row
in ``df_1d`` whose ``close`` is valid; ``paused`` is copied from the same
``data_1d`` row. If any step raises, both columns are set to NaN instead.

Parameters
----------
df_1d: pd.DataFrame
the rows of one group; must contain ``self._date_field_name`` and ``close``

Returns
-------
pd.DataFrame
the same ``df_1d`` object with ``factor`` and ``paused`` assigned
"""
try:
# Truncate the first row's timestamp to midnight to form the data_1d lookup key.
_date = pd.Timestamp(pd.Timestamp(df_1d[self._date_field_name].iloc[0]).date())
df_1d["factor"] = (
data_1d.loc[_date]["close"] / df_1d.loc[df_1d["close"].last_valid_index()]["close"]
)
df_1d["paused"] = data_1d.loc[_date]["paused"]
# NOTE(review): this catches every Exception, not only a failed lookup, so
# unrelated errors also end up as NaN rows -- confirm that is intended.
except Exception:
df_1d["factor"] = np.nan
df_1d["paused"] = np.nan
return df_1d
df = df.groupby([df[self._date_field_name].dt.date]).apply(_calc_factor)
if self.CONSISTENT_1d:
# the date sequence is consistent with 1d