diff --git a/examples/highfreq/README.md b/examples/highfreq/README.md index c07d8a2a0..e238e0328 100644 --- a/examples/highfreq/README.md +++ b/examples/highfreq/README.md @@ -30,6 +30,7 @@ Run the example by running the following command: ## Benchmarks Performance ### Signal Test Here are the results of signal test for benchmark models. We will keep updating benchmark models in future. + | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Long precision| Short Precision | Long-Short Average Return | Long-Short Average Sharpe | |---|---|---|---|---|---|---|---|---|---| -| LightGBM | Alpha158 | 0.3042±0.00 | 1.5372±0.00| 0.3117±0.00 | 1.6258±0.00 | 0.6720±0.00 | 0.6870±0.00 | 0.000769±0.00 | 1.0190±0.00 | +| LightGBM | Alpha158 | 0.0349±0.00 | 0.3805±0.00| 0.0435±0.00 | 0.4724±0.00 | 0.5111±0.00 | 0.5428±0.00 | 0.000074±0.00 | 0.2677±0.00 | diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index c732ba394..cdc5f6391 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -1,27 +1,21 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from qlib.backtest import executor -import re import logging import warnings import pandas as pd -from pathlib import Path from pprint import pprint from typing import Union, List, Optional -from collections import defaultdict from qlib.utils.exceptions import LoadObjectError -from ..contrib.evaluate import indicator_analysis, risk_analysis, indicator_analysis +from ..contrib.evaluate import risk_analysis, indicator_analysis from ..data.dataset import DatasetH from ..data.dataset.handler import DataHandlerLP from ..backtest import backtest as normal_backtest -from ..utils import init_instance_by_config, get_module_by_module_path from ..log import get_module_logger from ..utils import flatten_dict, class_casting from ..utils.time import Freq -from ..strategy.base import BaseStrategy from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec @@ -215,6 +209,7 @@ class HFSignalRecord(SignalRecord): """ artifact_path = "hg_sig_analysis" + depend_cls = SignalRecord def __init__(self, recorder, **kwargs): super().__init__(recorder=recorder) diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index a529f61df..39104ed68 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -96,6 +96,27 @@ class CSIIndex(IndexBase): """ raise NotImplementedError() + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: + """formatting the datetime in an instrument + + Parameters + ---------- + inst_df: pd.DataFrame + inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD] + + Returns + ------- + + """ + if self.freq != "day": + inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=9, minutes=30)).strftime("%Y-%m-%d %H:%M:%S") + ) + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=15, minutes=0)).strftime("%Y-%m-%d %H:%M:%S") + ) + return inst_df + def get_changes(self) -> pd.DataFrame: """get companies changes @@ -284,7 +305,12 @@ class CSI100(CSIIndex): def get_instruments( - qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3 + qlib_dir: str, + index_name: str, + method: str = "parse_instruments", + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, ): """ @@ -296,6 +322,8 @@ def get_instruments( index name, value from ["csi100", "csi300"] method: str method, value from ["parse_instruments", "save_new_companies"] + freq: str + freq, value from ["day", "1min"] request_retry: int request retry, by default 5 retry_sleep: int @@ -312,7 +340,7 @@ def get_instruments( """ _cur_module = importlib.import_module("data_collector.cn_index.collector") obj = getattr(_cur_module, f"{index_name.upper()}")( - qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep + qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep ) getattr(obj, method)() diff --git a/scripts/data_collector/index.py b/scripts/data_collector/index.py index 82a230e37..497c19948 100644 --- a/scripts/data_collector/index.py +++ b/scripts/data_collector/index.py @@ -26,7 +26,14 @@ class IndexBase: ADD = "add" INST_PREFIX = "" - def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3): + def __init__( + self, + index_name: str, + qlib_dir: [str, Path] = None, + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + ): """ Parameters @@ -35,6 +42,8 @@ class IndexBase: index name qlib_dir: str qlib directory, by default Path(__file__).resolve().parent.joinpath("qlib_data") + freq: str + freq, value from ["day", "1min"] request_retry: int request retry, by default 5 retry_sleep: int @@ -49,6 +58,7 @@ class IndexBase: self.cache_dir.mkdir(exist_ok=True, parents=True) self._request_retry = request_retry self._retry_sleep = retry_sleep + self.freq = freq @property @abc.abstractmethod @@ -106,6 +116,21 @@ class IndexBase: """ raise NotImplementedError("rewrite get_changes") + @abc.abstractmethod + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: + """formatting the datetime in an instrument + + Parameters + ---------- + inst_df: pd.DataFrame + inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD] + + Returns + ------- + + """ + raise NotImplementedError("rewrite format_datetime") + def save_new_companies(self): """save new companies @@ -206,6 +231,7 @@ class IndexBase: _inst_prefix = self.INST_PREFIX.strip() if _inst_prefix: inst_df["save_inst"] = inst_df[self.SYMBOL_FIELD_NAME].apply(lambda x: f"{_inst_prefix}{x}") + inst_df = self.format_datetime(inst_df) inst_df.to_csv( self.instruments_dir.joinpath(f"{self.index_name.lower()}.txt"), sep="\t", index=False, header=None ) diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 6947baa9a..576b3c32a 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -37,9 +37,16 @@ class WIKIIndex(IndexBase): # https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows INST_PREFIX = "" - def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3): + def __init__( + self, + index_name: str, + qlib_dir: [str, Path] = None, + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, + ): super(WIKIIndex, self).__init__( - index_name=index_name, qlib_dir=qlib_dir, request_retry=request_retry, retry_sleep=retry_sleep + index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep ) self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}" @@ -71,6 +78,24 @@ class WIKIIndex(IndexBase): """ raise NotImplementedError("rewrite get_changes") + def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame: + """formatting the datetime in an instrument + + Parameters + ---------- + inst_df: pd.DataFrame + inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD] + + Returns + ------- + + """ + if self.freq != "day": + inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply( + lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S") + ) + return inst_df + @property def calendar_list(self) -> List[pd.Timestamp]: """get history trading date @@ -245,7 +270,12 @@ class SP400Index(WIKIIndex): def get_instruments( - qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3 + qlib_dir: str, + index_name: str, + method: str = "parse_instruments", + freq: str = "day", + request_retry: int = 5, + retry_sleep: int = 3, ): """ @@ -257,6 +287,8 @@ def get_instruments( index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"] method: str method, value from ["parse_instruments", "save_new_companies"] + freq: str + freq, value from ["day", "1min"] request_retry: int request retry, by default 5 retry_sleep: int @@ -265,15 +297,15 @@ def get_instruments( Examples ------- # parse instruments - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments + $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments # parse new companies - $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies + $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies """ _cur_module = importlib.import_module("data_collector.us_index.collector") obj = getattr(_cur_module, f"{index_name.upper()}Index")( - qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep + qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep ) getattr(obj, method)() diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index e4422c4ce..3e8dc7c3f 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -601,11 +601,19 @@ class YahooNormalize1min(YahooNormalize, ABC): # - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits. # - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits. # - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)` - df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date()) - df.set_index("date_tmp", inplace=True) - df.loc[:, "factor"] = data_1d["close"] / df["close"] - df.loc[:, "paused"] = data_1d["paused"] - df.reset_index("date_tmp", drop=True, inplace=True) + def _calc_factor(df_1d: pd.DataFrame): + try: + _date = pd.Timestamp(pd.Timestamp(df_1d[self._date_field_name].iloc[0]).date()) + df_1d["factor"] = ( + data_1d.loc[_date]["close"] / df_1d.loc[df_1d["close"].last_valid_index()]["close"] + ) + df_1d["paused"] = data_1d.loc[_date]["paused"] + except Exception: + df_1d["factor"] = np.nan + df_1d["paused"] = np.nan + return df_1d + + df = df.groupby([df[self._date_field_name].dt.date]).apply(_calc_factor) if self.CONSISTENT_1d: # the date sequence is consistent with 1d