mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
Fix high-freq data (#702)
* fix the collector.py yahoo 1min factor calculation * fix HFSignalRecord
This commit is contained in:
@@ -96,6 +96,27 @@ class CSIIndex(IndexBase):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
if self.freq != "day":
|
||||
inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=9, minutes=30)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=15, minutes=0)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
return inst_df
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
"""get companies changes
|
||||
|
||||
@@ -284,7 +305,12 @@ class CSI100(CSIIndex):
|
||||
|
||||
|
||||
def get_instruments(
|
||||
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
|
||||
qlib_dir: str,
|
||||
index_name: str,
|
||||
method: str = "parse_instruments",
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
"""
|
||||
|
||||
@@ -296,6 +322,8 @@ def get_instruments(
|
||||
index name, value from ["csi100", "csi300"]
|
||||
method: str
|
||||
method, value from ["parse_instruments", "save_new_companies"]
|
||||
freq: str
|
||||
freq, value from ["day", "1min"]
|
||||
request_retry: int
|
||||
request retry, by default 5
|
||||
retry_sleep: int
|
||||
@@ -312,7 +340,7 @@ def get_instruments(
|
||||
"""
|
||||
_cur_module = importlib.import_module("data_collector.cn_index.collector")
|
||||
obj = getattr(_cur_module, f"{index_name.upper()}")(
|
||||
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
getattr(obj, method)()
|
||||
|
||||
|
||||
@@ -26,7 +26,14 @@ class IndexBase:
|
||||
ADD = "add"
|
||||
INST_PREFIX = ""
|
||||
|
||||
def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str,
|
||||
qlib_dir: [str, Path] = None,
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
@@ -35,6 +42,8 @@ class IndexBase:
|
||||
index name
|
||||
qlib_dir: str
|
||||
qlib directory, by default Path(__file__).resolve().parent.joinpath("qlib_data")
|
||||
freq: str
|
||||
freq, value from ["day", "1min"]
|
||||
request_retry: int
|
||||
request retry, by default 5
|
||||
retry_sleep: int
|
||||
@@ -49,6 +58,7 @@ class IndexBase:
|
||||
self.cache_dir.mkdir(exist_ok=True, parents=True)
|
||||
self._request_retry = request_retry
|
||||
self._retry_sleep = retry_sleep
|
||||
self.freq = freq
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
@@ -106,6 +116,21 @@ class IndexBase:
|
||||
"""
|
||||
raise NotImplementedError("rewrite get_changes")
|
||||
|
||||
@abc.abstractmethod
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
raise NotImplementedError("rewrite format_datetime")
|
||||
|
||||
def save_new_companies(self):
|
||||
"""save new companies
|
||||
|
||||
@@ -206,6 +231,7 @@ class IndexBase:
|
||||
_inst_prefix = self.INST_PREFIX.strip()
|
||||
if _inst_prefix:
|
||||
inst_df["save_inst"] = inst_df[self.SYMBOL_FIELD_NAME].apply(lambda x: f"{_inst_prefix}{x}")
|
||||
inst_df = self.format_datetime(inst_df)
|
||||
inst_df.to_csv(
|
||||
self.instruments_dir.joinpath(f"{self.index_name.lower()}.txt"), sep="\t", index=False, header=None
|
||||
)
|
||||
|
||||
@@ -37,9 +37,16 @@ class WIKIIndex(IndexBase):
|
||||
# https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows
|
||||
INST_PREFIX = ""
|
||||
|
||||
def __init__(self, index_name: str, qlib_dir: [str, Path] = None, request_retry: int = 5, retry_sleep: int = 3):
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str,
|
||||
qlib_dir: [str, Path] = None,
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
super(WIKIIndex, self).__init__(
|
||||
index_name=index_name, qlib_dir=qlib_dir, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
|
||||
self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
|
||||
@@ -71,6 +78,24 @@ class WIKIIndex(IndexBase):
|
||||
"""
|
||||
raise NotImplementedError("rewrite get_changes")
|
||||
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
if self.freq != "day":
|
||||
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
return inst_df
|
||||
|
||||
@property
|
||||
def calendar_list(self) -> List[pd.Timestamp]:
|
||||
"""get history trading date
|
||||
@@ -245,7 +270,12 @@ class SP400Index(WIKIIndex):
|
||||
|
||||
|
||||
def get_instruments(
|
||||
qlib_dir: str, index_name: str, method: str = "parse_instruments", request_retry: int = 5, retry_sleep: int = 3
|
||||
qlib_dir: str,
|
||||
index_name: str,
|
||||
method: str = "parse_instruments",
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
"""
|
||||
|
||||
@@ -257,6 +287,8 @@ def get_instruments(
|
||||
index name, value from ["SP500", "NASDAQ100", "DJIA", "SP400"]
|
||||
method: str
|
||||
method, value from ["parse_instruments", "save_new_companies"]
|
||||
freq: str
|
||||
freq, value from ["day", "1min"]
|
||||
request_retry: int
|
||||
request retry, by default 5
|
||||
retry_sleep: int
|
||||
@@ -265,15 +297,15 @@ def get_instruments(
|
||||
Examples
|
||||
-------
|
||||
# parse instruments
|
||||
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
|
||||
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
|
||||
|
||||
# parse new companies
|
||||
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
|
||||
$ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
|
||||
|
||||
"""
|
||||
_cur_module = importlib.import_module("data_collector.us_index.collector")
|
||||
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
|
||||
qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
getattr(obj, method)()
|
||||
|
||||
|
||||
@@ -601,11 +601,19 @@ class YahooNormalize1min(YahooNormalize, ABC):
|
||||
# - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits.
|
||||
# - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits.
|
||||
# - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)`
|
||||
df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
|
||||
df.set_index("date_tmp", inplace=True)
|
||||
df.loc[:, "factor"] = data_1d["close"] / df["close"]
|
||||
df.loc[:, "paused"] = data_1d["paused"]
|
||||
df.reset_index("date_tmp", drop=True, inplace=True)
|
||||
def _calc_factor(df_1d: pd.DataFrame):
|
||||
try:
|
||||
_date = pd.Timestamp(pd.Timestamp(df_1d[self._date_field_name].iloc[0]).date())
|
||||
df_1d["factor"] = (
|
||||
data_1d.loc[_date]["close"] / df_1d.loc[df_1d["close"].last_valid_index()]["close"]
|
||||
)
|
||||
df_1d["paused"] = data_1d.loc[_date]["paused"]
|
||||
except Exception:
|
||||
df_1d["factor"] = np.nan
|
||||
df_1d["paused"] = np.nan
|
||||
return df_1d
|
||||
|
||||
df = df.groupby([df[self._date_field_name].dt.date]).apply(_calc_factor)
|
||||
|
||||
if self.CONSISTENT_1d:
|
||||
# the date sequence is consistent with 1d
|
||||
|
||||
Reference in New Issue
Block a user