From 4ec41ea0e7fc74d94e0336948df9a11a53f8365d Mon Sep 17 00:00:00 2001
From: zhupr
Date: Wed, 14 Jul 2021 23:23:04 +0800
Subject: [PATCH] Add a check if change is mutated to YahooNormalize1d

---
Reviewer notes (everything between the "---" line above and the first
"diff --git" line is ignored by `git am`, so the commit is unchanged):

What the patch does
  * future_calendar_collector: merges column 0 of the existing
    `daily_calendar` with the freshly generated `date_list`, de-duplicates
    and sorts the result before it is written.
  * YahooNormalize.calc_change (new static method): forward-fills "close",
    divides it by its one-row-shifted self and subtracts 1; when
    `last_close` is not None it seeds the first shifted value.
  * normalize_yahoo: rows whose first-pass change lies in [89, 111] have
    high/close/low/open/adjclose divided by 100, then "change" is
    recomputed with calc_change.
  * Run.normalize_data: the reference URL in the ValueError message now
    points at microsoft/qlib (main) instead of the zhupr fork.

Points to confirm before merging
  * The mask tests `change` (ratio - 1), so [89, 111] corresponds to price
    ratios of 90x-112x, while the comment speaks of a factor of 89 to 111.
  * The mask is evaluated once. If several consecutive rows are scaled by
    100, only the first shows a ~99 change; after it is corrected the next
    row would look like the jump. Presumably needs a loop - verify.
  * `_tmp_shift_series.iloc[0] = ...` raises IndexError on an empty frame
    when `last_close` is not None (behaviour carried over from the removed
    inline code).
  * `df = df.copy()` in calc_change looks unnecessary: `df` is only read,
    and `fillna` without `inplace` already returns a new Series.
  * `_tmp_cols` assumes an "adjclose" column exists - TODO confirm for all
    callers.
  * `fillna(method="ffill")` is deprecated in recent pandas releases;
    `.ffill()` is the equivalent spelling.
  * The subject mentions YahooNormalize1d, but the hunks modify the
    YahooNormalize base class.
  * NOTE(review): this file had lost its line breaks and indentation; the
    layout below was reconstructed from the hunk counts and Python block
    structure. Context-line indentation should be verified against the
    target tree (or apply with `git apply --ignore-whitespace`).

 .../future_trading_date_collector.py          |  1 +
 scripts/data_collector/yahoo/collector.py     | 27 ++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py b/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
index 8df0a4972..939ba7f6a 100644
--- a/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
+++ b/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
@@ -78,6 +78,7 @@ def future_calendar_collector(qlib_dir: [str, Path], freq: str = "day"):
             data_list.append(_row_data[0])
     data_list = sorted(data_list)
     date_list = generate_qlib_calendar(data_list, freq=freq)
+    date_list = sorted(set(daily_calendar.loc[:, 0].values.tolist() + date_list))
     write_calendar_to_qlib(qlib_dir, date_list, freq=freq)
     bs.logout()
     logger.info(f"get trading dates success: {start_year}-01-01 to {end_year}-12-31")
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 6a128a5be..e0e6e0368 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -283,6 +283,16 @@ class YahooNormalize(BaseNormalize):
     COLUMNS = ["open", "close", "high", "low", "volume"]
     DAILY_FORMAT = "%Y-%m-%d"
 
+    @staticmethod
+    def calc_change(df: pd.DataFrame, last_close: float) -> pd.Series:
+        df = df.copy()
+        _tmp_series = df["close"].fillna(method="ffill")
+        _tmp_shift_series = _tmp_series.shift(1)
+        if last_close is not None:
+            _tmp_shift_series.iloc[0] = float(last_close)
+        change_series = _tmp_series / _tmp_shift_series - 1
+        return change_series
+
     @staticmethod
     def normalize_yahoo(
         df: pd.DataFrame,
@@ -310,11 +320,16 @@ class YahooNormalize(BaseNormalize):
             )
         df.sort_index(inplace=True)
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {symbol_field_name}] = np.nan
-        _tmp_series = df["close"].fillna(method="ffill")
-        _tmp_shift_series = _tmp_series.shift(1)
-        if last_close is not None:
-            _tmp_shift_series.iloc[0] = float(last_close)
-        df["change"] = _tmp_series / _tmp_shift_series - 1
+
+        change_series = YahooNormalize.calc_change(df, last_close)
+        # NOTE: The data obtained by Yahoo finance sometimes has exceptions
+        # WARNING: If it is normal for a `symbol(exchange)` to differ by a factor of *89* to *111* for consecutive trading days,
+        # WARNING: the logic in the following line needs to be modified
+        _mask = (change_series >= 89) & (change_series <= 111)
+        _tmp_cols = ["high", "close", "low", "open", "adjclose"]
+        df.loc[_mask, _tmp_cols] = df.loc[_mask, _tmp_cols] / 100
+        df["change"] = YahooNormalize.calc_change(df, last_close)
+
         columns += ["change"]
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan
 
@@ -852,7 +867,7 @@ class Run(BaseRun):
         if self.interval.lower() == "1min":
             if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists():
                 raise ValueError(
-                    "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir , Reference: https://github.com/zhupr/qlib/tree/support_extend_data/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance"
+                    "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir , Reference: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance"
                 )
         super(Run, self).normalize_data(
             date_field_name, symbol_field_name, end_date=end_date, qlib_data_1d_dir=qlib_data_1d_dir