From 6f150f3fd6f9d48a219e6e9045b285e7fb84c436 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Fri, 4 Jun 2021 22:28:42 +0800
Subject: [PATCH 01/14] Add YahooCollector support for extend data

---
 scripts/data_collector/base.py               |  14 +-
 scripts/data_collector/cn_index/collector.py |   5 +-
 scripts/data_collector/yahoo/collector.py    | 210 ++++++++++++++++++-
 3 files changed, 211 insertions(+), 18 deletions(-)

diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index 12983f6a5..cb51f9b22 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -226,11 +226,7 @@ class BaseCollector(abc.ABC):
 
 
 class BaseNormalize(abc.ABC):
-    def __init__(
-        self,
-        date_field_name: str = "date",
-        symbol_field_name: str = "symbol",
-    ):
+    def __init__(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
         """
 
         Parameters
@@ -265,6 +261,7 @@ class Normalize:
         max_workers: int = 16,
         date_field_name: str = "date",
         symbol_field_name: str = "symbol",
+        **kwargs,
     ):
         """
 
@@ -291,7 +288,9 @@ class Normalize:
 
         self._max_workers = max_workers
 
-        self._normalize_obj = normalize_class(date_field_name=date_field_name, symbol_field_name=symbol_field_name)
+        self._normalize_obj = normalize_class(
+            date_field_name=date_field_name, symbol_field_name=symbol_field_name, **kwargs
+        )
 
     def _executor(self, file_path: Path):
         file_path = Path(file_path)
@@ -404,7 +403,7 @@ class BaseRun(abc.ABC):
             limit_nums=limit_nums,
         ).collector_data()
 
-    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"):
+    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
         """normalize data
 
         Parameters
@@ -426,5 +425,6 @@ class BaseRun(abc.ABC):
             max_workers=self.max_workers,
             date_field_name=date_field_name,
             symbol_field_name=symbol_field_name,
+            **kwargs,
         )
         yc.normalize()
diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py
index 5af9785ec..1f8434c58 100644
--- a/scripts/data_collector/cn_index/collector.py
+++ b/scripts/data_collector/cn_index/collector.py
@@ -24,7 +24,10 @@ from data_collector.utils import get_calendar_list, get_trading_date_by_shift
 
 NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index_code}cons.xls"
 
-INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC%E8%82%A1%E7%9A%84%E5%85%AC%E5%91%8A"
+
+# INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC%E8%82%A1%E7%9A%84%E5%85%AC%E5%91%8A"
+# 2020-11-27 Announcement title change
+INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89"
 
 
 class CSIIndex(IndexBase):
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 2cd080199..fcaa9ff92 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -23,7 +23,7 @@ from qlib.config import REG_CN as REGION_CN
 
 CUR_DIR = Path(__file__).resolve().parent
 sys.path.append(str(CUR_DIR.parent.parent))
-from data_collector.base import BaseCollector, BaseNormalize, BaseRun
+from data_collector.base import BaseCollector, BaseNormalize, BaseRun, Normalize
 from data_collector.utils import (
     get_calendar_list,
     get_hs_stock_symbols,
@@ -297,6 +297,7 @@ class YahooNormalize(BaseNormalize):
         calendar_list: list = None,
         date_field_name: str = "date",
         symbol_field_name: str = "symbol",
+        last_close: float = None,
     ):
         if df.empty:
             return df
@@ -318,7 +319,10 @@ class YahooNormalize(BaseNormalize):
         df.sort_index(inplace=True)
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {symbol_field_name}] = np.nan
         _tmp_series = df["close"].fillna(method="ffill")
-        df["change"] = _tmp_series / _tmp_series.shift(1) - 1
+        _tmp_shift_series = _tmp_series.shift(1)
+        if last_close is not None and isinstance(last_close, (int, float)):
+            _tmp_shift_series.iloc[0] = last_close
+        df["change"] = _tmp_series / _tmp_shift_series - 1
         columns += ["change"]
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan
 
@@ -367,6 +371,17 @@ class YahooNormalize1d(YahooNormalize, ABC):
         df = self._manual_adj_data(df)
         return df
 
+    def _get_first_close(self, df: pd.DataFrame) -> float:
+        """get first close value
+
+        Notes
+        -----
+            For incremental updates(append) to Yahoo 1D data, user need to use a close that is not 0 on the first trading day of the existing data
+        """
+        df = df.loc[df["close"].first_valid_index() :]
+        _close = df["close"].iloc[0]
+        return _close
+
     def _manual_adj_data(self, df: pd.DataFrame) -> pd.DataFrame:
         """manual adjust data: All fields (except change) are standardized according to the close of the first day"""
         if df.empty:
@@ -374,8 +389,7 @@ class YahooNormalize1d(YahooNormalize, ABC):
         df = df.copy()
         df.sort_values(self._date_field_name, inplace=True)
         df = df.set_index(self._date_field_name)
-        df = df.loc[df["close"].first_valid_index() :]
-        _close = df["close"].iloc[0]
+        _close = self._get_first_close(df)
         for _col in df.columns:
             if _col == self._symbol_field_name:
                 continue
@@ -388,18 +402,97 @@ class YahooNormalize1d(YahooNormalize, ABC):
         return df.reset_index()
 
 
+class YahooNormalize1dExtend(YahooNormalize1d):
+    def __init__(
+        self, old_qlib_data_dir: [str, Path], date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        old_qlib_data_dir: str, Path
+            the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
+        date_field_name: str
+            date field name, default is date
+        symbol_field_name: str
+            symbol field name, default is symbol
+        """
+        super(YahooNormalize1dExtend, self).__init__(date_field_name, symbol_field_name)
+        self._end_date, self._old_close = self._get_old_data(old_qlib_data_dir)
+        self._end_date = pd.Timestamp(self._end_date).strftime(self.DAILY_FORMAT)
+
+    def _get_old_data(self, qlib_data_dir: [str, Path]):
+        import qlib
+        from qlib.data import D
+
+        qlib_data_dir = str(Path(qlib_data_dir).expanduser().resolve())
+        qlib.init(provider_uri=qlib_data_dir, expression_cache=None, dataset_cache=None)
+        df = D.features(D.instruments("all"), ["$close/$factor"])
+        df.columns = ["close"]
+        return D.calendar()[-1], df
+
+    def _get_first_close(self, df: pd.DataFrame) -> float:
+        _symbol = df.iloc[0][self._symbol_field_name]
+        try:
+            _df = self._old_close.loc(axis=0)[_symbol.upper()]
+            _close = _df.loc[_df.first_valid_index()]["close"]
+        except KeyError:
+            _close = super(YahooNormalize1dExtend, self)._get_first_close(df)
+        return _close
+
+    def _get_last_close(self, df: pd.DataFrame) -> float:
+        _symbol = df.iloc[0][self._symbol_field_name]
+        try:
+            _df = self._old_close.loc(axis=0)[_symbol.upper()]
+            _close = _df.loc[_df.last_valid_index()]["close"]
+        except KeyError:
+            _close = None
+        return _close
+
+    def _get_last_date(self, df: pd.DataFrame) -> pd.Timestamp:
+        _symbol = df.iloc[0][self._symbol_field_name]
+        try:
+            _df = self._old_close.loc(axis=0)[_symbol.upper()]
+            _date = _df.index.max()
+        except KeyError:
+            _date = None
+        return _date
+
+    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
+        _last_close = self._get_last_close(df)
+        # reindex
+        _last_date = self._get_last_date(df)
+        if _last_date is not None:
+            df = df.set_index(self._date_field_name)
+            df.index = pd.to_datetime(df.index)
+            df = df[~df.index.duplicated(keep="first")]
+            _max_date = df.index.max()
+            df = df.reindex(self._calendar_list).loc[:_max_date].reset_index()
+            df = df[df[self._date_field_name] > _last_date]
+            _si = df["close"].first_valid_index()
+            if _si > df.index[0]:
+                logger.warning(
+                    f"{df.iloc[0][self._symbol_field_name]} missing data: {df.loc[:_si][self._date_field_name]}"
+                )
+        # normalize
+        df = self.normalize_yahoo(
+            df, self._calendar_list, self._date_field_name, self._symbol_field_name, last_close=_last_close
+        )
+        # adjusted price
+        df = self.adjusted_price(df)
+        df = self._manual_adj_data(df)
+        return df
+
+
 class YahooNormalize1min(YahooNormalize, ABC):
     AM_RANGE = None  # type: tuple  # eg: ("09:30:00", "11:29:00")
     PM_RANGE = None  # type: tuple  # eg: ("13:00:00", "14:59:00")
 
     # Whether the trading day of 1min data is consistent with 1d
     CONSISTENT_1d = False
+    CALC_PAUSED_NUM = False
 
-    def __init__(
-        self,
-        date_field_name: str = "date",
-        symbol_field_name: str = "symbol",
-    ):
+    def __init__(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
         """
 
         Parameters
@@ -478,6 +571,54 @@ class YahooNormalize1min(YahooNormalize, ABC):
                 df[_col] = df[_col] / df["factor"]
             else:
                 df[_col] = df[_col] * df["factor"]
+
+        if self.CALC_PAUSED_NUM:
+            df = self.calc_paused_num(df)
+        return df
+
+    def calc_paused_num(self, df: pd.DataFrame):
+        _symbol = df.iloc[0][self._symbol_field_name]
+        df = df.copy()
+        df["date"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
+        # remove data that starts and ends with `np.nan` all day
+        all_data = []
+        # Record the number of consecutive trading days where the whole day is nan, to remove the last trading day where the whole day is nan
+        all_nan_nums = 0
+        # Record the number of consecutive occurrences of trading days that are not nan throughout the day
+        not_nan_nums = 0
+        for _date, _df in df.groupby(level="date"):
+            _df["paused"] = 0
+            if not _df.loc[_df["volume"] < 0].empty:
+                logger.warning(f"volume < 0, will fill np.nan: {_date} {_symbol}")
+                _df.loc[_df["volume"] < 0, "volume"] = np.nan
+
+            check_fields = set(_df.columns) - {
+                "date",
+                "paused",
+                "factor",
+                self._date_field_name,
+                self._symbol_field_name,
+            }
+            if _df.loc[:, check_fields].isna().values.all() or (_df["volume"] == 0).all():
+                all_nan_nums += 1
+                not_nan_nums = 0
+                _df["paused"] = 1
+                if all_data:
+                    _df["paused_num"] = not_nan_nums
+                    all_data.append(_df)
+            else:
+                all_nan_nums = 0
+                not_nan_nums += 1
+                _df["paused_num"] = not_nan_nums
+                all_data.append(_df)
+        all_data = all_data[: len(all_data) - all_nan_nums]
+        if all_data:
+            df = pd.concat(all_data, sort=False)
+        else:
+            logger.warning(f"data is empty: {_symbol}")
+            df = pd.DataFrame()
+            return df
+        del df["date"]
         return df
 
     @abc.abstractmethod
@@ -523,11 +664,16 @@ class YahooNormalizeCN1d(YahooNormalizeCN, YahooNormalize1d):
     pass
 
 
+class YahooNormalizeCN1dExtend(YahooNormalizeCN, YahooNormalize1dExtend):
+    pass
+
+
 class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
     AM_RANGE = ("09:30:00", "11:29:00")
     PM_RANGE = ("13:00:00", "14:59:00")
 
     CONSISTENT_1d = True
+    CALC_PAUSED_NUM = True
 
     def _get_calendar_list(self):
         return self.generate_1min_from_daily(self.calendar_list_1d)
@@ -624,10 +770,54 @@ class Run(BaseRun):
 
         Examples
         ---------
-            $ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN --interval 1d
+            $ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region cn --interval 1d
         """
         super(Run, self).normalize_data(date_field_name, symbol_field_name)
 
+    def normalize_data_1d_extend(
+        self, old_qlib_data_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
+    ):
+        """normalize data extend; extending yahoo qlib data(from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
+
+        Notes
+        -----
+            Steps to extend yahoo qlib data:
+
+                1. download qlib data: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data; save to <dir1>
+
+                2. collector source data: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#collector-data; save to <dir2>
+
+                3. normalize new source data(from step 2): python scripts/data_collector/yahoo/collector.py normalize_data_1d_extend --old_qlib_dir <dir1> --source_dir <dir2> --normalize_dir <dir3> --region CN --interval 1d
+
+                4. dump data: python scripts/dump_bin.py dump_update --csv_path <dir3> --qlib_dir <dir1> --freq day --date_field_name date --symbol_field_name symbol --exclude_fields symbol,date
+
+                5. update instrument(eg. csi300): python python scripts/data_collector/cn_index/collector.py --index_name CSI300 --qlib_dir <dir1> --method parse_instruments
+
+        Parameters
+        ----------
+        old_qlib_data_dir: str
+            the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
+        date_field_name: str
+            date field name, default date
+        symbol_field_name: str
+            symbol field name, default symbol
+
+        Examples
+        ---------
+            $ python collector.py normalize_data_1d_extend --old_qlib_dir ~/.qlib/qlib_data/cn_1d --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN --interval 1d
+        """
+        _class = getattr(self._cur_module, f"{self.normalize_class_name}Extend")
+        yc = Normalize(
+            source_dir=self.source_dir,
+            target_dir=self.normalize_dir,
+            normalize_class=_class,
+            max_workers=self.max_workers,
+            date_field_name=date_field_name,
+            symbol_field_name=symbol_field_name,
+            old_qlib_data_dir=old_qlib_data_dir,
+        )
+        yc.normalize()
+
 
 if __name__ == "__main__":
     fire.Fire(Run)

From 554b9c78268c6439754e4a9f6fb4b6b35e48cdd4 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Sat, 5 Jun 2021 16:01:01 +0800
Subject: [PATCH 02/14] fix YahooCollector getting 1min data occasionally
 missing

---
 scripts/data_collector/base.py            | 14 ++---
 scripts/data_collector/yahoo/collector.py | 63 +++++++++++------------
 2 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index cb51f9b22..d261f11cd 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -22,9 +22,9 @@ class BaseCollector(abc.ABC):
     NORMAL_FLAG = "NORMAL"
 
     DEFAULT_START_DATETIME_1D = pd.Timestamp("2000-01-01")
-    DEFAULT_START_DATETIME_1MIN = pd.Timestamp(datetime.datetime.now() - pd.Timedelta(days=5 * 6))
-    DEFAULT_END_DATETIME_1D = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))
-    DEFAULT_END_DATETIME_1MIN = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))
+    DEFAULT_START_DATETIME_1MIN = pd.Timestamp(datetime.datetime.now() - pd.Timedelta(days=5 * 6 - 1)).date()
+    DEFAULT_END_DATETIME_1D = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)).date()
+    DEFAULT_END_DATETIME_1MIN = DEFAULT_END_DATETIME_1D
 
     INTERVAL_1min = "1min"
     INTERVAL_1d = "1d"
@@ -35,7 +35,7 @@ class BaseCollector(abc.ABC):
         start=None,
         end=None,
         interval="1d",
-        max_workers=4,
+        max_workers=1,
         max_collector_count=2,
         delay=0,
         check_data_length: bool = False,
@@ -48,7 +48,7 @@ class BaseCollector(abc.ABC):
         save_dir: str
             instrument save dir
         max_workers: int
-            workers, default 4
+            workers, default 1; Concurrent number, default is 1; when collecting data, it is recommended that max_workers be set to 1
         max_collector_count: int
             default 2
         delay: float
@@ -310,7 +310,7 @@ class Normalize:
 
 
 class BaseRun(abc.ABC):
-    def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, interval="1d"):
+    def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d"):
         """
 
         Parameters
@@ -320,7 +320,7 @@ class BaseRun(abc.ABC):
         normalize_dir: str
             Directory for normalize data, default "Path(__file__).parent/normalize"
         max_workers: int
-            Concurrent number, default is 4
+            Concurrent number, default is 1; Concurrent number, default is 1; when collecting data, it is recommended that max_workers be set to 1
         interval: str
             freq, value from [1min, 1d], default 1d
         """
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index fcaa9ff92..a48c5f16a 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -25,6 +25,7 @@ CUR_DIR = Path(__file__).resolve().parent
 sys.path.append(str(CUR_DIR.parent.parent))
 from data_collector.base import BaseCollector, BaseNormalize, BaseRun, Normalize
 from data_collector.utils import (
+    deco_retry,
     get_calendar_list,
     get_hs_stock_symbols,
     get_us_stock_symbols,
@@ -92,10 +93,6 @@ class YahooCollector(BaseCollector):
         else:
             raise ValueError(f"interval error: {self.interval}")
 
-        # using for 1min
-        self._next_datetime = self.convert_datetime(self.start_datetime.date() + pd.Timedelta(days=1), self._timezone)
-        self._latest_datetime = self.convert_datetime(self.end_datetime.date(), self._timezone)
-
         self.start_datetime = self.convert_datetime(self.start_datetime, self._timezone)
         self.end_datetime = self.convert_datetime(self.end_datetime, self._timezone)
 
@@ -140,40 +137,36 @@ class YahooCollector(BaseCollector):
     def get_data(
         self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
     ) -> pd.DataFrame:
+        @deco_retry(retry_sleep=1)
         def _get_simple(start_, end_):
             self.sleep()
             _remote_interval = "1m" if interval == self.INTERVAL_1min else interval
-            return self.get_data_from_remote(
+            resp = self.get_data_from_remote(
                 symbol,
                 interval=_remote_interval,
                 start=start_,
                 end=end_,
             )
+            if resp is None or resp.empty:
+                raise ValueError(f"get data error: {symbol}--{start_}--{end_}")
+            return resp
 
         _result = None
         if interval == self.INTERVAL_1d:
             _result = _get_simple(start_datetime, end_datetime)
         elif interval == self.INTERVAL_1min:
-            if self._next_datetime >= self._latest_datetime:
-                _result = _get_simple(start_datetime, end_datetime)
-            else:
-                _res = []
-
-                def _get_multi(start_, end_):
-                    _resp = _get_simple(start_, end_)
-                    if _resp is not None and not _resp.empty:
-                        _res.append(_resp)
-
-                for _s, _e in (
-                    (self.start_datetime, self._next_datetime),
-                    (self._latest_datetime, self.end_datetime),
-                ):
-                    _get_multi(_s, _e)
-                for _start in pd.date_range(self._next_datetime, self._latest_datetime, closed="left"):
-                    _end = _start + pd.Timedelta(days=1)
-                    _get_multi(_start, _end)
-                if _res:
-                    _result = pd.concat(_res, sort=False).sort_values(["symbol", "date"])
+            _res = []
+            _start = self.start_datetime
+            while _start < self.end_datetime:
+                _tmp_end = min(_start + pd.Timedelta(days=7), self.end_datetime)
+                try:
+                    _resp = _get_simple(_start, _tmp_end)
+                    _res.append(_resp)
+                except ValueError as e:
+                    pass
+                _start = _tmp_end
+            if _res:
+                _result = pd.concat(_res, sort=False).sort_values(["symbol", "date"])
         else:
             raise ValueError(f"cannot support {self.interval}")
         return pd.DataFrame() if _result is None else _result
@@ -520,6 +513,10 @@ class YahooNormalize1min(YahooNormalize, ABC):
             calendars, freq="1min", am_range=self.AM_RANGE, pm_range=self.PM_RANGE
         )
 
+    def get_1d_data(self, symbol: str, start: str, end: str) -> pd.DataFrame:
+        data_1d = YahooCollector.get_data_from_remote(self.symbol_to_yahoo(symbol), interval="1d", start=start, end=end)
+        return data_1d
+
     def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
         # TODO: using daily data factor
         if df.empty:
@@ -529,9 +526,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         # get 1d data from yahoo
         _start = pd.Timestamp(df[self._date_field_name].min()).strftime(self.DAILY_FORMAT)
         _end = (pd.Timestamp(df[self._date_field_name].max()) + pd.Timedelta(days=1)).strftime(self.DAILY_FORMAT)
-        data_1d = YahooCollector.get_data_from_remote(
-            self.symbol_to_yahoo(symbol), interval="1d", start=_start, end=_end
-        )
+        data_1d = self.get_1d_data(symbol, _start, _end)
         if data_1d is None or data_1d.empty:
             df["factor"] = 1
             # TODO: np.nan or 1 or 0
@@ -579,21 +574,21 @@ class YahooNormalize1min(YahooNormalize, ABC):
     def calc_paused_num(self, df: pd.DataFrame):
         _symbol = df.iloc[0][self._symbol_field_name]
         df = df.copy()
-        df["date"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
+        df["_tmp_date"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
         # remove data that starts and ends with `np.nan` all day
         all_data = []
         # Record the number of consecutive trading days where the whole day is nan, to remove the last trading day where the whole day is nan
         all_nan_nums = 0
         # Record the number of consecutive occurrences of trading days that are not nan throughout the day
         not_nan_nums = 0
-        for _date, _df in df.groupby(level="date"):
+        for _date, _df in df.groupby("_tmp_date"):
             _df["paused"] = 0
             if not _df.loc[_df["volume"] < 0].empty:
                 logger.warning(f"volume < 0, will fill np.nan: {_date} {_symbol}")
                 _df.loc[_df["volume"] < 0, "volume"] = np.nan
 
             check_fields = set(_df.columns) - {
-                "date",
+                "_tmp_date",
                 "paused",
                 "factor",
                 self._date_field_name,
@@ -618,7 +613,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
             logger.warning(f"data is empty: {_symbol}")
             df = pd.DataFrame()
             return df
-        del df["date"]
+        del df["_tmp_date"]
         return df
 
     @abc.abstractmethod
@@ -690,7 +685,7 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
 
 
 class Run(BaseRun):
-    def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, interval="1d", region=REGION_CN):
+    def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
         """
 
         Parameters
@@ -700,7 +695,7 @@ class Run(BaseRun):
         normalize_dir: str
             Directory for normalize data, default "Path(__file__).parent/normalize"
         max_workers: int
-            Concurrent number, default is 4
+            Concurrent number, default is 1; when collecting data, it is recommended that max_workers be set to 1
         interval: str
             freq, value from [1min, 1d], default 1d
         region: str

From a845a2271b3352e410b99843628167c7dce095db Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Tue, 8 Jun 2021 14:45:20 +0800
Subject: [PATCH 03/14] add normalize 1min to use local data && change the
 default parameters for collecting 1min

---
 scripts/data_collector/base.py                |  52 ++---
 .../contrib/fill_cn_1min_data/README.md       |  23 ++
 .../fill_cn_1min_data/fill_cn_1min_data.py    |  98 +++++++++
 .../fill_cn_1min_data/requirements.txt        |   5 +
 .../README.md                                 |   0
 .../future_trading_date_collector.py          |   2 +-
 .../requirements.txt                          |   0
 scripts/data_collector/fund/collector.py      |  22 +-
 scripts/data_collector/yahoo/collector.py     | 196 +++++++++++++++---
 9 files changed, 328 insertions(+), 70 deletions(-)
 create mode 100644 scripts/data_collector/contrib/fill_cn_1min_data/README.md
 create mode 100644 scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
 create mode 100644 scripts/data_collector/contrib/fill_cn_1min_data/requirements.txt
 rename scripts/data_collector/contrib/{ => future_trading_date_collector}/README.md (100%)
 rename scripts/data_collector/contrib/{ => future_trading_date_collector}/future_trading_date_collector.py (98%)
 rename scripts/data_collector/contrib/{ => future_trading_date_collector}/requirements.txt (100%)

diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index d261f11cd..08e1838a4 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -7,7 +7,7 @@ import time
 import datetime
 import importlib
 from pathlib import Path
-from typing import Type
+from typing import Type, Iterable
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 
 import pandas as pd
@@ -38,7 +38,7 @@ class BaseCollector(abc.ABC):
         max_workers=1,
         max_collector_count=2,
         delay=0,
-        check_data_length: bool = False,
+        check_data_length: int = None,
         limit_nums: int = None,
     ):
         """
@@ -59,8 +59,8 @@ class BaseCollector(abc.ABC):
             start datetime, default None
         end: str
             end datetime, default None
-        check_data_length: bool
-            check data length, by default False
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
             using for debug, by default None
         """
@@ -72,7 +72,7 @@ class BaseCollector(abc.ABC):
         self.max_collector_count = max_collector_count
         self.mini_symbol_map = {}
         self.interval = interval
-        self.check_small_data = check_data_length
+        self.check_data_length = max(int(check_data_length) if check_data_length is not None else 0, 0)
 
         self.start_datetime = self.normalize_start_datetime(start)
         self.end_datetime = self.normalize_end_datetime(end)
@@ -99,14 +99,6 @@ class BaseCollector(abc.ABC):
             else getattr(self, f"DEFAULT_END_DATETIME_{self.interval.upper()}")
         )
 
-    @property
-    @abc.abstractmethod
-    def min_numbers_trading(self):
-        # daily, one year: 252 / 4
-        # us 1min, a week: 6.5 * 60 * 5
-        # cn 1min, a week: 4 * 60 * 5
-        raise NotImplementedError("rewrite min_numbers_trading")
-
     @abc.abstractmethod
     def get_instrument_list(self):
         raise NotImplementedError("rewrite get_instrument_list")
@@ -132,7 +124,7 @@ class BaseCollector(abc.ABC):
 
         Returns
         ---------
-            pd.DataFrame, "symbol" in pd.columns
+            pd.DataFrame, "symbol" and "date"in pd.columns
 
         """
         raise NotImplementedError("rewrite get_timezone")
@@ -151,7 +143,7 @@ class BaseCollector(abc.ABC):
         self.sleep()
         df = self.get_data(symbol, self.interval, self.start_datetime, self.end_datetime)
         _result = self.NORMAL_FLAG
-        if self.check_small_data:
+        if self.check_data_length > 0:
             _result = self.cache_small_data(symbol, df)
         if _result == self.NORMAL_FLAG:
             self.save_instrument(symbol, df)
@@ -181,8 +173,8 @@ class BaseCollector(abc.ABC):
         df.to_csv(instrument_path, index=False)
 
     def cache_small_data(self, symbol, df):
-        if len(df) <= self.min_numbers_trading:
-            logger.warning(f"the number of trading days of {symbol} is less than {self.min_numbers_trading}!")
+        if len(df) < self.check_data_length:
+            logger.warning(f"the number of trading days of {symbol} is less than {self.check_data_length}!")
             _temp = self.mini_symbol_map.setdefault(symbol, [])
             _temp.append(df.copy())
             return self.CACHE_FLAG
@@ -194,9 +186,17 @@ class BaseCollector(abc.ABC):
     def _collector(self, instrument_list):
 
         error_symbol = []
-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            with tqdm(total=len(instrument_list)) as p_bar:
-                for _symbol, _result in zip(instrument_list, executor.map(self._simple_collector, instrument_list)):
+        with tqdm(total=len(instrument_list)) as p_bar:
+            if self.max_workers is not None and self.max_workers > 1:
+                logger.info(f"concurrent collector, max_workers: {self.max_workers}")
+                with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                    for _symbol, _result in zip(instrument_list, executor.map(self._simple_collector, instrument_list)):
+                        if _result != self.NORMAL_FLAG:
+                            error_symbol.append(_symbol)
+                        p_bar.update()
+            else:
+                for _symbol in instrument_list:
+                    _result = self._simple_collector(_symbol)
                     if _result != self.NORMAL_FLAG:
                         error_symbol.append(_symbol)
                     p_bar.update()
@@ -217,11 +217,11 @@ class BaseCollector(abc.ABC):
             instrument_list = self._collector(instrument_list)
             logger.info(f"{i+1} finish.")
         for _symbol, _df_list in self.mini_symbol_map.items():
-            self.save_instrument(
-                _symbol, pd.concat(_df_list, sort=False).drop_duplicates(["date"]).sort_values(["date"])
-            )
+            _df = pd.concat(_df_list, sort=False)
+            if not _df.empty:
+                self.save_instrument(_symbol, _df.drop_duplicates(["date"]).sort_values(["date"]))
         if self.mini_symbol_map:
-            logger.warning(f"less than {self.min_numbers_trading} instrument list: {list(self.mini_symbol_map.keys())}")
+            logger.warning(f"less than {self.check_data_length} instrument list: {list(self.mini_symbol_map.keys())}")
         logger.info(f"total {len(self.instrument_list)}, error: {len(set(instrument_list))}")
 
 
@@ -247,7 +247,7 @@ class BaseNormalize(abc.ABC):
         raise NotImplementedError("")
 
     @abc.abstractmethod
-    def _get_calendar_list(self):
+    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         """Get benchmark calendar"""
         raise NotImplementedError("")
 
@@ -296,7 +296,7 @@ class Normalize:
         file_path = Path(file_path)
         df = pd.read_csv(file_path)
         df = self._normalize_obj.normalize(df)
-        if not df.empty:
+        if df is not None and not df.empty:
             df.to_csv(self._target_dir.joinpath(file_path.name), index=False)
 
     def normalize(self):
diff --git a/scripts/data_collector/contrib/fill_cn_1min_data/README.md b/scripts/data_collector/contrib/fill_cn_1min_data/README.md
new file mode 100644
index 000000000..c9ff0629c
--- /dev/null
+++ b/scripts/data_collector/contrib/fill_cn_1min_data/README.md
@@ -0,0 +1,23 @@
+# Use 1d data to fill in the missing symbols relative to 1min
+
+
+## Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+## fill 1min data
+
+```bash
+python fill_1min_using_1d.py --data_1min_dir ~/.qlib/csv_data/cn_data_1min --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data
+```
+
+## Parameters
+
+- ata_1min_dir: csv data
+- qlib_data_1d_dir: qlib data directory
+- max_workers: `ThreadPoolExecutor(max_workers=max_workers)`, by default *16*
+- date_field_name: date field name, by default *date*
+- symbol_field_name: symbol field name, by default *symbol*
+
diff --git a/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py b/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
new file mode 100644
index 000000000..4abca3361
--- /dev/null
+++ b/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
@@ -0,0 +1,98 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+
+import fire
+import qlib
+import pandas as pd
+from tqdm import tqdm
+from qlib.data import D
+from loguru import logger
+
+CUR_DIR = Path(__file__).resolve().parent
+sys.path.append(str(CUR_DIR.parent.parent.parent))
+from data_collector.utils import generate_minutes_calendar_from_daily
+
+
+def get_date_range(data_1min_dir: Path, max_workers: int = 16, date_field_name: str = "date"):
+    csv_files = list(data_1min_dir.glob("*.csv"))
+    min_date = None
+    max_date = None
+    with tqdm(total=len(csv_files)) as p_bar:
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            for _file, _result in zip(csv_files, executor.map(pd.read_csv, csv_files)):
+                if not _result.empty:
+                    _dates = pd.to_datetime(_result[date_field_name])
+
+                    _tmp_min = _dates.min()
+                    min_date = min_date(min_date, _tmp_min) if min_date is not None else _tmp_min
+
+                    _tmp_max = _dates.max()
+                    max_date = min_date(max_date, _tmp_max) if max_date is not None else _tmp_max
+                p_bar.update()
+    return min_date, max_date
+
+
+def get_symbols(data_1min_dir: Path):
+    return list(map(lambda x: x.name[:-4].upper(), data_1min_dir.glob("*.csv")))
+
+
+def fill_1min_using_1d(
+    data_1min_dir: [str, Path],
+    qlib_data_1d_dir: [str, Path],
+    max_workers: int = 16,
+    date_field_name: str = "date",
+    symbol_field_name: str = "symbol",
+):
+    """Use 1d data to fill in the missing symbols relative to 1min
+
+    Parameters
+    ----------
+    data_1min_dir: str
+        1min data dir
+    qlib_data_1d_dir: str
+        1d qlib data(bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
+    max_workers: int
+        ThreadPoolExecutor(max_workers), by default 16
+    date_field_name: str
+        date field name, by default date
+    symbol_field_name: str
+        symbol field name, by default symbol
+
+    """
+    data_1min_dir = Path(data_1min_dir).expanduser().resolve()
+    qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
+
+    min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
+    symbols_1min = get_symbols(data_1min_dir)
+
+    qlib.init(provider_uri=str(qlib_data_1d_dir))
+    data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")
+
+    miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
+    if not miss_symbols:
+        logger.warning("More symbols in 1min than 1d, no padding required")
+        return
+
+    logger.info(f"miss_symbols  {len(miss_symbols)}: {miss_symbols}")
+    tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
+    columns = tmp_df.columns
+    _si = tmp_df[symbol_field_name].first_valid_index()
+    is_lower = tmp_df.loc[tmp_df][symbol_field_name].islower()
+    for symbol in tqdm(miss_symbols):
+        if is_lower:
+            symbol = symbol.lower()
+        index_1d = data_1d.loc(axis=0)[symbol.upper()].index
+        index_1min = generate_minutes_calendar_from_daily(index_1d)
+        index_1min.name = date_field_name
+        _df = pd.DataFrame(columns=columns, index=index_1min)
+        _df.reset_index(inplace=True)
+        _df[symbol_field_name] = symbol
+        _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
+
+
+if __name__ == "__main__":
+    fire.Fire(fill_1min_using_1d)
diff --git a/scripts/data_collector/contrib/fill_cn_1min_data/requirements.txt b/scripts/data_collector/contrib/fill_cn_1min_data/requirements.txt
new file mode 100644
index 000000000..057683685
--- /dev/null
+++ b/scripts/data_collector/contrib/fill_cn_1min_data/requirements.txt
@@ -0,0 +1,5 @@
+fire
+pandas
+loguru
+tqdm
+pyqlib
\ No newline at end of file
diff --git a/scripts/data_collector/contrib/README.md b/scripts/data_collector/contrib/future_trading_date_collector/README.md
similarity index 100%
rename from scripts/data_collector/contrib/README.md
rename to scripts/data_collector/contrib/future_trading_date_collector/README.md
diff --git a/scripts/data_collector/contrib/future_trading_date_collector.py b/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
similarity index 98%
rename from scripts/data_collector/contrib/future_trading_date_collector.py
rename to scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
index 4da62d465..8df0a4972 100644
--- a/scripts/data_collector/contrib/future_trading_date_collector.py
+++ b/scripts/data_collector/contrib/future_trading_date_collector/future_trading_date_collector.py
@@ -14,7 +14,7 @@ from loguru import logger
 import baostock as bs
 
 CUR_DIR = Path(__file__).resolve().parent
-sys.path.append(str(CUR_DIR.parent.parent))
+sys.path.append(str(CUR_DIR.parent.parent.parent))
 
 
 from data_collector.utils import generate_minutes_calendar_from_daily
diff --git a/scripts/data_collector/contrib/requirements.txt b/scripts/data_collector/contrib/future_trading_date_collector/requirements.txt
similarity index 100%
rename from scripts/data_collector/contrib/requirements.txt
rename to scripts/data_collector/contrib/future_trading_date_collector/requirements.txt
diff --git a/scripts/data_collector/fund/collector.py b/scripts/data_collector/fund/collector.py
index 10800a7a3..fc06a27e4 100644
--- a/scripts/data_collector/fund/collector.py
+++ b/scripts/data_collector/fund/collector.py
@@ -3,18 +3,13 @@
 
 import abc
 import sys
-import copy
-import time
 import datetime
-import importlib
 import json
 from abc import ABC
 from pathlib import Path
-from typing import Iterable, Type
 
 import fire
 import requests
-import numpy as np
 import pandas as pd
 from loguru import logger
 from dateutil.tz import tzlocal
@@ -38,7 +33,7 @@ class FundCollector(BaseCollector):
         max_workers=4,
         max_collector_count=2,
         delay=0,
-        check_data_length: bool = False,
+        check_data_length: int = None,
         limit_nums: int = None,
     ):
         """
@@ -59,8 +54,8 @@ class FundCollector(BaseCollector):
             start datetime, default None
         end: str
             end datetime, default None
-        check_data_length: bool
-            check data length, by default False
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
             using for debug, by default None
         """
@@ -168,10 +163,7 @@ class FundollectorCN(FundCollector, ABC):
 
 
 class FundCollectorCN1d(FundollectorCN):
-    @property
-    def min_numbers_trading(self):
-        return 252 / 4
-
+    pass
 
 class FundNormalize(BaseNormalize):
     DAILY_FORMAT = "%Y-%m-%d"
@@ -261,7 +253,7 @@ class Run(BaseRun):
         start=None,
         end=None,
         interval="1d",
-        check_data_length=False,
+        check_data_length=None,
         limit_nums=None,
     ):
         """download data from Internet
@@ -278,8 +270,8 @@ class Run(BaseRun):
             start datetime, default "2000-01-01"
         end: str
             end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
-        check_data_length: bool # if this param useful?
-            check data length, by default False
+        check_data_length: int # if this param useful?
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
             using for debug, by default None
 
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index a48c5f16a..16b0a32ba 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -137,7 +137,7 @@ class YahooCollector(BaseCollector):
     def get_data(
         self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
     ) -> pd.DataFrame:
-        @deco_retry(retry_sleep=1)
+        @deco_retry(retry_sleep=self.delay)
         def _get_simple(start_, end_):
             self.sleep()
             _remote_interval = "1m" if interval == self.INTERVAL_1min else interval
@@ -200,10 +200,6 @@ class YahooCollectorCN(YahooCollector, ABC):
 
 
 class YahooCollectorCN1d(YahooCollectorCN):
-    @property
-    def min_numbers_trading(self):
-        return 252 / 4
-
     def download_index_data(self):
         # TODO: from MSN
         _format = "%Y%m%d"
@@ -237,10 +233,6 @@ class YahooCollectorCN1d(YahooCollectorCN):
 
 
 class YahooCollectorCN1min(YahooCollectorCN):
-    @property
-    def min_numbers_trading(self):
-        return 60 * 4 * 5
-
     def download_index_data(self):
         # TODO: 1m
         logger.warning(f"{self.__class__.__name__} {self.interval} does not support: download_index_data")
@@ -269,15 +261,11 @@ class YahooCollectorUS(YahooCollector, ABC):
 
 
 class YahooCollectorUS1d(YahooCollectorUS):
-    @property
-    def min_numbers_trading(self):
-        return 252 / 4
+    pass
 
 
 class YahooCollectorUS1min(YahooCollectorUS):
-    @property
-    def min_numbers_trading(self):
-        return 60 * 6.5 * 5
+    pass
 
 
 class YahooNormalize(BaseNormalize):
@@ -514,7 +502,17 @@ class YahooNormalize1min(YahooNormalize, ABC):
         )
 
     def get_1d_data(self, symbol: str, start: str, end: str) -> pd.DataFrame:
+        """get 1d data
+
+        Returns
+        ------
+            data_1d: pd.DataFrame
+                set(data_1d.columns) - set([self._date_field_name, self._symbol_field_name, "paused", "volume", "factor"]) == {}
+
+        """
         data_1d = YahooCollector.get_data_from_remote(self.symbol_to_yahoo(symbol), interval="1d", start=start, end=end)
+        if not (data_1d is None or data_1d.empty):
+            data_1d = self.data_1d_obj.normalize(data_1d)
         return data_1d
 
     def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -526,13 +524,12 @@ class YahooNormalize1min(YahooNormalize, ABC):
         # get 1d data from yahoo
         _start = pd.Timestamp(df[self._date_field_name].min()).strftime(self.DAILY_FORMAT)
         _end = (pd.Timestamp(df[self._date_field_name].max()) + pd.Timedelta(days=1)).strftime(self.DAILY_FORMAT)
-        data_1d = self.get_1d_data(symbol, _start, _end)
+        data_1d: pd.DataFrame = self.get_1d_data(symbol, _start, _end)
         if data_1d is None or data_1d.empty:
             df["factor"] = 1
             # TODO: np.nan or 1 or 0
             df["paused"] = np.nan
         else:
-            data_1d = self.data_1d_obj.normalize(data_1d)  # type: pd.DataFrame
             # NOTE: volume is np.nan or volume <= 0, paused = 1
             # FIXME: find a more accurate data source
             data_1d["paused"] = 0
@@ -621,12 +618,12 @@ class YahooNormalize1min(YahooNormalize, ABC):
         raise NotImplementedError("rewrite symbol_to_yahoo")
 
     @abc.abstractmethod
-    def _get_1d_calendar_list(self):
+    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
         raise NotImplementedError("rewrite _get_1d_calendar_list")
 
 
 class YahooNormalizeUS:
-    def _get_calendar_list(self):
+    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         # TODO: from MSN
         return get_calendar_list("US_ALL")
 
@@ -638,7 +635,7 @@ class YahooNormalizeUS1d(YahooNormalizeUS, YahooNormalize1d):
 class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1min):
     CONSISTENT_1d = False
 
-    def _get_calendar_list(self):
+    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         # TODO: support 1min
         raise ValueError("Does not support 1min")
 
@@ -650,7 +647,7 @@ class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1min):
 
 
 class YahooNormalizeCN:
-    def _get_calendar_list(self):
+    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         # TODO: from MSN
         return get_calendar_list("ALL")
 
@@ -670,7 +667,7 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
     CONSISTENT_1d = True
     CALC_PAUSED_NUM = True
 
-    def _get_calendar_list(self):
+    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         return self.generate_1min_from_daily(self.calendar_list_1d)
 
     def symbol_to_yahoo(self, symbol):
@@ -680,10 +677,67 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
             symbol = symbol[2:] + "." + _exchange
         return symbol
 
-    def _get_1d_calendar_list(self):
+    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
         return get_calendar_list("ALL")
 
 
+class YahooNormalizeCN1minOffline(YahooNormalizeCN1min):
+    """Normalised to 1min using local 1d data
+    1d data usually from: Normalised to 1min using local 1d data
+    """
+
+    def __init__(
+        self, qlib_data_1d_dir: [str, Path], date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        qlib_data_1d_dir: str, Path
+            the qlib data to be updated for yahoo, usually from: Normalised to 1min using local 1d data
+        date_field_name: str
+            date field name, default is date
+        symbol_field_name: str
+            symbol field name, default is symbol
+        """
+        super(YahooNormalizeCN1minOffline, self).__init__(date_field_name, symbol_field_name)
+        self.qlib_data_1d_dir = qlib_data_1d_dir
+        self._all_1d_data = self._get_all_1d_data()
+
+    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
+        import qlib
+        from qlib.data import D
+
+        qlib.init(provider_uri=self.qlib_data_1d_dir)
+        return list(D.calendar(freq="day"))
+
+    def _get_all_1d_data(self):
+        import qlib
+        from qlib.data import D
+
+        qlib.init(provider_uri=self.qlib_data_1d_dir)
+        df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor"], freq="day")
+        df.reset_index(inplace=True)
+        df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
+        df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
+        return df
+
+    def get_1d_data(self, symbol: str, start: str, end: str) -> pd.DataFrame:
+        """get 1d data
+
+        Returns
+        ------
+            data_1d: pd.DataFrame
+                set(data_1d.columns) - set([self._date_field_name, self._symbol_field_name, "paused", "volume", "factor"]) == {}
+
+        """
+        return self._all_1d_data[
+            (self._all_1d_data[self._symbol_field_name] == symbol.upper())
+            & (self._all_1d_data[self._date_field_name] >= pd.Timestamp(start))
+            & (self._all_1d_data[self._date_field_name] < pd.Timestamp(end))
+        ]
+
+
 class Run(BaseRun):
     def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
         """
@@ -722,7 +776,7 @@ class Run(BaseRun):
         delay=0,
         start=None,
         end=None,
-        check_data_length=False,
+        check_data_length=None,
         limit_nums=None,
     ):
         """download data from Internet
@@ -734,14 +788,21 @@ class Run(BaseRun):
         delay: float
             time.sleep(delay), default 0
         start: str
-            start datetime, default "2000-01-01"
+            start datetime, default "2000-01-01"; closed interval(including start)
         end: str
-            end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
-        check_data_length: bool
-            check data length, by default False
+            end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``; open interval(excluding end)
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
             using for debug, by default None
 
+        Notes
+        -----
+            check_data_length, example:
+                daily, one year: 252 // 4
+                us 1min, a week: 6.5 * 60 * 5
+                cn 1min, a week: 4 * 60 * 5
+
         Examples
         ---------
             # get daily data
@@ -813,6 +874,85 @@ class Run(BaseRun):
         )
         yc.normalize()
 
+    def normalize_data_1min_cn_offline(
+        self, qlib_data_1d_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
+    ):
+        """Normalised to 1min using local 1d data
+
+        Parameters
+        ----------
+        qlib_data_1d_dir: str
+            the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
+        date_field_name: str
+            date field name, default date
+        symbol_field_name: str
+            symbol field name, default symbol
+
+        Examples
+        ---------
+            $ python collector.py normalize_data_1min_cn_offline --qlib_data_1d_dir ~/.qlib/qlib_data/cn_1d --source_dir ~/.qlib/stock_data/source_cn_1min --normalize_dir ~/.qlib/stock_data/normalize_cn_1min --region CN --interval 1min
+        """
+        _class = getattr(self._cur_module, f"{self.normalize_class_name}Offline")
+        yc = Normalize(
+            source_dir=self.source_dir,
+            target_dir=self.normalize_dir,
+            normalize_class=_class,
+            max_workers=self.max_workers,
+            date_field_name=date_field_name,
+            symbol_field_name=symbol_field_name,
+            qlib_data_1d_dir=qlib_data_1d_dir,
+        )
+        yc.normalize()
+
+    def download_today_data(
+        self,
+        max_collector_count=2,
+        delay=0,
+        check_data_length=None,
+        limit_nums=None,
+    ):
+        """download today data from Internet
+
+        Parameters
+        ----------
+        max_collector_count: int
+            default 2
+        delay: float
+            time.sleep(delay), default 0
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
+        limit_nums: int
+            using for debug, by default None
+
+        Notes
+        -----
+            Download today's data:
+                start_time = datetime.datetime.now().date(); closed interval(including start)
+                end_time = pd.Timestamp(start_time + pd.Timedelta(days=1)).date(); open interval(excluding end)
+
+            check_data_length, example:
+                daily, one year: 252 // 4
+                us 1min, a week: 6.5 * 60 * 5
+                cn 1min, a week: 4 * 60 * 5
+
+        Examples
+        ---------
+            # get daily data
+            $ python collector.py download_today_data --source_dir ~/.qlib/stock_data/source --region CN --delay 0.1 --interval 1d
+            # get 1m data
+            $ python collector.py download_today_data --source_dir ~/.qlib/stock_data/source --region CN --delay 0.1 --interval 1m
+        """
+        start = datetime.datetime.now().date()
+        end = pd.Timestamp(start + pd.Timedelta(days=1)).date()
+        self.download_data(
+            max_collector_count,
+            delay,
+            start.strftime("%Y-%m-%d"),
+            end.strftime("%Y-%m-%d"),
+            check_data_length,
+            limit_nums,
+        )
+
 
 if __name__ == "__main__":
     fire.Fire(Run)

From 03eb0882de52b9c8476fd8456185e99b41e11dc5 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Tue, 8 Jun 2021 22:23:05 +0800
Subject: [PATCH 04/14] fix YahooNormalizeCN1minOffline bugs

---
 .../fill_cn_1min_data/fill_cn_1min_data.py    | 10 ++++----
 scripts/data_collector/yahoo/collector.py     | 23 +++++--------------
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py b/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
index 4abca3361..0a721298d 100644
--- a/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
+++ b/scripts/data_collector/contrib/fill_cn_1min_data/fill_cn_1min_data.py
@@ -28,10 +28,9 @@ def get_date_range(data_1min_dir: Path, max_workers: int = 16, date_field_name:
                     _dates = pd.to_datetime(_result[date_field_name])
 
                     _tmp_min = _dates.min()
-                    min_date = min_date(min_date, _tmp_min) if min_date is not None else _tmp_min
-
+                    min_date = min(min_date, _tmp_min) if min_date is not None else _tmp_min
                     _tmp_max = _dates.max()
-                    max_date = min_date(max_date, _tmp_max) if max_date is not None else _tmp_max
+                    max_date = max(max_date, _tmp_max) if max_date is not None else _tmp_max
                 p_bar.update()
     return min_date, max_date
 
@@ -81,7 +80,7 @@ def fill_1min_using_1d(
     tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
     columns = tmp_df.columns
     _si = tmp_df[symbol_field_name].first_valid_index()
-    is_lower = tmp_df.loc[tmp_df][symbol_field_name].islower()
+    is_lower = tmp_df.loc[_si][symbol_field_name].islower()
     for symbol in tqdm(miss_symbols):
         if is_lower:
             symbol = symbol.lower()
@@ -89,8 +88,11 @@ def fill_1min_using_1d(
         index_1min = generate_minutes_calendar_from_daily(index_1d)
         index_1min.name = date_field_name
         _df = pd.DataFrame(columns=columns, index=index_1min)
+        if date_field_name in _df.columns:
+            del _df[date_field_name]
         _df.reset_index(inplace=True)
         _df[symbol_field_name] = symbol
+        _df["paused_num"] = 0
         _df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
 
 
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 16b0a32ba..58e1d3009 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -473,21 +473,6 @@ class YahooNormalize1min(YahooNormalize, ABC):
     CONSISTENT_1d = False
     CALC_PAUSED_NUM = False
 
-    def __init__(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
-        """
-
-        Parameters
-        ----------
-        date_field_name: str
-            date field name, default is date
-        symbol_field_name: str
-            symbol field name, default is symbol
-        """
-        super(YahooNormalize1min, self).__init__(date_field_name, symbol_field_name)
-        _class_name = self.__class__.__name__.replace("min", "d")
-        _class = getattr(importlib.import_module("collector"), _class_name)  # type: Type[YahooNormalize]
-        self.data_1d_obj = _class(self._date_field_name, self._symbol_field_name)
-
     @property
     def calendar_list_1d(self):
         calendar_list_1d = getattr(self, "_calendar_list_1d", None)
@@ -512,7 +497,10 @@ class YahooNormalize1min(YahooNormalize, ABC):
         """
         data_1d = YahooCollector.get_data_from_remote(self.symbol_to_yahoo(symbol), interval="1d", start=start, end=end)
         if not (data_1d is None or data_1d.empty):
-            data_1d = self.data_1d_obj.normalize(data_1d)
+            _class_name = self.__class__.__name__.replace("min", "d")
+            _class: type(YahooNormalize) = getattr(importlib.import_module("collector"), _class_name)
+            data_1d_obj = _class(self._date_field_name, self._symbol_field_name)
+            data_1d = data_1d_obj.normalize(data_1d)
         return data_1d
 
     def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -525,6 +513,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         _start = pd.Timestamp(df[self._date_field_name].min()).strftime(self.DAILY_FORMAT)
         _end = (pd.Timestamp(df[self._date_field_name].max()) + pd.Timedelta(days=1)).strftime(self.DAILY_FORMAT)
         data_1d: pd.DataFrame = self.get_1d_data(symbol, _start, _end)
+        data_1d = data_1d.copy()
         if data_1d is None or data_1d.empty:
             df["factor"] = 1
             # TODO: np.nan or 1 or 0
@@ -700,8 +689,8 @@ class YahooNormalizeCN1minOffline(YahooNormalizeCN1min):
         symbol_field_name: str
             symbol field name, default is symbol
         """
-        super(YahooNormalizeCN1minOffline, self).__init__(date_field_name, symbol_field_name)
         self.qlib_data_1d_dir = qlib_data_1d_dir
+        super(YahooNormalizeCN1minOffline, self).__init__(date_field_name, symbol_field_name)
         self._all_1d_data = self._get_all_1d_data()
 
     def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:

From 9a44fbf9c1e797fe99c9ef283586544347eede0d Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Tue, 8 Jun 2021 22:52:31 +0800
Subject: [PATCH 05/14] fix PEP8: qlib/scripts/data_collector/fund/collector.py

---
 scripts/data_collector/fund/collector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/data_collector/fund/collector.py b/scripts/data_collector/fund/collector.py
index fc06a27e4..7b5566f72 100644
--- a/scripts/data_collector/fund/collector.py
+++ b/scripts/data_collector/fund/collector.py
@@ -165,6 +165,7 @@ class FundollectorCN(FundCollector, ABC):
 class FundCollectorCN1d(FundollectorCN):
     pass
 
+
 class FundNormalize(BaseNormalize):
     DAILY_FORMAT = "%Y-%m-%d"
 

From b4efbd53b2f8889b984a5f283e8d62cd3ecf1976 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Wed, 16 Jun 2021 22:00:43 +0800
Subject: [PATCH 06/14] Fix 'report' compatibility with matplotlib versions

---
 .../analysis_model_performance.py             | 35 ++++++++++++++++---
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/qlib/contrib/report/analysis_model/analysis_model_performance.py b/qlib/contrib/report/analysis_model/analysis_model_performance.py
index 1cb14d261..1d444b104 100644
--- a/qlib/contrib/report/analysis_model/analysis_model_performance.py
+++ b/qlib/contrib/report/analysis_model/analysis_model_performance.py
@@ -3,7 +3,6 @@
 
 import pandas as pd
 
-import plotly.tools as tls
 import plotly.graph_objs as go
 
 import statsmodels.api as sm
@@ -80,9 +79,37 @@ def _plot_qq(data: pd.Series = None, dist=stats.norm) -> go.Figure:
     :param dist:
     :return:
     """
-    fig, ax = plt.subplots(figsize=(8, 5))
-    _mpl_fig = sm.qqplot(data.dropna(), dist, fit=True, line="45", ax=ax)
-    return tls.mpl_to_plotly(_mpl_fig)
+    _plt_fig = sm.qqplot(data.dropna(), dist=dist, fit=True, line="45")
+    plt.close(_plt_fig)
+    qqplot_data = _plt_fig.gca().lines
+    fig = go.Figure()
+
+    fig.add_trace({
+        'type': 'scatter',
+        'x': qqplot_data[0].get_xdata(),
+        # 'x': [0, 1],
+        'y': qqplot_data[0].get_ydata(),
+        # 'y': [1, 2],
+        'mode': 'markers',
+        'marker': {
+            'color': '#19d3f3'
+        }
+    })
+
+    fig.add_trace({
+        'type': 'scatter',
+        'x': qqplot_data[1].get_xdata(),
+        # 'x': [0, 1],
+        'y': qqplot_data[1].get_ydata(),
+        # 'y': [1, 2],
+        'mode': 'lines',
+        'line': {
+            'color': '#636efa'
+        }
+
+    })
+    del qqplot_data
+    return fig
 
 
 def _pred_ic(pred_label: pd.DataFrame = None, rank: bool = False, **kwargs) -> tuple:

From a4f6e0419943428def7d5bd12958d065d07ecc9f Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Thu, 17 Jun 2021 22:33:31 +0800
Subject: [PATCH 07/14] modify dump_update starts with the last end date of
 each symbol

---
 scripts/dump_bin.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py
index b3a18cc90..83daa28bc 100644
--- a/scripts/dump_bin.py
+++ b/scripts/dump_bin.py
@@ -401,6 +401,8 @@ class DumpDataUpdate(DumpDataBase):
         )
         self._mode = self.UPDATE_MODE
         self._old_calendar_list = self._read_calendars(self._calendars_dir.joinpath(f"{self.freq}.txt"))
+        # NOTE: all.txt only exists once for each stock
+        # NOTE: if a stock corresponds to multiple different time ranges, user need to modify self._update_instruments
         self._update_instruments = (
             self._read_instruments(self._instruments_dir.joinpath(self.INSTRUMENTS_FILE_NAME))
             .set_index([self.symbol_field_name])
@@ -409,10 +411,9 @@ class DumpDataUpdate(DumpDataBase):
 
         # load all csv files
         self._all_data = self._load_all_source_data()  # type: pd.DataFrame
-        self._update_calendars = sorted(
+        self._new_calendar_list = self._old_calendar_list + sorted(
             filter(lambda x: x > self._old_calendar_list[-1], self._all_data[self.date_field_name].unique())
         )
-        self._new_calendar_list = self._old_calendar_list + self._update_calendars
 
     def _load_all_source_data(self):
         # NOTE: Need more memory
@@ -452,8 +453,16 @@ class DumpDataUpdate(DumpDataBase):
                 if not (isinstance(_start, pd.Timestamp) and isinstance(_end, pd.Timestamp)):
                     continue
                 if _code in self._update_instruments:
+                    # exists stock, will append data
+                    _update_calendars = (
+                        _df[_df[self.date_field_name] > self._update_instruments[_code][self.INSTRUMENTS_START_FIELD]][
+                            self.date_field_name
+                        ]
+                        .sort_values()
+                        .to_list()
+                    )
                     self._update_instruments[_code][self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end)
-                    futures[executor.submit(self._dump_bin, _df, self._update_calendars)] = _code
+                    futures[executor.submit(self._dump_bin, _df, _update_calendars)] = _code
                 else:
                     # new stock
                     _dt_range = self._update_instruments.setdefault(_code, dict())

From b6c31540e8c8bd7559b58bb6e4e268e9f91d32d5 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Thu, 17 Jun 2021 23:01:08 +0800
Subject: [PATCH 08/14] add function to automatically update daily frequency
 data

---
 README.md                                    | 22 +++++
 docs/component/data.rst                      | 28 ++++++
 scripts/data_collector/cn_index/collector.py |  2 +-
 scripts/data_collector/us_index/collector.py |  2 +-
 scripts/data_collector/yahoo/README.md       | 61 +++++++++++--
 scripts/data_collector/yahoo/collector.py    | 95 +++++++++++++++++---
 6 files changed, 189 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 8276c4951..635b143f4 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,28 @@ Users could create the same dataset with it.
 *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup), and the data might not be perfect.
 We recommend users to prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*.
 
+### Automatic update of daily frequency data(from yahoo finance)
+  > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
+
+  > For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#Automatic-update-of-daily-frequency-data)
+
+  * Automatic update of data to the "qlib" directory each trading day(Linux)
+      * use *crontab*: `crontab -e`
+      * set up timed tasks:
+
+        ```
+        * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
+        ```
+        * **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+
+  * Manual update of data
+      ```
+      python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+      ```
+      * *trading_date*: start of trading day
+      * *end_date*: end of trading day(not included)
+
+
 <!-- 
 - Run the initialization code and get stock data:
 
diff --git a/docs/component/data.rst b/docs/component/data.rst
index 0a650c523..e399a41d7 100644
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -67,6 +67,34 @@ After running the above command, users can find china-stock and us-stock data in
 
 When ``Qlib`` is initialized with this dataset, users could build and evaluate their own models with it.  Please refer to `Initialization <../start/initialization.html>`_ for more details.
 
+Automatic update of daily frequency data
+----------------------------------------
+
+  **It is recommended that users update the data manually once (\-\-trading_date 2021-05-25) and then set it to update automatically.**
+
+  For more information refer to: `yahoo collector <https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#Automatic-update-of-daily-frequency-data>`_
+
+  - Automatic update of data to the "qlib" directory each trading day(Linux)
+      - use *crontab*: `crontab -e`
+      - set up timed tasks:
+
+        .. code-block:: bash
+
+            * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
+
+        - **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+
+  - Manual update of data
+
+      .. code-block:: bash
+
+        python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+
+      - *trading_date*: start of trading day
+      - *end_date*: end of trading day(not included)
+
+
+
 Converting CSV Format into Qlib Format
 -------------------------------------------
 
diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py
index 1f8434c58..63b2ebd9e 100644
--- a/scripts/data_collector/cn_index/collector.py
+++ b/scripts/data_collector/cn_index/collector.py
@@ -295,7 +295,7 @@ def get_instruments(
         $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
 
     """
-    _cur_module = importlib.import_module("collector")
+    _cur_module = importlib.import_module("data_collector.cn_index.collector")
     obj = getattr(_cur_module, f"{index_name.upper()}")(
         qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
     )
diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py
index 371668330..6947baa9a 100644
--- a/scripts/data_collector/us_index/collector.py
+++ b/scripts/data_collector/us_index/collector.py
@@ -271,7 +271,7 @@ def get_instruments(
         $ python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
 
     """
-    _cur_module = importlib.import_module("collector")
+    _cur_module = importlib.import_module("data_collector.us_index.collector")
     obj = getattr(_cur_module, f"{index_name.upper()}Index")(
         qlib_dir=qlib_dir, index_name=index_name, request_retry=request_retry, retry_sleep=retry_sleep
     )
diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index b9fd9123c..6fa4b1937 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -1,3 +1,19 @@
+
+- [Collector Data](#collector-data)
+  - [Automatic update data](#automatic-update-of-daily-frequency-data(from-yahoo-finance))
+  - [CN Data](#CN-Data)
+    - [1d from yahoo](#1d-from-yahoocn)
+    - [1d from qlib](#1d-from-qlibcn)
+    - [using data(1d)](#using-data1d-cn)
+    - [1min from yahoo](#1min-from-yahoocn)
+    - [1min from qlib](#1min-from-qlibcn)
+    - [using data(1min)](#using-data1min-cn)
+  - [US Data](#CN-Data)
+    - [1d from yahoo](#1d-from-yahoous)
+    - [1d from qlib](#1d-from-qlibus)
+    - [using data(1d)](#using-data1d-us)
+
+
 # Collect Data From Yahoo Finance
 
 > *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and the data might not be perfect. We recommend users to prepare their own data if they have high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
@@ -18,10 +34,37 @@ pip install -r requirements.txt
 
 ## Collector Data
 
+### Automatic update of daily frequency data(from yahoo finance)
+  > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
+
+  * Automatic update of data to the "qlib" directory each trading day(Linux)
+      * use *crontab*: `crontab -e`
+      * set up timed tasks:
+
+        ```
+        * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
+        ```
+        * **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+
+  * Manual update of data
+      ```
+      python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+      ```
+      * *trading_date*: start of trading day
+      * *end_date*: end of trading day(not included)
+
+  * qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin parameters:
+      * *source_dir*: The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
+      * *normalize_dir*: Directory for normalize data, default "Path(__file__).parent/normalize"
+      * *qlib_data_1d_dir*: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
+      * *trading_date*: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
+      * *end_date*: end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval(excluding end)
+      * *region*: region, value from ["CN", "US"], default "CN"
+
 
 ### CN Data
 
-#### 1d from yahoo
+#### 1d from yahoo(CN)
 
 ```bash
 
@@ -37,12 +80,12 @@ python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qli
 
 ```
 
-### 1d from qlib
+### 1d from qlib(CN)
 ```bash
 python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn
 ```
 
-### using data
+### using data(1d CN)
 
 ```python
 import qlib
@@ -52,7 +95,7 @@ qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn")
 df = D.features(D.instruments("all"), ["$close"], freq="day")
 ```
 
-#### 1min from yahoo
+#### 1min from yahoo(CN)
 
 ```bash
 
@@ -67,12 +110,12 @@ cd qlib/scripts
 python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
 ```
 
-### 1min from qlib
+### 1min from qlib(CN)
 ```bash
 python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --interval 1min --region cn
 ```
 
-### using data
+### using data(1min CN)
 
 ```python
 import qlib
@@ -85,7 +128,7 @@ df = D.features(D.instruments("all"), ["$close"], freq="1min")
 
 ### US Data
 
-#### 1d from yahoo
+#### 1d from yahoo(US)
 
 ```bash
 
@@ -100,13 +143,13 @@ cd qlib/scripts
 python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
 ```
 
-#### 1d from qlib
+#### 1d from qlib(US)
 
 ```bash
 python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us
 ```
 
-### using data
+### using data(1d US)
 
 ```python
 # using
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 58e1d3009..f9a209168 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -9,7 +9,7 @@ import datetime
 import importlib
 from abc import ABC
 from pathlib import Path
-from typing import Iterable, Type
+from typing import Iterable
 
 import fire
 import requests
@@ -18,11 +18,15 @@ import pandas as pd
 from loguru import logger
 from yahooquery import Ticker
 from dateutil.tz import tzlocal
-from qlib.utils import code_to_fname, fname_to_code
+
+from qlib.tests.data import GetData
+from qlib.utils import code_to_fname, fname_to_code, exists_qlib_data
 from qlib.config import REG_CN as REGION_CN
 
 CUR_DIR = Path(__file__).resolve().parent
 sys.path.append(str(CUR_DIR.parent.parent))
+
+from dump_bin import DumpDataUpdate
 from data_collector.base import BaseCollector, BaseNormalize, BaseRun, Normalize
 from data_collector.utils import (
     deco_retry,
@@ -153,7 +157,10 @@ class YahooCollector(BaseCollector):
 
         _result = None
         if interval == self.INTERVAL_1d:
-            _result = _get_simple(start_datetime, end_datetime)
+            try:
+                _result = _get_simple(start_datetime, end_datetime)
+            except ValueError as e:
+                pass
         elif interval == self.INTERVAL_1min:
             _res = []
             _start = self.start_datetime
@@ -184,7 +191,7 @@ class YahooCollector(BaseCollector):
 
 class YahooCollectorCN(YahooCollector, ABC):
     def get_instrument_list(self):
-        logger.info("get HS stock symbos......")
+        logger.info("get HS stock symbols......")
         symbols = get_hs_stock_symbols()
         logger.info(f"get {len(symbols)} symbols.")
         return symbols
@@ -233,9 +240,9 @@ class YahooCollectorCN1d(YahooCollectorCN):
 
 
 class YahooCollectorCN1min(YahooCollectorCN):
-    def download_index_data(self):
-        # TODO: 1m
-        logger.warning(f"{self.__class__.__name__} {self.interval} does not support: download_index_data")
+    def get_instrument_list(self):
+        symbols = super(YahooCollectorCN1min, self).get_instrument_list()
+        return symbols + ["000300.ss", "000905.ss", "00903.ss"]
 
 
 class YahooCollectorUS(YahooCollector, ABC):
@@ -450,10 +457,12 @@ class YahooNormalize1dExtend(YahooNormalize1d):
             _max_date = df.index.max()
             df = df.reindex(self._calendar_list).loc[:_max_date].reset_index()
             df = df[df[self._date_field_name] > _last_date]
+            if df.empty:
+                return pd.DataFrame()
             _si = df["close"].first_valid_index()
             if _si > df.index[0]:
                 logger.warning(
-                    f"{df.iloc[0][self._symbol_field_name]} missing data: {df.loc[:_si][self._date_field_name]}"
+                    f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si-1][self._date_field_name].to_list()}"
                 )
         # normalize
         df = self.normalize_yahoo(
@@ -661,7 +670,7 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
 
     def symbol_to_yahoo(self, symbol):
         if "." not in symbol:
-            _exchange = symbol[:2]
+            _exchange = symbol[:2].lower()
             _exchange = "ss" if _exchange == "sh" else _exchange
             symbol = symbol[2:] + "." + _exchange
         return symbol
@@ -864,7 +873,7 @@ class Run(BaseRun):
         yc.normalize()
 
     def normalize_data_1min_cn_offline(
-        self, qlib_data_1d_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
+        self, qlib_data_1d_dir: str, date_field_name: str = "date", symbol_field_name: str = "symbol"
     ):
         """Normalised to 1min using local 1d data
 
@@ -942,6 +951,72 @@ class Run(BaseRun):
             limit_nums,
         )
 
+    def update_data_to_bin(self, qlib_data_1d_dir: str, trading_date: str = None, end_date: str = None):
+        """update yahoo data to bin
+
+        Parameters
+        ----------
+        qlib_data_1d_dir: str
+            the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
+
+        trading_date: str
+            trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
+        end_date: str
+            end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval(excluding end)
+
+        Notes
+        -----
+            If the data in qlib_data_dir is incomplete, np.nan will be populated to trading_date for the previous trading day
+
+        Examples
+        -------
+            $ python collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+            # get 1m data
+        """
+
+        if self.interval.lower() != "1d":
+            logger.warning(f"currently supports 1d data updates: --interval 1d")
+
+        # start/end date
+        if trading_date is None:
+            trading_date = datetime.datetime.now().strftime("%Y-%m-%d")
+            logger.warning(f"trading_date is None, use the current date: {trading_date}")
+
+        if end_date is None:
+            end_date = (pd.Timestamp(trading_date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
+
+        # download qlib 1d data
+        qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
+        if not exists_qlib_data(qlib_data_1d_dir):
+            GetData().qlib_data(target_dir=qlib_data_1d_dir, interval=self.interval, region=self.region)
+
+        # download data from yahoo
+        self.download_data(delay=1, start=trading_date, end=end_date, check_data_length=1)
+
+        # normalize data
+        self.normalize_data_1d_extend(str(qlib_data_1d_dir))
+
+        # dump bin
+        _dump = DumpDataUpdate(
+            csv_path=self.normalize_dir,
+            qlib_dir=qlib_data_1d_dir,
+            exclude_fields="symbol,date",
+            max_workers=self.max_workers,
+        )
+        _dump.dump()
+
+        # parse index
+        _region = self.region.lower()
+        if _region not in ["cn", "us"]:
+            logger.warning(f"Unsupported region: region={_region}, component downloads will be ignored")
+            return
+        index_list = ["CSI100", "CSI300"] if _region == "cn" else ["SP500", "NASDAQ100", "DJIA", "SP400"]
+        get_instruments = getattr(
+            importlib.import_module(f"data_collector.{_region}_index.collector"), "get_instruments"
+        )
+        for _index in index_list:
+            get_instruments(str(qlib_data_1d_dir), _index)
+
 
 if __name__ == "__main__":
     fire.Fire(Run)

From 985fd0816cd81cccaa315f25bda871dfa9c3c526 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Mon, 21 Jun 2021 17:18:04 +0800
Subject: [PATCH 09/14] Fix cn_index.collector network error

---
 scripts/data_collector/cn_index/collector.py | 27 +++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py
index 63b2ebd9e..a529f61df 100644
--- a/scripts/data_collector/cn_index/collector.py
+++ b/scripts/data_collector/cn_index/collector.py
@@ -19,7 +19,7 @@ CUR_DIR = Path(__file__).resolve().parent
 sys.path.append(str(CUR_DIR.parent.parent))
 
 from data_collector.index import IndexBase
-from data_collector.utils import get_calendar_list, get_trading_date_by_shift
+from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
 
 
 NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index_code}cons.xls"
@@ -29,6 +29,22 @@ NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index
 # 2020-11-27 Announcement title change
 INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89"
 
+REQ_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
+}
+
+
+@deco_retry
+def retry_request(url: str, method: str = "get", exclude_status: List = None):
+    if exclude_status is None:
+        exclude_status = []
+    method_func = getattr(requests, method)
+    _resp = method_func(url, headers=REQ_HEADERS)
+    _status = _resp.status_code
+    if _status not in exclude_status and _status != 200:
+        raise ValueError(f"response status: {_status}, url={url}")
+    return _resp
+
 
 class CSIIndex(IndexBase):
     @property
@@ -137,9 +153,8 @@ class CSIIndex(IndexBase):
                 date: pd.Timestamp
                 type: str, value from ["add", "remove"]
         """
-        resp = requests.get(url)
+        resp = retry_request(url)
         _text = resp.text
-
         date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
         if len(date_list) >= 2:
             add_date = pd.Timestamp("-".join(date_list[0]))
@@ -150,7 +165,7 @@ class CSIIndex(IndexBase):
         logger.info(f"get {add_date} changes")
         try:
             excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
-            content = requests.get(f"http://www.csindex.com.cn{excel_url}").content
+            content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content
             _io = BytesIO(content)
             df_map = pd.read_excel(_io, sheet_name=None)
             with self.cache_dir.joinpath(
@@ -204,7 +219,7 @@ class CSIIndex(IndexBase):
         -------
             [url1, url2]
         """
-        resp = requests.get(self.changes_url)
+        resp = retry_request(self.changes_url)
         html = etree.HTML(resp.text)
         return html.xpath("//*[@id='itemContainer']//li/a/@href")
 
@@ -224,7 +239,7 @@ class CSIIndex(IndexBase):
                 end_date: pd.Timestamp
         """
         logger.info("get new companies......")
-        context = requests.get(self.new_companies_url).content
+        context = retry_request(self.new_companies_url).content
         with self.cache_dir.joinpath(
             f"{self.index_name.lower()}_new_companies.{self.new_companies_url.split('.')[-1]}"
         ).open("wb") as fp:

From 99fb49650ac24b59f9f2631ac16827812b1eddbe Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Mon, 21 Jun 2021 17:20:37 +0800
Subject: [PATCH 10/14] add end_date parameter to collector.normalize_data

---
 scripts/data_collector/base.py            |  9 +++++++--
 scripts/data_collector/utils.py           |  5 ++---
 scripts/data_collector/yahoo/collector.py | 13 +++++++------
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index 08e1838a4..ce94f1783 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -238,7 +238,7 @@ class BaseNormalize(abc.ABC):
         """
         self._date_field_name = date_field_name
         self._symbol_field_name = symbol_field_name
-
+        self.kwargs = kwargs
         self._calendar_list = self._get_calendar_list()
 
     @abc.abstractmethod
@@ -285,7 +285,9 @@ class Normalize:
         self._source_dir = Path(source_dir).expanduser()
         self._target_dir = Path(target_dir).expanduser()
         self._target_dir.mkdir(parents=True, exist_ok=True)
-
+        self._date_field_name = date_field_name
+        self._symbol_field_name = symbol_field_name
+        self._end_date = kwargs.get("end_date", None)
         self._max_workers = max_workers
 
         self._normalize_obj = normalize_class(
@@ -297,6 +299,9 @@ class Normalize:
         df = pd.read_csv(file_path)
         df = self._normalize_obj.normalize(df)
         if df is not None and not df.empty:
+            if self._end_date is not None:
+                _mask = pd.to_datetime(df[self._date_field_name]) <= pd.Timestamp(self._end_date)
+                df = df[_mask]
             df.to_csv(self._target_dir.joinpath(file_path.name), index=False)
 
     def normalize(self):
diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index 3f4539612..1a8d479d9 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -2,7 +2,6 @@
 #  Licensed under the MIT License.
 
 import re
-import os
 import time
 import bisect
 import pickle
@@ -10,7 +9,7 @@ import random
 import requests
 import functools
 from pathlib import Path
-from typing import Iterable, Tuple
+from typing import Iterable, Tuple, List
 
 import numpy as np
 import pandas as pd
@@ -47,7 +46,7 @@ _CALENDAR_MAP = {}
 MINIMUM_SYMBOLS_NUM = 3900
 
 
-def get_calendar_list(bench_code="CSI300") -> list:
+def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
     """get SH/SZ history calendar list
 
     Parameters
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index f9a209168..44cfce7ca 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -379,14 +379,13 @@ class YahooNormalize1d(YahooNormalize, ABC):
         df = df.set_index(self._date_field_name)
         _close = self._get_first_close(df)
         for _col in df.columns:
-            if _col == self._symbol_field_name:
+            # NOTE: retain original adjclose, required for incremental updates
+            if _col in [self._symbol_field_name, "adjclose", "change"]:
                 continue
             if _col == "volume":
                 df[_col] = df[_col] * _close
-            elif _col != "change":
-                df[_col] = df[_col] / _close
             else:
-                pass
+                df[_col] = df[_col] / _close
         return df.reset_index()
 
 
@@ -812,7 +811,7 @@ class Run(BaseRun):
             max_collector_count, delay, start, end, self.interval, check_data_length, limit_nums
         )
 
-    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"):
+    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol", end_date: str = None):
         """normalize data
 
         Parameters
@@ -821,12 +820,14 @@ class Run(BaseRun):
             date field name, default date
         symbol_field_name: str
             symbol field name, default symbol
+        end_date: str
+            if not None, normalize the last date saved (including end_date); if None, it will ignore this parameter; by default None
 
         Examples
         ---------
             $ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region cn --interval 1d
         """
-        super(Run, self).normalize_data(date_field_name, symbol_field_name)
+        super(Run, self).normalize_data(date_field_name, symbol_field_name, end_date=end_date)
 
     def normalize_data_1d_extend(
         self, old_qlib_data_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"

From 46714adf4c14ea628b46d163070b9bd18b3f52a0 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Tue, 22 Jun 2021 11:15:09 +0800
Subject: [PATCH 11/14] modify the YahooNormalize1min factor calculation

---
 .../analysis_model_performance.py             |  41 ++--
 scripts/data_collector/yahoo/collector.py     | 201 +++++++++---------
 2 files changed, 117 insertions(+), 125 deletions(-)

diff --git a/qlib/contrib/report/analysis_model/analysis_model_performance.py b/qlib/contrib/report/analysis_model/analysis_model_performance.py
index 1d444b104..ddf97fb36 100644
--- a/qlib/contrib/report/analysis_model/analysis_model_performance.py
+++ b/qlib/contrib/report/analysis_model/analysis_model_performance.py
@@ -84,30 +84,29 @@ def _plot_qq(data: pd.Series = None, dist=stats.norm) -> go.Figure:
     qqplot_data = _plt_fig.gca().lines
     fig = go.Figure()
 
-    fig.add_trace({
-        'type': 'scatter',
-        'x': qqplot_data[0].get_xdata(),
-        # 'x': [0, 1],
-        'y': qqplot_data[0].get_ydata(),
-        # 'y': [1, 2],
-        'mode': 'markers',
-        'marker': {
-            'color': '#19d3f3'
+    fig.add_trace(
+        {
+            "type": "scatter",
+            "x": qqplot_data[0].get_xdata(),
+            # 'x': [0, 1],
+            "y": qqplot_data[0].get_ydata(),
+            # 'y': [1, 2],
+            "mode": "markers",
+            "marker": {"color": "#19d3f3"},
         }
-    })
+    )
 
-    fig.add_trace({
-        'type': 'scatter',
-        'x': qqplot_data[1].get_xdata(),
-        # 'x': [0, 1],
-        'y': qqplot_data[1].get_ydata(),
-        # 'y': [1, 2],
-        'mode': 'lines',
-        'line': {
-            'color': '#636efa'
+    fig.add_trace(
+        {
+            "type": "scatter",
+            "x": qqplot_data[1].get_xdata(),
+            # 'x': [0, 1],
+            "y": qqplot_data[1].get_ydata(),
+            # 'y': [1, 2],
+            "mode": "lines",
+            "line": {"color": "#636efa"},
         }
-
-    })
+    )
     del qqplot_data
     return fig
 
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 44cfce7ca..b474d3924 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -478,8 +478,8 @@ class YahooNormalize1min(YahooNormalize, ABC):
     PM_RANGE = None  # type: tuple  # eg: ("13:00:00", "14:59:00")
 
     # Whether the trading day of 1min data is consistent with 1d
-    CONSISTENT_1d = False
-    CALC_PAUSED_NUM = False
+    CONSISTENT_1d = True
+    CALC_PAUSED_NUM = True
 
     @property
     def calendar_list_1d(self):
@@ -500,7 +500,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         Returns
         ------
             data_1d: pd.DataFrame
-                set(data_1d.columns) - set([self._date_field_name, self._symbol_field_name, "paused", "volume", "factor"]) == {}
+                data_1d.columns = [self._date_field_name, self._symbol_field_name, "paused", "volume", "factor", "close"]
 
         """
         data_1d = YahooCollector.get_data_from_remote(self.symbol_to_yahoo(symbol), interval="1d", start=start, end=end)
@@ -516,6 +516,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         if df.empty:
             return df
         df = df.copy()
+        df = df.sort_values(self._date_field_name)
         symbol = df.iloc[0][self._symbol_field_name]
         # get 1d data from yahoo
         _start = pd.Timestamp(df[self._date_field_name].min()).strftime(self.DAILY_FORMAT)
@@ -523,7 +524,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         data_1d: pd.DataFrame = self.get_1d_data(symbol, _start, _end)
         data_1d = data_1d.copy()
         if data_1d is None or data_1d.empty:
-            df["factor"] = 1
+            df["factor"] = 1 / df.loc[df["close"].first_valid_index()]
             # TODO: np.nan or 1 or 0
             df["paused"] = np.nan
         else:
@@ -534,9 +535,13 @@ class YahooNormalize1min(YahooNormalize, ABC):
             data_1d = data_1d.set_index(self._date_field_name)
 
             # add factor from 1d data
+            # NOTE: yahoo 1d data info:
+            #   - Close price adjusted for splits. Adjusted close price adjusted for both dividends and splits.
+            #   - data_1d.adjclose: Adjusted close price adjusted for both dividends and splits.
+            #   - data_1d.close: `data_1d.adjclose / (close for the first trading day that is not np.nan)`
             df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
             df.set_index("date_tmp", inplace=True)
-            df.loc[:, "factor"] = data_1d["factor"]
+            df.loc[:, "factor"] = data_1d["close"] / df["close"]
             df.loc[:, "paused"] = data_1d["paused"]
             df.reset_index("date_tmp", drop=True, inplace=True)
 
@@ -619,6 +624,61 @@ class YahooNormalize1min(YahooNormalize, ABC):
         raise NotImplementedError("rewrite _get_1d_calendar_list")
 
 
+class YahooNormalize1minOffline(YahooNormalize1min):
+    """Normalised to 1min using local 1d data"""
+
+    def __init__(
+        self, qlib_data_1d_dir: [str, Path], date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        qlib_data_1d_dir: str, Path
+            the qlib data to be updated for yahoo, usually from: Normalised to 1min using local 1d data
+        date_field_name: str
+            date field name, default is date
+        symbol_field_name: str
+            symbol field name, default is symbol
+        """
+        self.qlib_data_1d_dir = qlib_data_1d_dir
+        super(YahooNormalize1minOffline, self).__init__(date_field_name, symbol_field_name)
+        self._all_1d_data = self._get_all_1d_data()
+
+    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
+        import qlib
+        from qlib.data import D
+
+        qlib.init(provider_uri=self.qlib_data_1d_dir)
+        return list(D.calendar(freq="day"))
+
+    def _get_all_1d_data(self):
+        import qlib
+        from qlib.data import D
+
+        qlib.init(provider_uri=self.qlib_data_1d_dir)
+        df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor", "$close"], freq="day")
+        df.reset_index(inplace=True)
+        df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
+        df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
+        return df
+
+    def get_1d_data(self, symbol: str, start: str, end: str) -> pd.DataFrame:
+        """get 1d data
+
+        Returns
+        ------
+            data_1d: pd.DataFrame
+                data_1d.columns = [self._date_field_name, self._symbol_field_name, "paused", "volume", "factor", "close"]
+
+        """
+        return self._all_1d_data[
+            (self._all_1d_data[self._symbol_field_name] == symbol.upper())
+            & (self._all_1d_data[self._date_field_name] >= pd.Timestamp(start))
+            & (self._all_1d_data[self._date_field_name] < pd.Timestamp(end))
+        ]
+
+
 class YahooNormalizeUS:
     def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         # TODO: from MSN
@@ -629,8 +689,8 @@ class YahooNormalizeUS1d(YahooNormalizeUS, YahooNormalize1d):
     pass
 
 
-class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1min):
-    CONSISTENT_1d = False
+class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1minOffline):
+    CALC_PAUSED_NUM = False
 
     def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         # TODO: support 1min
@@ -657,20 +717,17 @@ class YahooNormalizeCN1dExtend(YahooNormalizeCN, YahooNormalize1dExtend):
     pass
 
 
-class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
+class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1minOffline):
     AM_RANGE = ("09:30:00", "11:29:00")
     PM_RANGE = ("13:00:00", "14:59:00")
 
-    CONSISTENT_1d = True
-    CALC_PAUSED_NUM = True
-
     def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
         return self.generate_1min_from_daily(self.calendar_list_1d)
 
     def symbol_to_yahoo(self, symbol):
         if "." not in symbol:
-            _exchange = symbol[:2].lower()
-            _exchange = "ss" if _exchange == "sh" else _exchange
+            _exchange = symbol[:2]
+            _exchange = ("ss" if _exchange.islower() else "SS") if _exchange.lower() == "sh" else _exchange
             symbol = symbol[2:] + "." + _exchange
         return symbol
 
@@ -678,63 +735,6 @@ class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
         return get_calendar_list("ALL")
 
 
-class YahooNormalizeCN1minOffline(YahooNormalizeCN1min):
-    """Normalised to 1min using local 1d data
-    1d data usually from: Normalised to 1min using local 1d data
-    """
-
-    def __init__(
-        self, qlib_data_1d_dir: [str, Path], date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs
-    ):
-        """
-
-        Parameters
-        ----------
-        qlib_data_1d_dir: str, Path
-            the qlib data to be updated for yahoo, usually from: Normalised to 1min using local 1d data
-        date_field_name: str
-            date field name, default is date
-        symbol_field_name: str
-            symbol field name, default is symbol
-        """
-        self.qlib_data_1d_dir = qlib_data_1d_dir
-        super(YahooNormalizeCN1minOffline, self).__init__(date_field_name, symbol_field_name)
-        self._all_1d_data = self._get_all_1d_data()
-
-    def _get_1d_calendar_list(self) -> Iterable[pd.Timestamp]:
-        import qlib
-        from qlib.data import D
-
-        qlib.init(provider_uri=self.qlib_data_1d_dir)
-        return list(D.calendar(freq="day"))
-
-    def _get_all_1d_data(self):
-        import qlib
-        from qlib.data import D
-
-        qlib.init(provider_uri=self.qlib_data_1d_dir)
-        df = D.features(D.instruments("all"), ["$paused", "$volume", "$factor"], freq="day")
-        df.reset_index(inplace=True)
-        df.rename(columns={"datetime": self._date_field_name, "instrument": self._symbol_field_name}, inplace=True)
-        df.columns = list(map(lambda x: x[1:] if x.startswith("$") else x, df.columns))
-        return df
-
-    def get_1d_data(self, symbol: str, start: str, end: str) -> pd.DataFrame:
-        """get 1d data
-
-        Returns
-        ------
-            data_1d: pd.DataFrame
-                set(data_1d.columns) - set([self._date_field_name, self._symbol_field_name, "paused", "volume", "factor"]) == {}
-
-        """
-        return self._all_1d_data[
-            (self._all_1d_data[self._symbol_field_name] == symbol.upper())
-            & (self._all_1d_data[self._date_field_name] >= pd.Timestamp(start))
-            & (self._all_1d_data[self._date_field_name] < pd.Timestamp(end))
-        ]
-
-
 class Run(BaseRun):
     def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
         """
@@ -811,7 +811,13 @@ class Run(BaseRun):
             max_collector_count, delay, start, end, self.interval, check_data_length, limit_nums
         )
 
-    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol", end_date: str = None):
+    def normalize_data(
+        self,
+        date_field_name: str = "date",
+        symbol_field_name: str = "symbol",
+        end_date: str = None,
+        qlib_data_1d_dir: str = None,
+    ):
         """normalize data
 
         Parameters
@@ -822,12 +828,29 @@ class Run(BaseRun):
             symbol field name, default symbol
         end_date: str
             if not None, normalize the last date saved (including end_date); if None, it will ignore this parameter; by default None
+        qlib_data_1d_dir: str
+            if interval==1min, qlib_data_1d_dir cannot be None, normalize 1min needs to use 1d data;
+
+                qlib_data_1d can be obtained like this:
+                    $ python scripts/get_data.py qlilb_data --target_dir <qlib_data_1d_dir> --interval 1d
+                    $ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01
+                or:
+                    download 1d data, reference: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#1d-from-yahoo
 
         Examples
         ---------
             $ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region cn --interval 1d
+            $ python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_1d --source_dir ~/.qlib/stock_data/source_cn_1min --normalize_dir ~/.qlib/stock_data/normalize_cn_1min --region CN --interval 1min
         """
-        super(Run, self).normalize_data(date_field_name, symbol_field_name, end_date=end_date)
+        if self.interval.lower() == "1min":
+            if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists():
+                # TODO: add reference url
+                raise ValueError(
+                    "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir <user qlib 1d data >, Reference: "
+                )
+        super(Run, self).normalize_data(
+            date_field_name, symbol_field_name, end_date=end_date, qlib_data_1d_dir=qlib_data_1d_dir
+        )
 
     def normalize_data_1d_extend(
         self, old_qlib_data_dir, date_field_name: str = "date", symbol_field_name: str = "symbol"
@@ -873,36 +896,6 @@ class Run(BaseRun):
         )
         yc.normalize()
 
-    def normalize_data_1min_cn_offline(
-        self, qlib_data_1d_dir: str, date_field_name: str = "date", symbol_field_name: str = "symbol"
-    ):
-        """Normalised to 1min using local 1d data
-
-        Parameters
-        ----------
-        qlib_data_1d_dir: str
-            the qlib data to be updated for yahoo, usually from: https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data
-        date_field_name: str
-            date field name, default date
-        symbol_field_name: str
-            symbol field name, default symbol
-
-        Examples
-        ---------
-            $ python collector.py normalize_data_1min_cn_offline --qlib_data_1d_dir ~/.qlib/qlib_data/cn_1d --source_dir ~/.qlib/stock_data/source_cn_1min --normalize_dir ~/.qlib/stock_data/normalize_cn_1min --region CN --interval 1min
-        """
-        _class = getattr(self._cur_module, f"{self.normalize_class_name}Offline")
-        yc = Normalize(
-            source_dir=self.source_dir,
-            target_dir=self.normalize_dir,
-            normalize_class=_class,
-            max_workers=self.max_workers,
-            date_field_name=date_field_name,
-            symbol_field_name=symbol_field_name,
-            qlib_data_1d_dir=qlib_data_1d_dir,
-        )
-        yc.normalize()
-
     def download_today_data(
         self,
         max_collector_count=2,
@@ -987,7 +980,7 @@ class Run(BaseRun):
             end_date = (pd.Timestamp(trading_date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
 
         # download qlib 1d data
-        qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
+        qlib_data_1d_dir = str(Path(qlib_data_1d_dir).expanduser().resolve())
         if not exists_qlib_data(qlib_data_1d_dir):
             GetData().qlib_data(target_dir=qlib_data_1d_dir, interval=self.interval, region=self.region)
 
@@ -995,7 +988,7 @@ class Run(BaseRun):
         self.download_data(delay=1, start=trading_date, end=end_date, check_data_length=1)
 
         # normalize data
-        self.normalize_data_1d_extend(str(qlib_data_1d_dir))
+        self.normalize_data_1d_extend(qlib_data_1d_dir)
 
         # dump bin
         _dump = DumpDataUpdate(

From bab50e883721afe658e1535eb78062b83a1d69a0 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Wed, 23 Jun 2021 16:13:26 +0800
Subject: [PATCH 12/14] fix YahooNormalize1min && update docs

---
 README.md                                 |   2 +-
 examples/benchmarks/README.md             |   4 +
 scripts/data_collector/yahoo/README.md    | 250 ++++++++++++----------
 scripts/data_collector/yahoo/collector.py |  16 +-
 4 files changed, 149 insertions(+), 123 deletions(-)

diff --git a/README.md b/README.md
index 635b143f4..d60e3e2a7 100644
--- a/README.md
+++ b/README.md
@@ -162,7 +162,7 @@ We recommend users to prepare their own data if they have a high-quality dataset
 ### Automatic update of daily frequency data(from yahoo finance)
   > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
 
-  > For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#Automatic-update-of-daily-frequency-data)
+  > For more information refer to: [yahoo collector](https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
 
   * Automatic update of data to the "qlib" directory each trading day(Linux)
       * use *crontab*: `crontab -e`
diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
index c3d965d85..133380fe0 100644
--- a/examples/benchmarks/README.md
+++ b/examples/benchmarks/README.md
@@ -4,6 +4,10 @@ Here are the results of each benchmark model running on Qlib's `Alpha360` and `A
 
 The numbers shown below demonstrate the performance of the entire `workflow` of each model. We will update the `workflow` as well as models in the near future for better results.
 
+> If you need to reproduce the results below, please use the **v1** dataset: `python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn --version v1`
+>
+> In the new version of qlib, the default dataset is **v2**. Since the data is collected from the YahooFinance API (which is not very stable), the results of *v2* and *v1* may differ
+
 ## Alpha360 dataset
 | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
 |---|---|---|---|---|---|---|---|---|
diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index 6fa4b1937..68124e132 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -1,17 +1,9 @@
 
 - [Collector Data](#collector-data)
-  - [Automatic update data](#automatic-update-of-daily-frequency-data(from-yahoo-finance))
-  - [CN Data](#CN-Data)
-    - [1d from yahoo](#1d-from-yahoocn)
-    - [1d from qlib](#1d-from-qlibcn)
-    - [using data(1d)](#using-data1d-cn)
-    - [1min from yahoo](#1min-from-yahoocn)
-    - [1min from qlib](#1min-from-qlibcn)
-    - [using data(1min)](#using-data1min-cn)
-  - [US Data](#CN-Data)
-    - [1d from yahoo](#1d-from-yahoous)
-    - [1d from qlib](#1d-from-qlibus)
-    - [using data(1d)](#using-data1d-us)
+  - [Get Qlib data](#get-qlib-databin-file)
+  - [Collector *YahooFinance* data to qlib](#collector-yahoofinance-data-to-qlib)
+  - [Automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
+- [Using qlib data](#using-qlib-data)
 
 
 # Collect Data From Yahoo Finance
@@ -34,6 +26,110 @@ pip install -r requirements.txt
 
 ## Collector Data
 
+### Get Qlib data(`bin file`)
+  > `qlib-data` from *YahooFinance*, is the data that has been dumped and can be used directly in `qlib`
+
+  - get data: `python scripts/get_data.py qlib_data`
+  - parameters:
+    - `target_dir`: save dir, by default *~/.qlib/qlib_data/cn_data*
+    - `version`: dataset version, value from [`v1`, `v2`], by default `v1`
+      - `v2` end date is *2021-06*, `v1` end date is *2020-09*
+      - user can append data to `v2`: [automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
+      - **the [benchmarks](https://github.com/microsoft/qlib/tree/main/examples/benchmarks) for qlib use `v1`**, *due to the unstable access to historical data by YahooFinance, there are some differences between `v2` and `v1`*
+    - `interval`: `1d` or `1min`, by default `1d`
+    - `region`: `cn` or `us`, by default `cn`
+    - `delete_old`: delete existing data from `target_dir`(*features, calendars, instruments, dataset_cache, features_cache*), value from [`True`, `False`], by default `True`
+    - `exists_skip`: traget_dir data already exists, skip `get_data`, value from [`True`, `False`], by default `False`
+  - examples:
+    ```bash
+    # cn 1d
+    python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn
+    # cn 1min
+    python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --region cn --interval 1min
+    # us 1d
+    python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us --interval 1d
+    # us 1min
+    python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1min --region us --interval 1min
+    ```
+
+### Collector *YahooFinance* data to qlib
+> collector *YahooFinance* data and *dump* into `qlib` format
+  1. download data to csv: `python scripts/data_collector/yahoo/collector.py download_data`
+
+     - parameters:
+          - `source_dir`: save the directory
+          - `interval`: `1d` or `1min`, by default `1d`
+            > **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`**
+          - `region`: `CN` or `US`, by default `CN`
+          - `delay`: `time.sleep(delay)`, by default *0.5*
+          - `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)*
+          - `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)*
+          - `max_workers`: get the number of concurrent symbols, it is not recommended to change this parameter in order to maintain the integrity of the symbol data, by default *1*
+          - `check_data_length`: check the number of rows per *symbol*, by default `None`
+            > if `len(symbol_df) < check_data_length`, it will be re-fetched, with the number of re-fetches coming from the `max_collector_count` parameter
+          - `max_collector_count`: number of *"failed"* symbol retries, by default 2
+     - examples:
+          ```bash
+          # cn 1d data
+          python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US
+          # cn 1min data
+          python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --delay 1 --interval 1min --region CN
+          # us 1d data
+          python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US
+          # us 1min data
+          python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1min --delay 1 --interval 1min --region US
+          ```
+  2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data`
+     
+     - parameters:
+          - `source_dir`: csv directory
+          - `normalize_dir`: result directory
+          - `max_workers`: number of concurrent, by default *1*
+          - `interval`: `1d` or `1min`, by default `1d`
+            > if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None`
+          - `region`: `CN` or `US`, by default `CN`
+          - `date_field_name`: column *name* identifying time in csv files, by default `date`
+          - `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
+          - `end_date`: if not `None`, normalize the last date saved (*including end_date*); if `None`, it will ignore this parameter; by default `None`
+          - `qlib_data_1d_dir`: qlib directory(1d data)
+            ```
+            if interval==1min, qlib_data_1d_dir cannot be None, normalize 1min needs to use 1d data;
+        
+                qlib_data_1d can be obtained like this:
+                    $ python scripts/get_data.py qlilb_data --target_dir <qlib_data_1d_dir> --interval 1d
+                    $ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01
+                or:
+                    download 1d data from YahooFinance
+            
+            ```
+      - examples:
+        ```bash
+        # normalize 1d cn
+        python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
+        # normalize 1min cn
+        python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/qlib_cn_1d --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
+        ```
+  3. dump data: `python scripts/dump_bin.py dump_all`
+    
+     - parameters:
+       - `csv_path`: stock data path or directory, **normalize result(normalize_dir)**
+       - `qlib_dir`: qlib(dump) data director
+       - `freq`: transaction frequency, by default `day`
+         > `freq_map = {1d:day, 1mih: 1min}`
+       - `max_workers`: number of threads, by default *16*
+       - `include_fields`: dump fields, by default `""`
+       - `exclude_fields`: fields not dumped, by default `"""
+         > dump_fields = `include_fields if include_fields else set(symbol_df.columns) - set(exclude_fields) exclude_fields else symbol_df.columns`
+       - `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
+       - `date_field_name`: column *name* identifying time in csv files, by default `date`
+     - examples:
+       ```bash
+       # dump 1d cn
+       python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1d --freq day --exclude_fields date,symbol
+       # dump 1min cn
+       python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,symbol
+       ```
+
 ### Automatic update of daily frequency data(from yahoo finance)
   > It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it to update automatically.
 
@@ -62,112 +158,36 @@ pip install -r requirements.txt
       * *region*: region, value from ["CN", "US"], default "CN"
 
 
-### CN Data
+## Using qlib data
 
-#### 1d from yahoo(CN)
+  ```python
+  import qlib
+  from qlib.data import D
 
-```bash
+  # 1d data cn
+  # freq=day, freq default day
+  qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn")
+  df = D.features(D.instruments("all"), ["$close"], freq="day")
 
-# download from yahoo finance
-python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1d --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
+  # 1min data cn
+  # freq=1min
+  qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1min", region="cn")
+  inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True)
+  # get 100 symbols
+  df = D.features(inst[:100], ["$close"], freq="1min")
+  # get all symbol data
+  # df = D.features(D.instruments("all"), ["$close"], freq="1min")
 
-# normalize
-python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1d --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
+  # 1d data us
+  qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1d", region="us")
+  df = D.features(D.instruments("all"), ["$close"], freq="day")
 
-# dump data
-cd qlib/scripts
-python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
+  # 1min data us
+  qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1min", region="cn")
+  inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True)
+  # get 100 symbols
+  df = D.features(inst[:100], ["$close"], freq="1min")
+  # get all symbol data
+  # df = D.features(D.instruments("all"), ["$close"], freq="1min")
+  ```
 
-```
-
-### 1d from qlib(CN)
-```bash
-python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1d --region cn
-```
-
-### using data(1d CN)
-
-```python
-import qlib
-from qlib.data import D
-
-qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1d", region="cn")
-df = D.features(D.instruments("all"), ["$close"], freq="day")
-```
-
-#### 1min from yahoo(CN)
-
-```bash
-
-# download from yahoo finance
-python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1min
-
-# normalize
-python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
-
-# dump data
-cd qlib/scripts
-python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/qlib_cn_1min --freq 1min --exclude_fields date,adjclose,dividends,splits,symbol
-```
-
-### 1min from qlib(CN)
-```bash
-python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_cn_1min --interval 1min --region cn
-```
-
-### using data(1min CN)
-
-```python
-import qlib
-from qlib.data import D
-
-qlib.init(provider_uri="~/.qlib/qlib_data/qlib_cn_1min", region="cn")
-df = D.features(D.instruments("all"), ["$close"], freq="1min")
-
-```
-
-### US Data
-
-#### 1d from yahoo(US)
-
-```bash
-
-# download from yahoo finance
-python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_1d --region US --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
-
-# normalize
-python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/us_1d --normalize_dir ~/.qlib/stock_data/source/us_1d_nor --region US --interval 1d
-
-# dump data
-cd qlib/scripts
-python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/us_1d_nor --qlib_dir ~/.qlib/stock_data/source/qlib_us_1d --freq day --exclude_fields date,adjclose,dividends,splits,symbol
-```
-
-#### 1d from qlib(US)
-
-```bash
-python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/qlib_us_1d --region us
-```
-
-### using data(1d US)
-
-```python
-# using
-import qlib
-from qlib.data import D
-
-qlib.init(provider_uri="~/.qlib/qlib_data/qlib_us_1d", region="us")
-df = D.features(D.instruments("all"), ["$close"], freq="day")
-
-```
-
-
-### Help
-```bash
-pythono collector.py collector_data --help
-```
-
-## Parameters
-
-- interval: 1min or 1d
-- region: CN or US
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index b474d3924..06545bda1 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -242,7 +242,10 @@ class YahooCollectorCN1d(YahooCollectorCN):
 class YahooCollectorCN1min(YahooCollectorCN):
     def get_instrument_list(self):
         symbols = super(YahooCollectorCN1min, self).get_instrument_list()
-        return symbols + ["000300.ss", "000905.ss", "00903.ss"]
+        return symbols + ["000300.ss", "000905.ss", "000903.ss"]
+
+    def download_index_data(self):
+        pass
 
 
 class YahooCollectorUS(YahooCollector, ABC):
@@ -461,7 +464,7 @@ class YahooNormalize1dExtend(YahooNormalize1d):
             _si = df["close"].first_valid_index()
             if _si > df.index[0]:
                 logger.warning(
-                    f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si-1][self._date_field_name].to_list()}"
+                    f"{df.loc[_si][self._symbol_field_name]} missing data: {df.loc[:_si - 1][self._date_field_name].to_list()}"
                 )
         # normalize
         df = self.normalize_yahoo(
@@ -524,7 +527,7 @@ class YahooNormalize1min(YahooNormalize, ABC):
         data_1d: pd.DataFrame = self.get_1d_data(symbol, _start, _end)
         data_1d = data_1d.copy()
         if data_1d is None or data_1d.empty:
-            df["factor"] = 1 / df.loc[df["close"].first_valid_index()]
+            df["factor"] = 1 / df.loc[df["close"].first_valid_index()]["close"]
             # TODO: np.nan or 1 or 0
             df["paused"] = np.nan
         else:
@@ -770,7 +773,7 @@ class Run(BaseRun):
     def download_data(
         self,
         max_collector_count=2,
-        delay=0,
+        delay=0.5,
         start=None,
         end=None,
         check_data_length=None,
@@ -783,7 +786,7 @@ class Run(BaseRun):
         max_collector_count: int
             default 2
         delay: float
-            time.sleep(delay), default 0
+            time.sleep(delay), default 0.5
         start: str
             start datetime, default "2000-01-01"; closed interval(including start)
         end: str
@@ -844,9 +847,8 @@ class Run(BaseRun):
         """
         if self.interval.lower() == "1min":
             if qlib_data_1d_dir is None or not Path(qlib_data_1d_dir).expanduser().exists():
-                # TODO: add reference url
                 raise ValueError(
-                    "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir <user qlib 1d data >, Reference: "
+                    "If normalize 1min, the qlib_data_1d_dir parameter must be set: --qlib_data_1d_dir <user qlib 1d data >, Reference: https://github.com/zhupr/qlib/tree/support_extend_data/scripts/data_collector/yahoo#automatic-update-of-daily-frequency-datafrom-yahoo-finance"
                 )
         super(Run, self).normalize_data(
             date_field_name, symbol_field_name, end_date=end_date, qlib_data_1d_dir=qlib_data_1d_dir

From 14184170340ed22d0496786aeb3c6169c87a3293 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Wed, 23 Jun 2021 23:59:59 +0800
Subject: [PATCH 13/14] fix automatic update of daily frequency data

---
 README.md                                     |  4 +-
 docs/component/data.rst                       |  4 +-
 .../analysis_model_performance.py             |  7 +-
 scripts/data_collector/base.py                | 27 +++-----
 scripts/data_collector/fund/collector.py      |  2 +-
 scripts/data_collector/yahoo/README.md        | 24 ++++---
 scripts/data_collector/yahoo/collector.py     | 68 ++++++++++++-------
 scripts/data_collector/yahoo/requirements.txt |  1 +
 8 files changed, 76 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index d60e3e2a7..1a8b25b43 100644
--- a/README.md
+++ b/README.md
@@ -171,11 +171,11 @@ We recommend users to prepare their own data if they have a high-quality dataset
         ```
         * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
         ```
-        * **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+        * **script path**: *scripts/data_collector/yahoo/collector.py*
 
   * Manual update of data
       ```
-      python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+      python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
       ```
       * *trading_date*: start of trading day
       * *end_date*: end of trading day(not included)
diff --git a/docs/component/data.rst b/docs/component/data.rst
index e399a41d7..8d4af71f1 100644
--- a/docs/component/data.rst
+++ b/docs/component/data.rst
@@ -82,13 +82,13 @@ Automatic update of daily frequency data
 
             * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
 
-        - **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+        - **script path**: *scripts/data_collector/yahoo/collector.py*
 
   - Manual update of data
 
       .. code-block:: bash
 
-        python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+        python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
 
       - *trading_date*: start of trading day
       - *end_date*: end of trading day(not included)
diff --git a/qlib/contrib/report/analysis_model/analysis_model_performance.py b/qlib/contrib/report/analysis_model/analysis_model_performance.py
index ddf97fb36..1b30e11e1 100644
--- a/qlib/contrib/report/analysis_model/analysis_model_performance.py
+++ b/qlib/contrib/report/analysis_model/analysis_model_performance.py
@@ -79,6 +79,9 @@ def _plot_qq(data: pd.Series = None, dist=stats.norm) -> go.Figure:
     :param dist:
     :return:
     """
+    # NOTE: plotly.tools.mpl_to_plotly not actively maintained, resulting in errors in the new version of matplotlib,
+    # ref: https://github.com/plotly/plotly.py/issues/2913#issuecomment-730071567
+    # removing plotly.tools.mpl_to_plotly for greater compatibility with matplotlib versions
     _plt_fig = sm.qqplot(data.dropna(), dist=dist, fit=True, line="45")
     plt.close(_plt_fig)
     qqplot_data = _plt_fig.gca().lines
@@ -88,9 +91,7 @@ def _plot_qq(data: pd.Series = None, dist=stats.norm) -> go.Figure:
         {
             "type": "scatter",
             "x": qqplot_data[0].get_xdata(),
-            # 'x': [0, 1],
             "y": qqplot_data[0].get_ydata(),
-            # 'y': [1, 2],
             "mode": "markers",
             "marker": {"color": "#19d3f3"},
         }
@@ -100,9 +101,7 @@ def _plot_qq(data: pd.Series = None, dist=stats.norm) -> go.Figure:
         {
             "type": "scatter",
             "x": qqplot_data[1].get_xdata(),
-            # 'x': [0, 1],
             "y": qqplot_data[1].get_ydata(),
-            # 'y': [1, 2],
             "mode": "lines",
             "line": {"color": "#636efa"},
         }
diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py
index ce94f1783..6d7959d4e 100644
--- a/scripts/data_collector/base.py
+++ b/scripts/data_collector/base.py
@@ -13,6 +13,7 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 import pandas as pd
 from tqdm import tqdm
 from loguru import logger
+from joblib import Parallel, delayed
 from qlib.utils import code_to_fname
 
 
@@ -186,20 +187,12 @@ class BaseCollector(abc.ABC):
     def _collector(self, instrument_list):
 
         error_symbol = []
-        with tqdm(total=len(instrument_list)) as p_bar:
-            if self.max_workers is not None and self.max_workers > 1:
-                logger.info(f"concurrent collector, max_workers: {self.max_workers}")
-                with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-                    for _symbol, _result in zip(instrument_list, executor.map(self._simple_collector, instrument_list)):
-                        if _result != self.NORMAL_FLAG:
-                            error_symbol.append(_symbol)
-                        p_bar.update()
-            else:
-                for _symbol in instrument_list:
-                    _result = self._simple_collector(_symbol)
-                    if _result != self.NORMAL_FLAG:
-                        error_symbol.append(_symbol)
-                    p_bar.update()
+        res = Parallel(n_jobs=self.max_workers)(
+            delayed(self._simple_collector)(_inst) for _inst in tqdm(instrument_list)
+        )
+        for _symbol, _result in zip(instrument_list, res):
+            if _result != self.NORMAL_FLAG:
+                error_symbol.append(_symbol)
         print(error_symbol)
         logger.info(f"error symbol nums: {len(error_symbol)}")
         logger.info(f"current get symbol nums: {len(instrument_list)}")
@@ -365,7 +358,7 @@ class BaseRun(abc.ABC):
         start=None,
         end=None,
         interval="1d",
-        check_data_length=False,
+        check_data_length: int = None,
         limit_nums=None,
     ):
         """download data from Internet
@@ -382,8 +375,8 @@ class BaseRun(abc.ABC):
             start datetime, default "2000-01-01"
         end: str
             end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
-        check_data_length: bool
-            check data length, by default False
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
             using for debug, by default None
 
diff --git a/scripts/data_collector/fund/collector.py b/scripts/data_collector/fund/collector.py
index 7b5566f72..cecabb33c 100644
--- a/scripts/data_collector/fund/collector.py
+++ b/scripts/data_collector/fund/collector.py
@@ -254,7 +254,7 @@ class Run(BaseRun):
         start=None,
         end=None,
         interval="1d",
-        check_data_length=None,
+        check_data_length: int = None,
         limit_nums=None,
     ):
         """download data from Internet
diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index 68124e132..8fa5998b7 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -140,22 +140,24 @@ pip install -r requirements.txt
         ```
         * * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
         ```
-        * **script path**: *qlib/scripts/data_collector/yahoo/collector.py*
+        * **script path**: *scripts/data_collector/yahoo/collector.py*
 
   * Manual update of data
       ```
-      python qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
+      python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
       ```
-      * *trading_date*: start of trading day
-      * *end_date*: end of trading day(not included)
+      * `trading_date`: start of trading day
+      * `end_date`: end of trading day(not included)
+      * `check_data_length`: check the number of rows per *symbol*, by default `None`
+        > if `len(symbol_df) < check_data_length`, it will be re-fetched, with the number of re-fetches coming from the `max_collector_count` parameter
 
-  * qlib/scripts/data_collector/yahoo/collector.py update_data_to_bin parameters:
-      * *source_dir*: The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
-      * *normalize_dir*: Directory for normalize data, default "Path(__file__).parent/normalize"
-      * *qlib_data_1d_dir*: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
-      * *trading_date*: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
-      * *end_date*: end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval(excluding end)
-      * *region*: region, value from ["CN", "US"], default "CN"
+  * `scripts/data_collector/yahoo/collector.py update_data_to_bin` parameters:
+      * `source_dir`: The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
+      * `normalize_dir`: Directory for normalize data, default "Path(__file__).parent/normalize"
+      * `qlib_data_1d_dir`: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
+      * `trading_date`: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
+      * `end_date`: end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval(excluding end)
+      * `region`: region, value from ["CN", "US"], default "CN"
 
 
 ## Using qlib data
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 06545bda1..81e11cb2e 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -8,6 +8,7 @@ import time
 import datetime
 import importlib
 from abc import ABC
+import multiprocessing
 from pathlib import Path
 from typing import Iterable
 
@@ -49,7 +50,7 @@ class YahooCollector(BaseCollector):
         max_workers=4,
         max_collector_count=2,
         delay=0,
-        check_data_length: bool = False,
+        check_data_length: int = None,
         limit_nums: int = None,
     ):
         """
@@ -70,8 +71,8 @@ class YahooCollector(BaseCollector):
             start datetime, default None
         end: str
             end datetime, default None
-        check_data_length: bool
-            check data length, by default False
+        check_data_length: int
+            check data length, by default None
         limit_nums: int
             using for debug, by default None
         """
@@ -311,8 +312,8 @@ class YahooNormalize(BaseNormalize):
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {symbol_field_name}] = np.nan
         _tmp_series = df["close"].fillna(method="ffill")
         _tmp_shift_series = _tmp_series.shift(1)
-        if last_close is not None and isinstance(last_close, (int, float)):
-            _tmp_shift_series.iloc[0] = last_close
+        if last_close is not None:
+            _tmp_shift_series.iloc[0] = float(last_close)
         df["change"] = _tmp_series / _tmp_shift_series - 1
         columns += ["change"]
         df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan
@@ -408,8 +409,9 @@ class YahooNormalize1dExtend(YahooNormalize1d):
             symbol field name, default is symbol
         """
         super(YahooNormalize1dExtend, self).__init__(date_field_name, symbol_field_name)
-        self._end_date, self._old_close = self._get_old_data(old_qlib_data_dir)
-        self._end_date = pd.Timestamp(self._end_date).strftime(self.DAILY_FORMAT)
+        self._first_close_field = "first_close"
+        self._ori_close_field = "ori_close"
+        self.old_qlib_data = self._get_old_data(old_qlib_data_dir)
 
     def _get_old_data(self, qlib_data_dir: [str, Path]):
         import qlib
@@ -417,32 +419,34 @@ class YahooNormalize1dExtend(YahooNormalize1d):
 
         qlib_data_dir = str(Path(qlib_data_dir).expanduser().resolve())
         qlib.init(provider_uri=qlib_data_dir, expression_cache=None, dataset_cache=None)
-        df = D.features(D.instruments("all"), ["$close/$factor"])
-        df.columns = ["close"]
-        return D.calendar()[-1], df
+        df = D.features(D.instruments("all"), ["$close/$factor", "$adjclose/$close"])
+        df.columns = [self._ori_close_field, self._first_close_field]
+        return df
+
+    def _get_close(self, df: pd.DataFrame, field_name: str):
+        _symbol = df.loc[df[self._symbol_field_name].first_valid_index()][self._symbol_field_name].upper()
+        _df = self.old_qlib_data.loc(axis=0)[_symbol]
+        _close = _df.loc[_df.last_valid_index()][field_name]
+        return _close
 
     def _get_first_close(self, df: pd.DataFrame) -> float:
-        _symbol = df.iloc[0][self._symbol_field_name]
         try:
-            _df = self._old_close.loc(axis=0)[_symbol.upper()]
-            _close = _df.loc[_df.first_valid_index()]["close"]
+            _close = self._get_close(df, field_name=self._first_close_field)
         except KeyError:
             _close = super(YahooNormalize1dExtend, self)._get_first_close(df)
         return _close
 
     def _get_last_close(self, df: pd.DataFrame) -> float:
-        _symbol = df.iloc[0][self._symbol_field_name]
         try:
-            _df = self._old_close.loc(axis=0)[_symbol.upper()]
-            _close = _df.loc[_df.last_valid_index()]["close"]
+            _close = self._get_close(df, field_name=self._ori_close_field)
         except KeyError:
             _close = None
         return _close
 
     def _get_last_date(self, df: pd.DataFrame) -> pd.Timestamp:
-        _symbol = df.iloc[0][self._symbol_field_name]
+        _symbol = df.loc[df[self._symbol_field_name].first_valid_index()][self._symbol_field_name].upper()
         try:
-            _df = self._old_close.loc(axis=0)[_symbol.upper()]
+            _df = self.old_qlib_data.loc(axis=0)[_symbol]
             _date = _df.index.max()
         except KeyError:
             _date = None
@@ -901,7 +905,7 @@ class Run(BaseRun):
     def download_today_data(
         self,
         max_collector_count=2,
-        delay=0,
+        delay=0.5,
         check_data_length=None,
         limit_nums=None,
     ):
@@ -912,7 +916,7 @@ class Run(BaseRun):
         max_collector_count: int
             default 2
         delay: float
-            time.sleep(delay), default 0
+            time.sleep(delay), default 0.5
         check_data_length: int
             check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
         limit_nums: int
@@ -947,7 +951,14 @@ class Run(BaseRun):
             limit_nums,
         )
 
-    def update_data_to_bin(self, qlib_data_1d_dir: str, trading_date: str = None, end_date: str = None):
+    def update_data_to_bin(
+        self,
+        qlib_data_1d_dir: str,
+        trading_date: str = None,
+        end_date: str = None,
+        check_data_length: int = None,
+        delay: float = 1,
+    ):
         """update yahoo data to bin
 
         Parameters
@@ -959,7 +970,10 @@ class Run(BaseRun):
             trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
         end_date: str
             end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval(excluding end)
-
+        check_data_length: int
+            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
+        delay: float
+            time.sleep(delay), default 1
         Notes
         -----
             If the data in qlib_data_dir is incomplete, np.nan will be populated to trading_date for the previous trading day
@@ -987,8 +1001,14 @@ class Run(BaseRun):
             GetData().qlib_data(target_dir=qlib_data_1d_dir, interval=self.interval, region=self.region)
 
         # download data from yahoo
-        self.download_data(delay=1, start=trading_date, end=end_date, check_data_length=1)
-
+        # NOTE: when downloading data from YahooFinance, max_workers is recommended to be 1
+        self.download_data(delay=delay, start=trading_date, end=end_date, check_data_length=check_data_length)
+        # NOTE: a larger max_workers setting here would be faster
+        self.max_workers = (
+            max(multiprocessing.cpu_count() - 2, 1)
+            if self.max_workers is None or self.max_workers <= 1
+            else self.max_workers
+        )
         # normalize data
         self.normalize_data_1d_extend(qlib_data_1d_dir)
 
diff --git a/scripts/data_collector/yahoo/requirements.txt b/scripts/data_collector/yahoo/requirements.txt
index 3e3e0d1e0..0a9b12361 100644
--- a/scripts/data_collector/yahoo/requirements.txt
+++ b/scripts/data_collector/yahoo/requirements.txt
@@ -7,3 +7,4 @@ tqdm
 lxml
 loguru
 yahooquery
+joblib

From 39340fbf0651daacaeff09917c7798772ea742b5 Mon Sep 17 00:00:00 2001
From: zhupr <zhu.pengrong@foxmail.com>
Date: Thu, 24 Jun 2021 11:07:40 +0800
Subject: [PATCH 14/14] fix: typo

---
 scripts/data_collector/yahoo/README.md    | 2 +-
 scripts/data_collector/yahoo/collector.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/data_collector/yahoo/README.md b/scripts/data_collector/yahoo/README.md
index 8fa5998b7..6cc630e87 100644
--- a/scripts/data_collector/yahoo/README.md
+++ b/scripts/data_collector/yahoo/README.md
@@ -96,7 +96,7 @@ pip install -r requirements.txt
             if interval==1min, qlib_data_1d_dir cannot be None, normalize 1min needs to use 1d data;
         
                 qlib_data_1d can be obtained like this:
-                    $ python scripts/get_data.py qlilb_data --target_dir <qlib_data_1d_dir> --interval 1d
+                    $ python scripts/get_data.py qlib_data --target_dir <qlib_data_1d_dir> --interval 1d
                     $ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01
                 or:
                     download 1d data from YahooFinance
diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py
index 81e11cb2e..6a128a5be 100644
--- a/scripts/data_collector/yahoo/collector.py
+++ b/scripts/data_collector/yahoo/collector.py
@@ -839,7 +839,7 @@ class Run(BaseRun):
             if interval==1min, qlib_data_1d_dir cannot be None, normalize 1min needs to use 1d data;
 
                 qlib_data_1d can be obtained like this:
-                    $ python scripts/get_data.py qlilb_data --target_dir <qlib_data_1d_dir> --interval 1d
+                    $ python scripts/get_data.py qlib_data --target_dir <qlib_data_1d_dir> --interval 1d
                     $ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01
                 or:
                     download 1d data, reference: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo#1d-from-yahoo