qlib/scripts/data_collector/yahoo/collector.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import abc
import sys
import copy
import time
import datetime
import importlib
from abc import ABC
from pathlib import Path
from typing import Iterable, Type
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

import fire
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from loguru import logger
from yahooquery import Ticker
from dateutil.tz import tzlocal
from qlib.utils import code_to_fname, fname_to_code

CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
from data_collector.utils import get_calendar_list, get_hs_stock_symbols, get_us_stock_symbols

INDEX_BENCH_URL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid=1.{index_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg={begin}&end={end}"
REGION_CN = "CN"
REGION_US = "US"


class YahooData:
    START_DATETIME = pd.Timestamp("2000-01-01")
    HIGH_FREQ_START_DATETIME = pd.Timestamp(datetime.datetime.now() - pd.Timedelta(days=5 * 6))
    END_DATETIME = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))
    INTERVAL_1min = "1min"
    INTERVAL_1d = "1d"

    def __init__(
        self,
        timezone: str = None,
        start=None,
        end=None,
        interval="1d",
        delay=0,
        show_1min_logging: bool = False,
    ):
        """

        Parameters
        ----------
        timezone: str
            The timezone where the data is located
        delay: float
            time.sleep(delay), default 0
        interval: str
            freq, value from [1min, 1d], default 1min
        start: str
            start datetime, default None
        end: str
            end datetime, default None
        show_1min_logging: bool
            show 1min logging, by default False; if True, there may be many warning logs
        """
        self._timezone = tzlocal() if timezone is None else timezone
        self._delay = delay
        self._interval = interval
        self._show_1min_logging = show_1min_logging
        self.start_datetime = pd.Timestamp(str(start)) if start else self.START_DATETIME
        self.end_datetime = min(pd.Timestamp(str(end)) if end else self.END_DATETIME, self.END_DATETIME)
        if self._interval == self.INTERVAL_1min:
            self.start_datetime = max(self.start_datetime, self.HIGH_FREQ_START_DATETIME)
        elif self._interval == self.INTERVAL_1d:
            pass
        else:
            raise ValueError(f"interval error: {self._interval}")

        # using for 1min
        self._next_datetime = self.convert_datetime(self.start_datetime.date() + pd.Timedelta(days=1), self._timezone)
        self._latest_datetime = self.convert_datetime(self.end_datetime.date(), self._timezone)

        self.start_datetime = self.convert_datetime(self.start_datetime, self._timezone)
        self.end_datetime = self.convert_datetime(self.end_datetime, self._timezone)

    @staticmethod
    def convert_datetime(dt: [pd.Timestamp, datetime.date, str], timezone):
        try:
            dt = pd.Timestamp(dt, tz=timezone).timestamp()
            dt = pd.Timestamp(dt, tz=tzlocal(), unit="s")
        except ValueError as e:
            pass
        return dt

    def _sleep(self):
        time.sleep(self._delay)

    @staticmethod
    def get_data_from_remote(symbol, interval, start, end, show_1min_logging: bool = False):
        error_msg = f"{symbol}-{interval}-{start}-{end}"

        def _show_logging_func():
            if interval == YahooData.INTERVAL_1min and show_1min_logging:
                logger.warning(f"{error_msg}:{_resp}")

        interval = "1m" if interval in ["1m", "1min"] else interval
        try:
            _resp = Ticker(symbol, asynchronous=False).history(interval=interval, start=start, end=end)
            if isinstance(_resp, pd.DataFrame):
                return _resp.reset_index()
            elif isinstance(_resp, dict):
                _temp_data = _resp.get(symbol, {})
                if isinstance(_temp_data, str) or (
                    isinstance(_resp, dict) and _temp_data.get("indicators", {}).get("quote", None) is None
                ):
                    _show_logging_func()
            else:
                _show_logging_func()
        except Exception as e:
            logger.warning(f"{error_msg}:{e}")

    def get_data(self, symbol: str) -> [pd.DataFrame]:
        def _get_simple(start_, end_):
            self._sleep()
            _remote_interval = "1m" if self._interval == self.INTERVAL_1min else self._interval
            return self.get_data_from_remote(
                symbol,
                interval=_remote_interval,
                start=start_,
                end=end_,
                show_1min_logging=self._show_1min_logging,
            )

        _result = None
        if self._interval == self.INTERVAL_1d:
            _result = _get_simple(self.start_datetime, self.end_datetime)
        elif self._interval == self.INTERVAL_1min:
            if self._next_datetime >= self._latest_datetime:
                _result = _get_simple(self.start_datetime, self.end_datetime)
            else:
                _res = []

                def _get_multi(start_, end_):
                    _resp = _get_simple(start_, end_)
                    if _resp is not None and not _resp.empty:
                        _res.append(_resp)

                for _s, _e in (
                    (self.start_datetime, self._next_datetime),
                    (self._latest_datetime, self.end_datetime),
                ):
                    _get_multi(_s, _e)
                for _start in pd.date_range(self._next_datetime, self._latest_datetime, closed="left"):
                    _end = _start + pd.Timedelta(days=1)
                    _get_multi(_start, _end)
                if _res:
                    _result = pd.concat(_res, sort=False).sort_values(["symbol", "date"])
        else:
            raise ValueError(f"cannot support {self._interval}")
        return _result


class YahooCollector:
    def __init__(
        self,
        save_dir: [str, Path],
        start=None,
        end=None,
        interval="1d",
        max_workers=4,
        max_collector_count=2,
        delay=0,
        check_data_length: bool = False,
        limit_nums: int = None,
        show_1min_logging: bool = False,
    ):
        """

        Parameters
        ----------
        save_dir: str
            stock save dir
        max_workers: int
            workers, default 4
        max_collector_count: int
            default 2
        delay: float
            time.sleep(delay), default 0
        interval: str
            freq, value from [1min, 1d], default 1min
        start: str
            start datetime, default None
        end: str
            end datetime, default None
        check_data_length: bool
            check data length, by default False
        limit_nums: int
            using for debug, by default None
        show_1min_logging: bool
            show 1m logging, by default False; if True, there may be many warning logs
        """
        self.save_dir = Path(save_dir).expanduser().resolve()
        self.save_dir.mkdir(parents=True, exist_ok=True)

        self._delay = delay
        self.max_workers = max_workers
        self._max_collector_count = max_collector_count
        self._mini_symbol_map = {}
        self._interval = interval
        self._check_small_data = check_data_length

        self.stock_list = sorted(set(self.get_stock_list()))
        if limit_nums is not None:
            try:
                self.stock_list = self.stock_list[: int(limit_nums)]
            except Exception as e:
                logger.warning(f"Cannot use limit_nums={limit_nums}, the parameter will be ignored")

        self.yahoo_data = YahooData(
            timezone=self._timezone,
            start=start,
            end=end,
            interval=interval,
            delay=delay,
            show_1min_logging=show_1min_logging,
        )

    @property
    @abc.abstractmethod
    def min_numbers_trading(self):
        # daily, one year: 252 / 4
        # us 1min, a week: 6.5 * 60 * 5
        # cn 1min, a week: 4 * 60 * 5
        raise NotImplementedError("rewrite min_numbers_trading")

    @abc.abstractmethod
    def get_stock_list(self):
        raise NotImplementedError("rewrite get_stock_list")

    @property
    @abc.abstractmethod
    def _timezone(self):
        raise NotImplementedError("rewrite get_timezone")

    def save_stock(self, symbol, df: pd.DataFrame):
        """save stock data to file

        Parameters
        ----------
        symbol: str
            stock code
        df : pd.DataFrame
            df.columns must contain "symbol" and "datetime"
        """
        if df.empty:
            logger.warning(f"{symbol} is empty")
            return

        symbol = self.normalize_symbol(symbol)
        symbol = code_to_fname(symbol)
        stock_path = self.save_dir.joinpath(f"{symbol}.csv")
        df["symbol"] = symbol
        if stock_path.exists():
            _old_df = pd.read_csv(stock_path)
            df = _old_df.append(df, sort=False)
        df.to_csv(stock_path, index=False)

    def _save_small_data(self, symbol, df):
        if len(df) <= self.min_numbers_trading:
            logger.warning(f"the number of trading days of {symbol} is less than {self.min_numbers_trading}!")
            _temp = self._mini_symbol_map.setdefault(symbol, [])
            _temp.append(df.copy())
            return None
        else:
            if symbol in self._mini_symbol_map:
                self._mini_symbol_map.pop(symbol)
            return symbol

    def _get_data(self, symbol):
        _result = None
        df = self.yahoo_data.get_data(symbol)
        if isinstance(df, pd.DataFrame):
            if not df.empty:
                if self._check_small_data:
                    if self._save_small_data(symbol, df) is not None:
                        _result = symbol
                        self.save_stock(symbol, df)
                else:
                    _result = symbol
                    self.save_stock(symbol, df)
        return _result

    def _collector(self, stock_list):

        error_symbol = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            with tqdm(total=len(stock_list)) as p_bar:
                for _symbol, _result in zip(stock_list, executor.map(self._get_data, stock_list)):
                    if _result is None:
                        error_symbol.append(_symbol)
                    p_bar.update()
        print(error_symbol)
        logger.info(f"error symbol nums: {len(error_symbol)}")
        logger.info(f"current get symbol nums: {len(stock_list)}")
        error_symbol.extend(self._mini_symbol_map.keys())
        return sorted(set(error_symbol))

    def collector_data(self):
        """collector data"""
        logger.info("start collector yahoo data......")
        stock_list = self.stock_list
        for i in range(self._max_collector_count):
            if not stock_list:
                break
            logger.info(f"getting data: {i+1}")
            stock_list = self._collector(stock_list)
            logger.info(f"{i+1} finish.")
        for _symbol, _df_list in self._mini_symbol_map.items():
            self.save_stock(_symbol, pd.concat(_df_list, sort=False).drop_duplicates(["date"]).sort_values(["date"]))
        if self._mini_symbol_map:
            logger.warning(f"less than {self.min_numbers_trading} stock list: {list(self._mini_symbol_map.keys())}")
        logger.info(f"total {len(self.stock_list)}, error: {len(set(stock_list))}")

        self.download_index_data()

    @abc.abstractmethod
    def download_index_data(self):
        """download index data"""
        raise NotImplementedError("rewrite download_index_data")

    @abc.abstractmethod
    def normalize_symbol(self, symbol: str):
        """normalize symbol"""
        raise NotImplementedError("rewrite normalize_symbol")


class YahooCollectorCN(YahooCollector, ABC):
    def get_stock_list(self):
        logger.info("get HS stock symbos......")
        symbols = get_hs_stock_symbols()
        logger.info(f"get {len(symbols)} symbols.")
        return symbols

    def normalize_symbol(self, symbol):
        symbol_s = symbol.split(".")
        symbol = f"sh{symbol_s[0]}" if symbol_s[-1] == "ss" else f"sz{symbol_s[0]}"
        return symbol

    @property
    def _timezone(self):
        return "Asia/Shanghai"


class YahooCollectorCN1d(YahooCollectorCN):
    @property
    def min_numbers_trading(self):
        return 252 / 4

    def download_index_data(self):
        # TODO: from MSN
        _format = "%Y%m%d"
        _begin = self.yahoo_data.start_datetime.strftime(_format)
        _end = (self.yahoo_data.end_datetime + pd.Timedelta(days=-1)).strftime(_format)
        for _index_name, _index_code in {"csi300": "000300", "csi100": "000903"}.items():
            logger.info(f"get bench data: {_index_name}({_index_code})......")
            try:
                df = pd.DataFrame(
                    map(
                        lambda x: x.split(","),
                        requests.get(INDEX_BENCH_URL.format(index_code=_index_code, begin=_begin, end=_end)).json()[
                            "data"
                        ]["klines"],
                    )
                )
            except Exception as e:
                logger.warning(f"get {_index_name} error: {e}")
                continue
            df.columns = ["date", "open", "close", "high", "low", "volume", "money", "change"]
            df["date"] = pd.to_datetime(df["date"])
            df = df.astype(float, errors="ignore")
            df["adjclose"] = df["close"]
            df["symbol"] = f"sh{_index_code}"
            _path = self.save_dir.joinpath(f"sh{_index_code}.csv")
            if _path.exists():
                _old_df = pd.read_csv(_path)
                df = _old_df.append(df, sort=False)
            df.to_csv(_path, index=False)
            time.sleep(5)


class YahooCollectorCN1min(YahooCollectorCN):
    @property
    def min_numbers_trading(self):
        return 60 * 4 * 5

    def download_index_data(self):
        # TODO: 1m
        logger.warning(f"{self.__class__.__name__} {self._interval} does not support: download_index_data")


class YahooCollectorUS(YahooCollector, ABC):
    def get_stock_list(self):
        logger.info("get US stock symbols......")
        symbols = get_us_stock_symbols() + [
            "^GSPC",
            "^NDX",
            "^DJI",
        ]
        logger.info(f"get {len(symbols)} symbols.")
        return symbols

    def download_index_data(self):
        pass

    def normalize_symbol(self, symbol):
        return code_to_fname(symbol).upper()

    @property
    def _timezone(self):
        return "America/New_York"


class YahooCollectorUS1d(YahooCollectorUS):
    @property
    def min_numbers_trading(self):
        return 252 / 4


class YahooCollectorUS1min(YahooCollectorUS):
    @property
    def min_numbers_trading(self):
        return 60 * 6.5 * 5


class YahooNormalize:
    COLUMNS = ["open", "close", "high", "low", "volume"]
    DAILY_FORMAT = "%Y-%m-%d"

    def __init__(
        self,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
    ):
        """

        Parameters
        ----------
        date_field_name: str
            date field name, default is date
        symbol_field_name: str
            symbol field name, default is symbol
        """
        self._date_field_name = date_field_name
        self._symbol_field_name = symbol_field_name

        self._calendar_list = self._get_calendar_list()

    @staticmethod
    def normalize_yahoo(
        df: pd.DataFrame,
        calendar_list: list = None,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
    ):
        if df.empty:
            return df
        symbol = df.loc[df[symbol_field_name].first_valid_index(), symbol_field_name]
        columns = copy.deepcopy(YahooNormalize.COLUMNS)
        df = df.copy()
        df.set_index(date_field_name, inplace=True)
        df.index = pd.to_datetime(df.index)
        df = df[~df.index.duplicated(keep="first")]
        if calendar_list is not None:
            df = df.reindex(
                pd.DataFrame(index=calendar_list)
                .loc[
                    pd.Timestamp(df.index.min()).date() : pd.Timestamp(df.index.max()).date()
                    + pd.Timedelta(hours=23, minutes=59)
                ]
                .index
            )
        df.sort_index(inplace=True)
        df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), set(df.columns) - {symbol_field_name}] = np.nan
        _tmp_series = df["close"].fillna(method="ffill")
        df["change"] = _tmp_series / _tmp_series.shift(1) - 1
        columns += ["change"]
        df.loc[(df["volume"] <= 0) | np.isnan(df["volume"]), columns] = np.nan

        df[symbol_field_name] = symbol
        df.index.names = [date_field_name]
        return df.reset_index()

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        # normalize
        df = self.normalize_yahoo(df, self._calendar_list, self._date_field_name, self._symbol_field_name)
        # adjusted price
        df = self.adjusted_price(df)
        return df

    @abc.abstractmethod
    def _get_calendar_list(self):
        """Get benchmark calendar"""
        raise NotImplementedError("")

    @abc.abstractmethod
    def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
        """adjusted price"""
        raise NotImplementedError("rewrite adjusted_price")


class YahooNormalize1d(YahooNormalize, ABC):
    DAILY_FORMAT = "%Y-%m-%d"

    def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
        if df.empty:
            return df
        df = df.copy()
        df.set_index(self._date_field_name, inplace=True)
        if "adjclose" in df:
            df["factor"] = df["adjclose"] / df["close"]
            df["factor"] = df["factor"].fillna(method="ffill")
        else:
            df["factor"] = 1
        for _col in self.COLUMNS:
            if _col not in df.columns:
                continue
            if _col == "volume":
                df[_col] = df[_col] / df["factor"]
            else:
                df[_col] = df[_col] * df["factor"]
        df.index.names = [self._date_field_name]
        return df.reset_index()

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        df = super(YahooNormalize1d, self).normalize(df)
        df = self._manual_adj_data(df)
        return df

    def _manual_adj_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """manual adjust data: All fields (except change) are standardized according to the close of the first day"""
        if df.empty:
            return df
        df = df.copy()
        df.sort_values(self._date_field_name, inplace=True)
        df = df.set_index(self._date_field_name)
        df = df.loc[df["close"].first_valid_index() :]
        _close = df["close"].iloc[0]
        for _col in df.columns:
            if _col == self._symbol_field_name:
                continue
            if _col == "volume":
                df[_col] = df[_col] * _close
            elif _col != "change":
                df[_col] = df[_col] / _close
            else:
                pass
        return df.reset_index()


class YahooNormalize1min(YahooNormalize, ABC):
    AM_RANGE = None  # type: tuple  # eg: ("09:30:00", "11:29:00")
    PM_RANGE = None  # type: tuple  # eg: ("13:00:00", "14:59:00")

    # Whether the trading day of 1min data is consistent with 1d
    CONSISTENT_1d = False

    def __init__(
        self,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
    ):
        """

        Parameters
        ----------
        date_field_name: str
            date field name, default is date
        symbol_field_name: str
            symbol field name, default is symbol
        """
        super(YahooNormalize1min, self).__init__(date_field_name, symbol_field_name)
        _class_name = self.__class__.__name__.replace("min", "d")
        _class = getattr(importlib.import_module("collector"), _class_name)  # type: Type[YahooNormalize]
        self.data_1d_obj = _class(self._date_field_name, self._symbol_field_name)

    @property
    def calendar_list_1d(self):
        calendar_list_1d = getattr(self, "_calendar_list_1d", None)
        if calendar_list_1d is None:
            calendar_list_1d = self._get_1d_calendar_list()
            setattr(self, "_calendar_list_1d", calendar_list_1d)
        return calendar_list_1d

    def generate_1min_from_daily(self, calendars: Iterable) -> pd.Index:
        res = []
        daily_format = self.DAILY_FORMAT
        am_range = self.AM_RANGE
        pm_range = self.PM_RANGE
        for _day in calendars:
            for _range in [am_range, pm_range]:
                res.append(
                    pd.date_range(
                        f"{_day.strftime(daily_format)} {_range[0]}",
                        f"{_day.strftime(daily_format)} {_range[1]}",
                        freq="1min",
                    )
                )

        return pd.Index(sorted(set(np.hstack(res))))

    def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
        # TODO: using daily data factor
        if df.empty:
            return df
        df = df.copy()
        symbol = df.iloc[0][self._symbol_field_name]
        # get 1d data from yahoo
        _start = pd.Timestamp(df[self._date_field_name].min()).strftime(self.DAILY_FORMAT)
        _end = (pd.Timestamp(df[self._date_field_name].max()) + pd.Timedelta(days=1)).strftime(self.DAILY_FORMAT)
        data_1d = YahooData.get_data_from_remote(self.symbol_to_yahoo(symbol), interval="1d", start=_start, end=_end)
        if data_1d is None or data_1d.empty:
            df["factor"] = 1
            # TODO: np.nan or 1 or 0
            df["paused"] = np.nan
        else:
            data_1d = self.data_1d_obj.normalize(data_1d)  # type: pd.DataFrame
            # NOTE: volume is np.nan or volume <= 0, paused = 1
            # FIXME: find a more accurate data source
            data_1d["paused"] = 0
            data_1d.loc[(data_1d["volume"].isna()) | (data_1d["volume"] <= 0), "paused"] = 1
            data_1d = data_1d.set_index(self._date_field_name)

            # add factor from 1d data
            df["date_tmp"] = df[self._date_field_name].apply(lambda x: pd.Timestamp(x).date())
            df.set_index("date_tmp", inplace=True)
            df.loc[:, "factor"] = data_1d["factor"]
            df.loc[:, "paused"] = data_1d["paused"]
            df.reset_index("date_tmp", drop=True, inplace=True)

            if self.CONSISTENT_1d:
                # the date sequence is consistent with 1d
                df.set_index(self._date_field_name, inplace=True)
                df = df.reindex(
                    self.generate_1min_from_daily(
                        pd.to_datetime(data_1d.reset_index()[self._date_field_name].drop_duplicates())
                    )
                )
                df[self._symbol_field_name] = df.loc[df[self._symbol_field_name].first_valid_index()][
                    self._symbol_field_name
                ]
                df.index.names = [self._date_field_name]
                df.reset_index(inplace=True)
        for _col in self.COLUMNS:
            if _col not in df.columns:
                continue
            if _col == "volume":
                df[_col] = df[_col] / df["factor"]
            else:
                df[_col] = df[_col] * df["factor"]
        return df

    @abc.abstractmethod
    def symbol_to_yahoo(self, symbol):
        raise NotImplementedError("rewrite symbol_to_yahoo")

    @abc.abstractmethod
    def _get_1d_calendar_list(self):
        raise NotImplementedError("rewrite _get_1d_calendar_list")


class YahooNormalizeUS:
    def _get_calendar_list(self):
        # TODO: from MSN
        return get_calendar_list("US_ALL")


class YahooNormalizeUS1d(YahooNormalizeUS, YahooNormalize1d):
    pass


class YahooNormalizeUS1min(YahooNormalizeUS, YahooNormalize1min):
    CONSISTENT_1d = False

    def _get_calendar_list(self):
        # TODO: support 1min
        raise ValueError("Does not support 1min")

    def _get_1d_calendar_list(self):
        return get_calendar_list("US_ALL")

    def symbol_to_yahoo(self, symbol):
        return fname_to_code(symbol)


class YahooNormalizeCN:
    def _get_calendar_list(self):
        # TODO: from MSN
        return get_calendar_list("ALL")


class YahooNormalizeCN1d(YahooNormalizeCN, YahooNormalize1d):
    pass


class YahooNormalizeCN1min(YahooNormalizeCN, YahooNormalize1min):
    AM_RANGE = ("09:30:00", "11:29:00")
    PM_RANGE = ("13:00:00", "14:59:00")

    CONSISTENT_1d = True

    def _get_calendar_list(self):
        return self.generate_1min_from_daily(self.calendar_list_1d)

    def symbol_to_yahoo(self, symbol):
        if "." not in symbol:
            _exchange = symbol[:2]
            _exchange = "ss" if _exchange == "sh" else _exchange
            symbol = symbol[2:] + "." + _exchange
        return symbol

    def _get_1d_calendar_list(self):
        return get_calendar_list("ALL")


class Normalize:
    def __init__(
        self,
        source_dir: [str, Path],
        target_dir: [str, Path],
        normalize_class: Type[YahooNormalize],
        max_workers: int = 16,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
    ):
        """

        Parameters
        ----------
        source_dir: str or Path
            The directory where the raw data collected from the Internet is saved
        target_dir: str or Path
            Directory for normalize data
        normalize_class: Type[YahooNormalize]
            normalize class
        max_workers: int
            Concurrent number, default is 16
        date_field_name: str
            date field name, default is date
        symbol_field_name: str
            symbol field name, default is symbol
        """
        if not (source_dir and target_dir):
            raise ValueError("source_dir and target_dir cannot be None")
        self._source_dir = Path(source_dir).expanduser()
        self._target_dir = Path(target_dir).expanduser()
        self._target_dir.mkdir(parents=True, exist_ok=True)

        self._max_workers = max_workers

        self._normalize_obj = normalize_class(date_field_name=date_field_name, symbol_field_name=symbol_field_name)

    def _executor(self, file_path: Path):
        file_path = Path(file_path)
        df = pd.read_csv(file_path)
        df = self._normalize_obj.normalize(df)
        if not df.empty:
            df.to_csv(self._target_dir.joinpath(file_path.name), index=False)

    def normalize(self):
        logger.info("normalize data......")

        with ProcessPoolExecutor(max_workers=self._max_workers) as worker:
            file_list = list(self._source_dir.glob("*.csv"))
            with tqdm(total=len(file_list)) as p_bar:
                for _ in worker.map(self._executor, file_list):
                    p_bar.update()


class Run:
    def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, region=REGION_CN):
        """

        Parameters
        ----------
        source_dir: str
            The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
        normalize_dir: str
            Directory for normalize data, default "Path(__file__).parent/normalize"
        max_workers: int
            Concurrent number, default is 4
        region: str
            region, value from ["CN", "US"], default "CN"
        """
        if source_dir is None:
            source_dir = CUR_DIR.joinpath("source")
        self.source_dir = Path(source_dir).expanduser().resolve()
        self.source_dir.mkdir(parents=True, exist_ok=True)

        if normalize_dir is None:
            normalize_dir = CUR_DIR.joinpath("normalize")
        self.normalize_dir = Path(normalize_dir).expanduser().resolve()
        self.normalize_dir.mkdir(parents=True, exist_ok=True)

        self._cur_module = importlib.import_module("collector")
        self.max_workers = max_workers
        self.region = region

    def download_data(
        self,
        max_collector_count=2,
        delay=0,
        start=None,
        end=None,
        interval="1d",
        check_data_length=False,
        limit_nums=None,
        show_1min_logging=False,
    ):
        """download data from Internet

        Parameters
        ----------
        max_collector_count: int
            default 2
        delay: float
            time.sleep(delay), default 0
        interval: str
            freq, value from [1min, 1d], default 1d
        start: str
            start datetime, default "2000-01-01"
        end: str
            end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
        check_data_length: bool
            check data length, by default False
        limit_nums: int
            using for debug, by default None
        show_1min_logging: bool
            show 1m logging, by default False; if True, there may be many warning logs

        Examples
        ---------
            # get daily data
            $ python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
            # get 1m data
            $ python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1m
        """

        _class = getattr(
            self._cur_module, f"YahooCollector{self.region.upper()}{interval}"
        )  # type: Type[YahooCollector]
        _class(
            self.source_dir,
            max_workers=self.max_workers,
            max_collector_count=max_collector_count,
            delay=delay,
            start=start,
            end=end,
            interval=interval,
            check_data_length=check_data_length,
            limit_nums=limit_nums,
            show_1min_logging=show_1min_logging,
        ).collector_data()

    def normalize_data(self, interval: str = "1d", date_field_name: str = "date", symbol_field_name: str = "symbol"):
        """normalize data

        Parameters
        ----------
        interval: str
            freq, value from [1min, 1d], default 1d
        date_field_name: str
            date field name, default date
        symbol_field_name: str
            symbol field name, default symbol

        Examples
        ---------
            $ python collector.py normalize_data --source_dir ~/.qlib/stock_data/source --normalize_dir ~/.qlib/stock_data/normalize --region CN --interval 1d
        """
        _class = getattr(self._cur_module, f"YahooNormalize{self.region.upper()}{interval}")
        yc = Normalize(
            source_dir=self.source_dir,
            target_dir=self.normalize_dir,
            normalize_class=_class,
            max_workers=self.max_workers,
            date_field_name=date_field_name,
            symbol_field_name=symbol_field_name,
        )
        yc.normalize()


if __name__ == "__main__":
    fire.Fire(Run)