mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
* feat: download ibovespa index historic composition ibovespa(ibov) is the largest index in Brazil's stocks exchange. The br_index folder has support for downloading new companies for the current index composition. And has support, as well, for downloading companies from historic composition of ibov index. Partially resolves issue #956 * fix: typo error instead of end_date, it was written end_ate * feat: adds support for downloading stocks historic prices from Brazil's stocks exchange (B3) Together with commit c2f933 it resolves issue #956 * fix: code formatted with black. * wip: Creating code logic for brazils stock market data normalization * docs: brazils stock market data normalization code documentation * fix: code formatted the with black * docs: fixed typo * docs: more info about python version used to generate requirements.txt file * docs: added BeautifulSoup requirements * feat: removed debug prints * feat: added ibov_index_composition variable as a class attribute of IBOVIndex * feat: added increment to generate the four month period used by the ibov index * refactor: Added get_instruments() method inside utils.py for better code usability. Message in the PR request to understand the context of the change In the course of reviewing this PR we found two issues. 1. there are multiple places where the get_instruments() method is used, and we feel that scripts.index.py is the best place for the get_instruments() method to go. 2. data_collector.utils has some very generic stuff put inside it. * refactor: improve brazils stocks download speed The reason to use retry=2 is due to the fact that Yahoo Finance unfortunately does not keep track of the majority of Brazilian stocks. Therefore, the decorator deco_retry with retry argument set to 5 will keep trying to get the stock data 5 times, which makes the code to download Brazilians stocks very slow. 
In future, this may change, but for now I suggest to leave retry argument to 1 or 2 in order to improve download speed. In order to achieve this code logic an argument called retry_config was added into YahooCollectorBR1d and YahooCollectorBR1min * fix: added __main__ at the bottom of the script * refactor: changed interface inside each index Using partial as `fire.Fire(partial(get_instruments, market_index="br_index" ))` will make the interface easier for the user to execute the script. Then all the collector.py CLI in each folder can remove a redundant arguments. * refactor: implemented class interface retry into YahooCollectorBR * docs: added BR as a possible region into the documentation * refactor: make retry attribute part of the interface This way we don't have to use hasattr to access the retry attribute as previously done
608 lines
19 KiB
Python
608 lines
19 KiB
Python
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT License.
|
|
|
|
import re
|
|
import importlib
|
|
import time
|
|
import bisect
|
|
import pickle
|
|
import random
|
|
import requests
|
|
import functools
|
|
from pathlib import Path
|
|
from typing import Iterable, Tuple, List
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from lxml import etree
|
|
from loguru import logger
|
|
from yahooquery import Ticker
|
|
from tqdm import tqdm
|
|
from functools import partial
|
|
from concurrent.futures import ProcessPoolExecutor
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Symbol listing page; get_hs_stock_symbols() fills ``s_type`` with "ha", "sa" and "gem".
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"

# K-line endpoint template; get_calendar_list() reads ``data.klines`` from the JSON response
# and parses the first comma-separated field of every entry as a date.
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
# Per-month listing; get_calendar_list() keeps the ``jyrq`` date of entries whose ``jybz`` flag is non-zero.
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"

# Benchmark name -> calendar source.
# CSI* values are k-line URLs; *_ALL values for US/IN/BR are ticker symbols handed to yahooquery.
CALENDAR_BENCH_URL_MAP = {
    "CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
    "CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
    "CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"),
    # NOTE(review): the previous note said SH600000, but this entry uses bench code 000905
    # (same as CSI500). get_calendar_list() builds the "ALL" calendar from SZSE_CALENDAR_URL
    # and does not read this entry — confirm whether it is still needed.
    "ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
    # NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
    "US_ALL": "^GSPC",
    "IN_ALL": "^NSEI",
    "BR_ALL": "^BVSP",
}

# Module-level caches, filled lazily by the get_* functions below.
# NOTE(review): _BENCH_CALENDAR_LIST and _ALL_CALENDAR_LIST are not referenced in the visible part of this file.
_BENCH_CALENDAR_LIST = None
_ALL_CALENDAR_LIST = None
_HS_SYMBOLS = None
_US_SYMBOLS = None
_IN_SYMBOLS = None
_BR_SYMBOLS = None
_EN_FUND_SYMBOLS = None
# bench_code -> calendar list, written by get_calendar_list()
_CALENDAR_MAP = {}

# Lower bound on the number of SH/SZ symbols that get_hs_stock_symbols() must collect.
# NOTE: Until 2020-10-20 20:00:00
MINIMUM_SYMBOLS_NUM = 3900
|
|
|
|
|
|
def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
    """get history calendar list for a benchmark

    The result is cached per ``bench_code`` in the module-level ``_CALENDAR_MAP``,
    so the remote sources are only queried on the first call.

    Parameters
    ----------
    bench_code: str
        value from ["CSI300", "CSI500", "CSI100", "ALL", "US_ALL", "IN_ALL", "BR_ALL"]

    Returns
    -------
        history calendar list

    Raises
    ------
    ValueError
        for "ALL", when a monthly request keeps failing after the retries of ``deco_retry``
    """

    logger.info(f"get calendar list: {bench_code}......")

    def _get_kline_calendar(url):
        # the first comma-separated field of each kline entry is the date
        _value_list = requests.get(url).json()["data"]["klines"]
        return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list))

    calendar = _CALENDAR_MAP.get(bench_code, None)
    if calendar is None:
        if bench_code.startswith(("US_", "IN_", "BR_")):
            # the map holds a ticker symbol for these regions; fetch its history exactly once
            df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
            calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
        else:
            if bench_code.upper() == "ALL":

                @deco_retry
                def _get_calendar(month):
                    _cal = []
                    try:
                        # ``random`` is only a cache-busting query value, so it must be a number,
                        # not the repr of the ``random.random`` function object
                        resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
                        for _r in resp["data"]:
                            # ``jybz`` flags whether the day is kept; ``jyrq`` is its date
                            if int(_r["jybz"]):
                                _cal.append(pd.Timestamp(_r["jyrq"]))
                    except Exception as e:
                        raise ValueError(f"{month}-->{e}") from e
                    return _cal

                # query every month from 2000-01 up to (roughly) one month ahead of today
                month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
                calendar = []
                for _m in month_range:
                    cal = _get_calendar(_m.strftime("%Y-%m"))
                    if cal:
                        calendar += cal
                # drop dates that lie in the future
                calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
            else:
                calendar = _get_kline_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
        _CALENDAR_MAP[bench_code] = calendar
    logger.info(f"end of get calendar list: {bench_code}.")
    return calendar
|
|
|
|
|
|
def return_date_list(date_field_name: str, file_path: Path):
    """Read one CSV file and return the values of a date column as sorted timestamps.

    Parameters
    ----------
    date_field_name: str
        name of the column holding the dates
    file_path: Path
        CSV file to read; its first column is used as the index

    Returns
    -------
        ascending list of pd.Timestamp
    """
    frame = pd.read_csv(file_path, sep=",", index_col=0)
    stamps = [pd.Timestamp(raw) for raw in frame[date_field_name].to_list()]
    stamps.sort()
    return stamps
|
|
|
|
|
|
def get_calendar_list_by_ratio(
    source_dir: [str, Path],
    date_field_name: str = "date",
    threshold: float = 0.5,
    minimum_count: int = 10,
    max_workers: int = 16,
) -> list:
    """get calendar list, excluding the dates on which too few funds trade

    A date is kept when the number of files containing it is at least
    ``max(int(founded * threshold), minimum_count)``, where ``founded`` is the number
    of files minus those whose oldest date is later than that date.

    Parameters
    ----------
    source_dir: str or Path
        The directory where the raw data collected from the Internet is saved
    date_field_name: str
        date field name, default is date
    threshold: float
        threshold to exclude some days when few funds trade in this day, default 0.5
    minimum_count: int
        minimum count of funds should trade in one day
    max_workers: int
        Concurrent number, default is 16

    Returns
    -------
        history calendar list, in the order the dates were first encountered (not sorted)
    """
    logger.info(f"get calendar list from {source_dir} by threshold = {threshold}......")

    source_dir = Path(source_dir).expanduser()
    file_list = list(source_dir.glob("*.csv"))

    _number_all_funds = len(file_list)

    logger.info("count how many funds trade in this day......")
    _dict_count_trade = dict()  # dict{date:count}
    _fun = partial(return_date_list, date_field_name)
    all_oldest_list = []
    with tqdm(total=_number_all_funds) as p_bar:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for date_list in executor.map(_fun, file_list):
                if date_list:
                    # return_date_list sorts ascending, so the first entry is the oldest date
                    all_oldest_list.append(date_list[0])
                for date in date_list:
                    _dict_count_trade[date] = _dict_count_trade.get(date, 0) + 1

                p_bar.update()

    logger.info("count how many funds have founded in this day......")
    # start from "every fund exists" and subtract the funds that only start later
    _dict_count_founding = dict.fromkeys(_dict_count_trade, _number_all_funds)  # dict{date:count}
    with tqdm(total=len(all_oldest_list)) as p_bar:
        for oldest_date in all_oldest_list:
            for date in _dict_count_founding:
                if date < oldest_date:
                    _dict_count_founding[date] -= 1
            # advance once per fund; without this the bar stayed at zero
            p_bar.update()

    calendar = [
        date
        for date in _dict_count_trade
        if _dict_count_trade[date] >= max(int(_dict_count_founding[date] * threshold), minimum_count)
    ]

    return calendar
|
|
|
|
|
|
def get_hs_stock_symbols() -> list:
    """get SH/SZ stock symbols

    The listing pages are fetched repeatedly (at most ``_retry`` rounds) until at least
    ``MINIMUM_SYMBOLS_NUM`` symbols have been collected. The result is merged with a pickle
    cache in ``~/.cache/hs_symbols_cache.pkl`` and memoized in the module-level ``_HS_SYMBOLS``.

    Returns
    -------
        stock symbols, sorted, in the form "<digits>.ss" / "<digits>.sz"

    Raises
    ------
    ValueError
        if fewer than ``MINIMUM_SYMBOLS_NUM`` symbols are available after all rounds
        and after merging the local cache
    """
    global _HS_SYMBOLS

    def _get_symbol():
        _res = set()
        # (listing class, symbol suffix)
        for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")):
            resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k))
            _res |= set(
                map(
                    lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v),
                    etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"),
                )
            )
            time.sleep(3)
        return _res

    if _HS_SYMBOLS is None:
        symbols = set()
        _retry = 60
        _attempt = 0
        # It may take multiple times to get the complete list; the number of rounds is
        # bounded so an unavailable or changed site cannot cause an endless loop
        while len(symbols) < MINIMUM_SYMBOLS_NUM and _attempt < _retry:
            symbols |= _get_symbol()
            _attempt += 1
            time.sleep(3)

        symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve()
        symbol_cache_path.parent.mkdir(parents=True, exist_ok=True)
        if symbol_cache_path.exists():
            # NOTE(security): pickle.load can execute code from the file; this is only safe
            # while the cache file is writable by the current user alone
            with symbol_cache_path.open("rb") as fp:
                cache_symbols = pickle.load(fp)
                symbols |= cache_symbols
        # the cache only ever grows (union with the previous content)
        with symbol_cache_path.open("wb") as fp:
            pickle.dump(symbols, fp)

        if len(symbols) < MINIMUM_SYMBOLS_NUM:
            raise ValueError(
                f"The complete list of stocks is not available: got {len(symbols)} symbols "
                f"after {_attempt} attempts, expected at least {MINIMUM_SYMBOLS_NUM}"
            )

        _HS_SYMBOLS = sorted(list(symbols))

    return _HS_SYMBOLS
|
|
|
|
|
|
def get_us_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get US stock symbols

    Symbols are gathered from three remote listings and, when ``qlib_data_path`` is given,
    from the ``nasdaq100`` and ``sp500`` instrument files below it. The result is memoized
    in the module-level ``_US_SYMBOLS``.

    Parameters
    ----------
    qlib_data_path: str or Path
        optional qlib data directory containing ``instruments/<index>.txt``

    Returns
    -------
        stock symbols, sorted and de-duplicated
    """
    global _US_SYMBOLS

    @deco_retry
    def _get_eastmoney():
        resp = requests.get(
            "http://4.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&fs=m:105,m:106,m:107&fields=f12"
        )
        if resp.status_code != 200:
            raise ValueError("request error")

        try:
            records = resp.json()["data"]["diff"].values()
            tickers = [record["f12"].replace("_", "-P") for record in records]
        except Exception as e:
            logger.warning(f"request error: {e}")
            raise

        # a short list is treated as a failed request so that deco_retry tries again
        if len(tickers) < 8000:
            raise ValueError("request error")

        return tickers

    @deco_retry
    def _get_nasdaq():
        # applied in this order: the specific suffix rewrites must precede the generic "." -> "-"
        rewrites = (("$", "-P"), (".W", "-WT"), (".U", "-UN"), (".R", "-RI"), (".", "-"))
        collected = []
        for listing in ["otherlisted", "nasdaqtraded"]:
            frame = pd.read_csv(f"ftp://ftp.nasdaqtrader.com/SymbolDirectory/{listing}.txt", sep="|")
            frame = frame.rename(columns={"ACT Symbol": "Symbol"})
            series = frame["Symbol"].dropna()
            for old, new in rewrites:
                series = series.str.replace(old, new, regex=False)
            collected.extend(series.unique().tolist())
        return collected

    @deco_retry
    def _get_nyse():
        payload = {
            "instrumentType": "EQUITY",
            "pageNumber": 1,
            "sortColumn": "NORMALIZED_TICKER",
            "sortOrder": "ASC",
            "maxResultsPerPage": 10000,
            "filterToken": "",
        }
        resp = requests.post("https://www.nyse.com/api/quotes/filter", json=payload)
        if resp.status_code != 200:
            raise ValueError("request error")

        # a malformed body is logged and yields an empty list instead of triggering a retry
        try:
            return [item["symbolTicker"].replace("-", "-P") for item in resp.json()]
        except Exception as e:
            logger.warning(f"request error: {e}")
            return []

    if _US_SYMBOLS is not None:
        return _US_SYMBOLS

    candidates = _get_eastmoney() + _get_nasdaq() + _get_nyse()
    if qlib_data_path is not None:
        for index_name in ["nasdaq100", "sp500"]:
            instruments = pd.read_csv(
                Path(qlib_data_path).joinpath(f"instruments/{index_name}.txt"),
                sep="\t",
                names=["symbol", "start_date", "end_date"],
            )
            candidates += instruments["symbol"].unique().tolist()

    def _clean(raw):
        return raw.replace(".", "-").strip("$").strip("*")

    # the length / "WS" filter is applied to the raw symbol, before cleaning
    kept = {_clean(raw) for raw in candidates if len(raw) < 8 and not raw.endswith("WS")}
    _US_SYMBOLS = sorted(kept)
    return _US_SYMBOLS
|
|
|
|
|
|
def get_in_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get IN stock symbols

    Symbols are read from the remote equity list CSV (with ".NS" appended) and, when
    ``qlib_data_path`` is given, from the ``nifty`` instrument file below it. The result
    is memoized in the module-level ``_IN_SYMBOLS``.

    Parameters
    ----------
    qlib_data_path: str or Path
        optional qlib data directory containing ``instruments/nifty.txt``

    Returns
    -------
        stock symbols, sorted and de-duplicated
    """
    global _IN_SYMBOLS

    @deco_retry
    def _get_nifty():
        url = "https://www1.nseindia.com/content/equities/EQUITY_L.csv"
        df = pd.read_csv(url)
        df = df.rename(columns={"SYMBOL": "Symbol"})
        # missing symbols stay NaN after the concatenation and are dropped below
        df["Symbol"] = df["Symbol"] + ".NS"
        _symbols = df["Symbol"].dropna()
        _symbols = _symbols.unique().tolist()
        return _symbols

    if _IN_SYMBOLS is None:
        _all_symbols = _get_nifty()
        if qlib_data_path is not None:
            for _index in ["nifty"]:
                ins_df = pd.read_csv(
                    Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

        # symbols are used as-is: no per-symbol reformatting is applied for this region
        _IN_SYMBOLS = sorted(set(_all_symbols))

    return _IN_SYMBOLS
|
|
|
|
|
|
def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get Brazil(B3) stock symbols

    Symbols are scraped from the link texts inside the first ``<tbody>`` of the listing page
    and, when ``qlib_data_path`` is given, extended with the ``ibov`` instrument file below it.
    Every symbol gets the ".SA" suffix. The result is memoized in the module-level ``_BR_SYMBOLS``.

    Parameters
    ----------
    qlib_data_path: str or Path
        optional qlib data directory containing ``instruments/ibov.txt``

    Returns
    -------
        B3 stock symbols, sorted and de-duplicated

    Raises
    ------
    ValueError
        if the listing page does not contain the expected table (after the retries of ``deco_retry``)
    """
    global _BR_SYMBOLS

    @deco_retry
    def _get_ibovespa():
        url = "https://www.fundamentus.com.br/detalhes.php?papel="

        # Request
        agent = {"User-Agent": "Mozilla/5.0"}
        page = requests.get(url, headers=agent)

        # BeautifulSoup
        soup = BeautifulSoup(page.content, "html.parser")
        tbody = soup.find("tbody")
        if tbody is None:
            # fail with a clear message (and let deco_retry try again) instead of an AttributeError
            raise ValueError(f"symbol table not found in the response from {url}")

        _symbols = []
        for child in tbody.find_all("a"):
            # use the parser's text API rather than splitting the serialized tag
            _text = child.get_text(strip=True)
            if _text:
                _symbols.append(_text)

        return _symbols

    if _BR_SYMBOLS is None:
        _all_symbols = _get_ibovespa()
        if qlib_data_path is not None:
            for _index in ["ibov"]:
                ins_df = pd.read_csv(
                    Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

        def _format(s_):
            s_ = s_.strip()
            s_ = s_.strip("$")
            s_ = s_.strip("*")
            s_ = s_ + ".SA"
            return s_

        _BR_SYMBOLS = sorted(set(map(_format, _all_symbols)))

    return _BR_SYMBOLS
|
|
|
|
|
|
def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get en fund symbols

    The fund codes are extracted from a remote JavaScript listing and memoized in the
    module-level ``_EN_FUND_SYMBOLS``.

    Parameters
    ----------
    qlib_data_path: str or Path
        accepted for signature parity with the other ``get_*_symbols`` helpers; not used here

    Returns
    -------
        fund symbols in China, sorted and de-duplicated
    """
    global _EN_FUND_SYMBOLS

    @deco_retry
    def _get_eastmoney():
        resp = requests.get("http://fund.eastmoney.com/js/fundcode_search.js")
        if resp.status_code != 200:
            raise ValueError("request error")

        try:
            # keep what follows the last "= [" and drop the closing "];",
            # leaving a sequence of bracketed records
            listing = resp.content.decode().split("= [")[-1].replace("];", "")
            # TODO: each record carries further fields after the code (e.g. the fund name);
            # only the first field, the code, is kept for now
            codes = [
                record.replace('"', "").replace("'", "").split(",")[0]
                for record in re.findall(r"[\[](.*?)[\]]", listing)
            ]
        except Exception as e:
            logger.warning(f"request error: {e}")
            raise

        # a short list is treated as a failed request so that deco_retry tries again
        if len(codes) < 8000:
            raise ValueError("request error")
        return codes

    if _EN_FUND_SYMBOLS is None:
        _EN_FUND_SYMBOLS = sorted(set(_get_eastmoney()))

    return _EN_FUND_SYMBOLS
|
|
|
|
|
|
def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str:
    """Move the exchange part of ``<code>.<exchange>`` in front of the code.

    The exchanges "sh" and "ss" (any case) both become the prefix "sh";
    every other exchange is used unchanged as the prefix.

    Parameters
    ----------
    symbol: str
        symbol containing exactly one ".", e.g. "600000.ss"
    capital : bool
        upper-case the result if True (default), otherwise lower-case it

    Returns
    -------
        prefixed symbol, e.g. "SH600000"
    """
    code, exchange = symbol.split(".")
    prefix = "sh" if exchange.lower() in ("sh", "ss") else exchange
    joined = prefix + code
    if capital:
        return joined.upper()
    return joined.lower()
|
|
|
|
|
|
def symbol_prefix_to_sufix(symbol: str, capital: bool = True) -> str:
    """symbol prefix to sufix

    Converts ``<exchange><code>`` (two-character exchange prefix) into ``<code>.<exchange>``.
    The previous implementation sliced the last two characters instead of the first two,
    turning "SH600000" into "SH6000.00".

    Parameters
    ----------
    symbol: str
        symbol whose first two characters are the exchange, e.g. "SH600000"
    capital : bool
        upper-case the result if True (default), otherwise lower-case it

    Returns
    -------
        suffixed symbol, e.g. "600000.SH"
    """
    # the exchange is the two-character prefix; everything after it is the code
    res = f"{symbol[2:]}.{symbol[:2]}"
    return res.upper() if capital else res.lower()
|
|
|
|
|
|
def deco_retry(retry: int = 5, retry_sleep: int = 3):
    """Retry decorator, usable both as ``@deco_retry`` and as ``@deco_retry(retry=..., retry_sleep=...)``.

    Parameters
    ----------
    retry: int
        maximum number of attempts, by default 5; when the decorator is applied without
        parentheses this argument is the decorated function itself and 5 attempts are used
    retry_sleep: int
        seconds to sleep after a failed attempt, by default 3

    Returns
    -------
        the wrapped function (bare usage) or a decorator (parameterized usage).
        The wrapper returns the function's result, re-raises the exception of the
        last failed attempt, and returns None if the attempt count is not positive.
    """
    applied_bare = callable(retry)
    max_attempts = 5 if applied_bare else retry

    def deco_func(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    logger.warning(f"{func.__name__}: {attempt} :{e}")
                    if attempt == max_attempts:
                        raise
                # only reached after a failed, non-final attempt
                time.sleep(retry_sleep)
            return None

        return wrapper

    if applied_bare:
        return deco_func(retry)
    return deco_func
|
|
|
|
|
|
def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, shift: int = 1):
    """get trading date by shift

    Parameters
    ----------
    trading_list: list
        trading calendar list, sorted ascending
    trading_date : pd.Timestamp
        trading date
    shift : int
        shift, default is 1; may be negative

    Returns
    -------
        the calendar entry ``shift`` positions away from the insertion point of
        ``trading_date``, or ``trading_date`` itself when that position lies outside
        the calendar on either side
    """
    trading_date = pd.Timestamp(trading_date)
    target_index = bisect.bisect_left(trading_list, trading_date) + shift
    # An explicit range check is required: a negative index would not raise IndexError
    # but silently wrap around to the end of the calendar.
    if 0 <= target_index < len(trading_list):
        return trading_list[target_index]
    return trading_date
|
|
|
|
|
|
def generate_minutes_calendar_from_daily(
    calendars: Iterable,
    freq: str = "1min",
    am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
    pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
) -> pd.Index:
    """Expand a daily calendar into an intraday calendar.

    For every day, one ``pd.date_range`` is generated per session (AM and PM);
    all timestamps are then de-duplicated and sorted.

    Parameters
    ----------
    calendars: Iterable
        daily calendar
    freq: str
        by default 1min
    am_range: Tuple[str, str]
        AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
    pm_range: Tuple[str, str]
        PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")

    Returns
    -------
        pd.Index of the sorted, unique intraday timestamps
    """
    day_format = "%Y-%m-%d"
    sessions = [am_range, pm_range]
    session_ranges = []
    for day in calendars:
        day_text = pd.Timestamp(day).strftime(day_format)
        for session in sessions:
            session_start = f"{day_text} {session[0]}"
            session_end = f"{day_text} {session[1]}"
            session_ranges.append(pd.date_range(session_start, session_end, freq=freq))

    unique_stamps = set(np.hstack(session_ranges))
    return pd.Index(sorted(unique_stamps))
|
|
|
|
def get_instruments(
    qlib_dir: str,
    index_name: str,
    method: str = "parse_instruments",
    freq: str = "day",
    request_retry: int = 5,
    retry_sleep: int = 3,
    market_index: str = "cn_index",
):
    """Instantiate the index class of a market collector module and call one of its methods.

    The module ``data_collector.<market_index>.collector`` is imported, the class named
    ``<INDEX_NAME>Index`` (``index_name`` upper-cased) is instantiated, and the method
    named by ``method`` is invoked on it without arguments.

    Parameters
    ----------
    qlib_dir: str
        qlib data dir
    index_name: str
        index name, e.g. value from ["csi100", "csi300"]
    method: str
        method, value from ["parse_instruments", "save_new_companies"]
    freq: str
        freq, value from ["day", "1min"]
    request_retry: int
        request retry, by default 5
    retry_sleep: int
        request sleep, by default 3
    market_index: str
        Where the files to obtain the index are located,
        for example data_collector.cn_index.collector

    Examples
    -------
        # parse instruments
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments

        # parse new companies
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies

    """
    collector_module = importlib.import_module(f"data_collector.{market_index}.collector")
    index_class = getattr(collector_module, f"{index_name.upper()}Index")
    index_obj = index_class(
        qlib_dir=qlib_dir,
        index_name=index_name,
        freq=freq,
        request_retry=request_retry,
        retry_sleep=retry_sleep,
    )
    action = getattr(index_obj, method)
    action()
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke check when run as a script: the SH/SZ symbol list must reach the expected minimum size.
    _hs_symbols = get_hs_stock_symbols()
    assert len(_hs_symbols) >= MINIMUM_SYMBOLS_NUM