From 34a2372f019295e8ea3acbb289c15853900087b0 Mon Sep 17 00:00:00 2001
From: Linlang
Date: Wed, 28 Jan 2026 16:59:40 +0800
Subject: [PATCH] fix: possible bug causing missing calendar_list data

---
Review notes (text between the "---" line and the diff is ignored by git am):
* `_get_calendar` now retries each year up to `max_retry` times instead of
  silently skipping a year on the first `requests.RequestException`.
* `else:` is the else-clause of `for attempt in range(max_retry)`: it runs
  only when no attempt reached `break`, i.e. every attempt for that year
  failed, and records the year in `failed_years`.
* `except Exception as e` catches every error, never uses `e`, and logs
  nothing, so the reason a year failed is lost; only the final
  "Calendar incomplete" warning is emitted.
* The old per-year throttle `time.sleep(random.uniform(0.5, 1.2))` is
  removed; the new code sleeps only after a failed attempt.
* `.format(start=start, end=end)` on the f-string in `formatted_url` is a
  no-op (untouched context line).

 scripts/data_collector/utils.py | 41 +++++++++++++++++----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py
index 667d070da..fbf1d64ba 100644
--- a/scripts/data_collector/utils.py
+++ b/scripts/data_collector/utils.py
@@ -25,7 +25,6 @@ from bs4 import BeautifulSoup
 
 HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
 
-# CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg={start}&end={end}"
 CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0"
 SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
 
@@ -97,37 +96,39 @@ def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
 
     logger.info(f"get calendar list: {bench_code}......")
 
-    def _get_calendar(url):
+    def _get_calendar(url, max_retry=3):
         session = requests.Session()
         session.headers.update(build_headers())
         current_datetime = datetime.datetime.now()
         cur_year = current_datetime.year
         res_list = []
-        for per_year in range(2000, cur_year + 1):
-            start = f"{per_year}0101"
-            end = f"{per_year}1231"
+        failed_years = []
+        for year in range(2000, cur_year + 1):
+            start = f"{year}0101"
+            end = f"{year}1231"
             formatted_url = url + f"&beg={start}&end={end}".format(start=start, end=end)
-            try:
-                resp = session.get(formatted_url, timeout=10)
-                resp.raise_for_status()
-                payload = resp.json()
-                data = payload.get("data")
-                if not data or "klines" not in data:
-                    continue
+            for attempt in range(max_retry):
+                try:
+                    resp = session.get(formatted_url, timeout=10)
+                    resp.raise_for_status()
 
-                klines = data["klines"]
-                res_list.extend(pd.Timestamp(x.split(",")[0]) for x in klines)
+                    data = resp.json().get("data")
+                    if not data or "klines" not in data:
+                        raise ValueError("missing klines")
 
-            except requests.RequestException as e:
-                continue
+                    res_list.extend(pd.Timestamp(x.split(",")[0]) for x in data["klines"])
+                    break
 
-            time.sleep(random.uniform(0.5, 1.2))
+                except Exception as e:
+                    time.sleep(random.uniform(0.8, 1.5))
+            else:
+                failed_years.append(year)
+
+        if failed_years:
+            logger.warning(f"Calendar incomplete, failed years: {failed_years}")
 
         return sorted(set(res_list))
 
-        # _value_list = requests.get(url, timeout=None).json()["data"]["klines"]
-        # return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list))
-
     calendar = _CALENDAR_MAP.get(bench_code, None)
     if calendar is None:
         if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):