From 985fd0816cd81cccaa315f25bda871dfa9c3c526 Mon Sep 17 00:00:00 2001 From: zhupr Date: Mon, 21 Jun 2021 17:18:04 +0800 Subject: [PATCH] Fix cn_index.collector network error --- scripts/data_collector/cn_index/collector.py | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index 63b2ebd9e..a529f61df 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -19,7 +19,7 @@ CUR_DIR = Path(__file__).resolve().parent sys.path.append(str(CUR_DIR.parent.parent)) from data_collector.index import IndexBase -from data_collector.utils import get_calendar_list, get_trading_date_by_shift +from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index_code}cons.xls" @@ -29,6 +29,22 @@ NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/{index # 2020-11-27 Announcement title change INDEX_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89" +REQ_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48" +} + + +@deco_retry +def retry_request(url: str, method: str = "get", exclude_status: List = None): + if exclude_status is None: + exclude_status = [] + method_func = getattr(requests, method) + _resp = method_func(url, headers=REQ_HEADERS) + _status = _resp.status_code + if _status not in exclude_status and _status != 200: + raise ValueError(f"response status: {_status}, url={url}") + return _resp + class CSIIndex(IndexBase): @property @@ -137,9 +153,8 @@ class CSIIndex(IndexBase): date: pd.Timestamp type: str, value from ["add", "remove"] """ - resp = requests.get(url) + resp = retry_request(url) _text = resp.text - date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text) if len(date_list) >= 2: add_date = pd.Timestamp("-".join(date_list[0])) @@ -150,7 +165,7 @@ class CSIIndex(IndexBase): logger.info(f"get {add_date} changes") try: excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0] - content = requests.get(f"http://www.csindex.com.cn{excel_url}").content + content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content _io = BytesIO(content) df_map = pd.read_excel(_io, sheet_name=None) with self.cache_dir.joinpath( @@ -204,7 +219,7 @@ class CSIIndex(IndexBase): ------- [url1, url2] """ - resp = requests.get(self.changes_url) + resp = retry_request(self.changes_url) html = etree.HTML(resp.text) return html.xpath("//*[@id='itemContainer']//li/a/@href") @@ -224,7 +239,7 @@ class CSIIndex(IndexBase): end_date: pd.Timestamp """ logger.info("get new companies......") - context = requests.get(self.new_companies_url).content + context = retry_request(self.new_companies_url).content with self.cache_dir.joinpath( f"{self.index_name.lower()}_new_companies.{self.new_companies_url.split('.')[-1]}" ).open("wb") as fp: