1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00
Files
qlib/scripts/data_collector/csi/collector.py
you-n-g de9e13b171 release-0.5.0 (#1)
* init commit

* change the version number

* rich the docs&fix cache docs

* update index readme

* Modify cache class name

* Modify sharpe to information_ratio

* Modify Group- to Group

* add the description of graphical results & fix the backtest docs

* fix docs in details

* update docs

* Update introduction.rst

* Update README.md

* Update introduction.rst

* Update introduction.rst

* Update introduction.rst

* Update installation.rst

* Update installation.rst

* Update initialization.rst

* Update getdata.rst

* Update integration.rst

* Update initialization.rst

* Update getdata.rst

* Update estimator.rst

Modify some typos.

* Update README.md

Modify the typos.

* Update initialization.rst

* Update data.rst

* Update report.rst

* Update estimator.rst

* Update cumulative_return.py

* Update model.rst

* Update rank_label.py

* Update cumulative_return.py

* Update strategy.rst

* Update getdata.rst

* Update backtest.rst

* Update integration.rst

* Update getdata.rst

* Update introduction.rst

* Update introduction.rst

* Update README.md

* Update report.rst

* Update integration.rst

Fix typos

* Update installation.rst

Fix typos

* Update getdata.rst

* Update initialization.rst

Fix typos.

* add quick start docs&fix detials

* fix estimator docs & fix strategy docs

* fix the cahce in data.rst

* update documents

* Fix Corr && Rsquare

* fix data retrival example to csi300 & fix a data bug

* fix filter bug

* Fix data collector

* Modift model args

* add the log & fix README.md\quick.rst

* add enviroment depend & add intoduction of qlib-server online mode

* fix image center fomat & set log_only of docs is True

* fix README.md format

* update data preparation & readme logo image

* get_data support version

* Modify analysis names

* Modify analysis graph

* update report.rst & data.rst

* commmit estimator for merge

* minimal requirements

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update READEME.md

* Update READEME.md

* update estimator

* Fix doc urls

* fix get_data.py docstring

* update test_get_data.py

* Upate docs

* Upate docs

* Upate docs

Co-authored-by: bxdd <bxddream@gmail.com>
Co-authored-by: zhupr <zhu.pengrong@foxmail.com>
Co-authored-by: Wendi Li <wendili.academic@qq.com>
Co-authored-by: Dingsu Wang <dingsu.wang@gmail.com>
Co-authored-by: bxdd <45119470+bxdd@users.noreply.github.com>
Co-authored-by: cslwqxx <cslwqxx@users.noreply.github.com>
2020-09-24 12:01:39 +08:00

211 lines
6.7 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import re
import sys
import bisect
from io import BytesIO
from pathlib import Path
import fire
import requests
import pandas as pd
from lxml import etree
from loguru import logger
CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
from data_collector.utils import get_hs_calendar_list as get_calendar_list
NEW_COMPANIES_URL = "http://www.csindex.com.cn/uploads/file/autofile/cons/000300cons.xls"
CSI300_CHANGES_URL = "http://www.csindex.com.cn/zh-CN/search/total?key=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC%E8%82%A1%E7%9A%84%E5%85%AC%E5%91%8A"
CSI300_START_DATE = pd.Timestamp("2005-01-01")
class CSI300:
REMOVE = "remove"
ADD = "add"
def __init__(self, qlib_dir=None):
"""
Parameters
----------
qlib_dir: str
qlib data dir, default "Path(__file__).parent/qlib_data"
"""
if qlib_dir is None:
qlib_dir = CUR_DIR.joinpath("qlib_data")
self.instruments_dir = Path(qlib_dir).expanduser().resolve().joinpath("instruments")
self.instruments_dir.mkdir(exist_ok=True, parents=True)
self._calendar_list = None
@property
def calendar_list(self) -> list:
"""get history trading date
Returns
-------
"""
return get_calendar_list(bench=True)
def _get_trading_date_by_shift(self, trading_date: pd.Timestamp, shift=1):
"""get trading date by shift
Parameters
----------
shift : int
shift, default is 1
trading_date : pd.Timestamp
trading date
Returns
-------
"""
left_index = bisect.bisect_left(self.calendar_list, trading_date)
try:
res = self.calendar_list[left_index + shift]
except IndexError:
res = trading_date
return res
def _get_changes(self) -> pd.DataFrame:
"""get companies changes
Returns
-------
"""
logger.info("get companies changes......")
res = []
for _url in self._get_change_notices_url():
_df = self._read_change_from_url(_url)
res.append(_df)
logger.info("get companies changes finish")
return pd.concat(res)
@staticmethod
def normalize_symbol(symbol):
symbol = f"{int(symbol):06}"
return f"SH{symbol}" if symbol.startswith("60") else f"SZ{symbol}"
def _read_change_from_url(self, url: str) -> pd.DataFrame:
"""read change from url
Parameters
----------
url : str
change url
Returns
-------
"""
resp = requests.get(url)
_text = resp.text
date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
if len(date_list) >= 2:
add_date = pd.Timestamp("-".join(date_list[0]))
else:
_date = pd.Timestamp("-".join(re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
add_date = self._get_trading_date_by_shift(_date, shift=0)
remove_date = self._get_trading_date_by_shift(add_date, shift=-1)
logger.info(f"get {add_date} changes")
try:
excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
_io = BytesIO(requests.get(f"http://www.csindex.com.cn{excel_url}").content)
df_map = pd.read_excel(_io, sheet_name=None)
tmp = []
for _s_name, _type, _date in [("调入", self.ADD, add_date), ("调出", self.REMOVE, remove_date)]:
_df = df_map[_s_name]
_df = _df.loc[_df["指数代码"] == "000300", ["证券代码"]]
_df = _df.applymap(self.normalize_symbol)
_df.columns = ["symbol"]
_df["type"] = _type
_df["date"] = _date
tmp.append(_df)
df = pd.concat(tmp)
except Exception:
df = None
for _df in pd.read_html(resp.content):
if _df.shape[-1] != 4:
continue
tmp = []
for _s, _type, _date in [
(_df.iloc[2:, 0], self.REMOVE, remove_date),
(_df.iloc[2:, 2], self.ADD, add_date),
]:
_tmp_df = pd.DataFrame()
_tmp_df["symbol"] = _s.map(self.normalize_symbol)
_tmp_df["type"] = _type
_tmp_df["date"] = _date
tmp.append(_tmp_df)
df = pd.concat(tmp)
break
return df
@staticmethod
def _get_change_notices_url() -> list:
"""get change notices url
Returns
-------
"""
resp = requests.get(CSI300_CHANGES_URL)
html = etree.HTML(resp.text)
return html.xpath("//*[@id='itemContainer']//li/a/@href")
def _get_new_companies(self):
logger.info("get new companies")
_io = BytesIO(requests.get(NEW_COMPANIES_URL).content)
df = pd.read_excel(_io)
df = df.iloc[:, [0, 4]]
df.columns = ["end_date", "symbol"]
df["symbol"] = df["symbol"].map(self.normalize_symbol)
df["end_date"] = pd.to_datetime(df["end_date"])
df["start_date"] = CSI300_START_DATE
return df
def parse_instruments(self):
"""parse csi300.txt
Examples
-------
$ python collector.py parse_instruments --qlib_dir ~/.qlib/qlib_data/cn_data
"""
logger.info("start parse csi300 companies.....")
instruments_columns = ["symbol", "start_date", "end_date"]
changers_df = self._get_changes()
new_df = self._get_new_companies()
logger.info("parse history companies by changes......")
for _row in changers_df.sort_values("date", ascending=False).itertuples(index=False):
if _row.type == self.ADD:
min_end_date = new_df.loc[new_df["symbol"] == _row.symbol, "end_date"].min()
new_df.loc[
(new_df["end_date"] == min_end_date) & (new_df["symbol"] == _row.symbol), "start_date"
] = _row.date
else:
_tmp_df = pd.DataFrame(
[[_row.symbol, CSI300_START_DATE, _row.date]], columns=["symbol", "start_date", "end_date"]
)
new_df = new_df.append(_tmp_df, sort=False)
new_df.loc[:, instruments_columns].to_csv(
self.instruments_dir.joinpath("csi300.txt"), sep="\t", index=False, header=None
)
logger.info("parse csi300 companies finished.")
if __name__ == "__main__":
fire.Fire(CSI300)