1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-05 03:50:57 +08:00
Files
qlib/scripts/data_collector/br_index/collector.py
igor17400 56cfa480dc Ibovespa index support (#990)
* feat: download ibovespa index historic composition

ibovespa(ibov) is the largest index in Brazil's stocks exchange.
The br_index folder has support for downloading new companies for the current index composition.
And has support, as well, for downloading companies from historic composition of ibov index.

Partially resolves issue #956

* fix: typo error instead of end_date, it was written end_ate

* feat: adds support for downloading stocks historic prices from Brazil's stocks exchange (B3)

Together with commit c2f933 it resolves issue #956

* fix: code formatted with black.

* wip: Creating code logic for brazils stock market data normalization

* docs: brazils stock market data normalization code documentation

* fix: code formatted with black

* docs: fixed typo

* docs: more info about python version used to generate requirements.txt file

* docs: added BeautifulSoup requirements

* feat: removed debug prints

* feat: added ibov_index_composition variable as a class attribute of IBOVIndex

* feat: added increment to generate the four month period used by the ibov index

* refactor: Added get_instruments() method inside utils.py for better code usability.

Message in the PR request to understand the context of the change

In the course of reviewing this PR we found two issues.

    1. there are multiple places where the get_instruments() method is used,
	and we feel that scripts.index.py is the best place for the
	get_instruments() method to go.
    2. data_collector.utils has some very generic stuff put inside it.

* refactor: improve brazils stocks download speed

The reason to use retry=2 is due to the fact that
Yahoo Finance unfortunately does not keep track of the majority
of Brazilian stocks.

Therefore, the deco_retry decorator with its retry argument
set to 5 will keep trying to get the stock data 5 times,
which makes downloading Brazilian stocks very slow.

In future, this may change, but for now
I suggest to leave retry argument to 1 or 2 in
order to improve download speed.

In order to achieve this code logic an argument called retry_config
was added into YahooCollectorBR1d and YahooCollectorBR1min

* fix: added __main__ at the bottom of the script

* refactor: changed interface inside each index

Using partial as `fire.Fire(partial(get_instruments, market_index="br_index" ))`
will make the interface easier for the user to execute the script.
Then all the collector.py CLI in each folder can remove a redundant arguments.

* refactor: implemented  class interface retry into YahooCollectorBR

* docs: added BR as a possible region into the documentation

* refactor: make retry attribute part of the interface

This way we don't have to use hasattr to access the retry attribute as previously done
2022-04-06 09:01:29 +08:00

278 lines
9.7 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from functools import partial
import sys
from pathlib import Path
import importlib
import datetime
import fire
import pandas as pd
from tqdm import tqdm
from loguru import logger
# Absolute directory that contains this script.
CUR_DIR = Path(__file__).resolve().parent
# Put the directory two levels above this script on sys.path; this must run
# before the `data_collector` imports that follow so they can be resolved.
sys.path.append(str(CUR_DIR.parent.parent))
from data_collector.index import IndexBase
from data_collector.utils import get_instruments
# Maps a four-month period label to the "MM-DD" suffix that is appended to the
# year when a "<year>_<period>" label (e.g. "2003_1Q") is turned into a date.
quarter_dict = {
    "1Q": "01-03",
    "2Q": "05-01",
    "3Q": "09-01",
}
class IBOVIndex(IndexBase):
    """Index collector for the Ibovespa (IBOV) index.

    The composition of the index is read from CSV files addressed through
    ``ibov_index_composition``; each file is identified by a four-month
    period label such as ``2003_1Q``.
    """

    # URL template of the composition CSV files; ``{}`` receives a period
    # label such as "2003_1Q" (or "date_first_added_<year>_<period>").
    ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"
    # Class-level default kept for backward compatibility. It is never mutated
    # in place: every instance receives its own freshly built list in __init__.
    years_4_month_periods = []

    def __init__(
        self,
        index_name: str,
        qlib_dir: [str, Path] = None,
        freq: str = "day",
        request_retry: int = 5,
        retry_sleep: int = 3,
    ):
        """Initialise the collector and pre-compute the period labels.

        Parameters
        ----------
        index_name: str
            name of the index, forwarded to ``IndexBase``
        qlib_dir: str or Path
            forwarded to ``IndexBase``
        freq: str
            data frequency, by default "day"; forwarded to ``IndexBase``
        request_retry: int
            forwarded to ``IndexBase``
        retry_sleep: int
            forwarded to ``IndexBase``
        """
        super().__init__(
            index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
        )
        self.today: datetime.date = datetime.date.today()
        self.current_4_month_period = self.get_current_4_month_period(self.today.month)
        self.year = str(self.today.year)
        self.years_4_month_periods = self.get_four_month_period()

    @property
    def bench_start_date(self) -> pd.Timestamp:
        """
        The ibovespa index started on 2 January 1968 (wiki), however,
        no suitable data source that keeps track of ibovespa's history
        stocks composition has been found. Except from the repo indicated
        in README. Which keeps track of such information starting from
        the first quarter of 2003
        """
        return pd.Timestamp("2003-01-03")

    def get_current_4_month_period(self, current_month: int):
        """
        Return the four-month period label a month belongs to.
        For example, if the current month is August (8), its
        four-month period is 2Q.

        OBS: In English Q is used to represent *quarter*
        which means a three-month period. However, in
        Portuguese we use Q to represent a four-month period.
        In other words,
            Jan, Feb, Mar, Apr: 1Q
            May, Jun, Jul, Aug: 2Q
            Sep, Oct, Nov, Dec: 3Q

        Parameters
        ----------
        current_month : int
            Current month (1 <= current_month <= 12)

        Returns
        -------
        current_4m_period: str
            Current four-month period ("1Q", "2Q" or "3Q").
            For a month greater than 12 the integer -1 is returned.
        """
        if current_month < 5:
            return "1Q"
        if current_month < 9:
            return "2Q"
        if current_month <= 12:
            return "3Q"
        return -1

    def get_four_month_period(self):
        """
        The ibovespa index is updated every four months.
        Therefore, each time period is represented as e.g. 2003_1Q,
        which means the first four-month period of 2003 (Jan, Feb, Mar, Apr).

        Returns
        -------
        list of str
            A new list with every period label from 2003_1Q up to and
            including the period that contains the current month.
        """
        four_months_period = ["1Q", "2Q", "3Q"]
        init_year = 2003
        now = datetime.datetime.now()
        current_year = now.year

        # Build a fresh list on every call. Appending to
        # ``self.years_4_month_periods`` would mutate the class-level list that
        # is shared by all instances and accumulate duplicated labels each time
        # an instance is created or this method is called again.
        periods = []
        for year in range(init_year, current_year):
            for label in four_months_period:
                periods.append(str(year) + "_" + label)

        # The current year only contributes the periods that have started.
        current_4_month_period = self.get_current_4_month_period(now.month)
        for number in range(1, int(current_4_month_period[0]) + 1):
            periods.append(str(current_year) + "_" + str(number) + "Q")
        return periods

    def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
        """formatting the datetime in an instrument

        For a frequency other than "day" only the end date is rewritten: 23:59
        is added to it and it is formatted as "%Y-%m-%d %H:%M:%S". For "day"
        both start and end dates are formatted as "%Y-%m-%d".
        The frame is modified in place and returned.

        Parameters
        ----------
        inst_df: pd.DataFrame
            inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]

        Returns
        -------
        inst_df: pd.DataFrame
        """
        logger.info("Formatting Datetime")
        if self.freq != "day":
            inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
            )
        else:
            inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
            )
            inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
            )
        return inst_df

    def format_quarter(self, cell: str):
        """
        Convert a period label into a date string.

        Parameters
        ----------
        cell: str
            It must be in the format 2003_1Q --> years_4_month_periods

        Returns
        ----------
        date: str
            The year joined with the period's entry in ``quarter_dict``,
            e.g. "2003_1Q" --> "2003-01-03"
        """
        cell_split = cell.split("_")
        return cell_split[0] + "-" + quarter_dict[cell_split[1]]

    def _read_composition_symbols(self, period: str) -> pd.Series:
        """Read the composition CSV of ``period`` and return its ``symbol`` column."""
        return pd.read_csv(self.ibov_index_composition.format(period), on_bad_lines="skip")["symbol"]

    def get_changes(self):
        """
        Access the index historic composition and compare it period
        by period in order to generate a frame that keeps track of
        which stocks have been removed and which have been added.

        Every composition file provides a ``symbol`` column. A symbol
        present in a period but missing from the following one is
        reported as "remove", dated with the earlier period; a symbol
        missing from a period but present in the following one is
        reported as "add", dated with the later period.

        Returns
        ----------
        pd.DataFrame or None:
            date          type     symbol
            2019-09-01    add      XXXX3.SA
            2019-05-01    remove   YYYY4.SA
            columns:
                date: str, as produced by ``format_quarter``
                type: str, value from ["add", "remove"]
                symbol: str, with the ".SA" suffix appended
            None is returned (after logging the error) when anything
            fails while reading or comparing the compositions.
        """
        logger.info("Getting companies changes in {} index ...".format(self.index_name))
        try:
            periods = self.years_4_month_periods
            df_changes_list = []
            # Each composition is read once and carried over to the next
            # iteration instead of being downloaded a second time.
            previous_symbols = self._read_composition_symbols(periods[0])
            for previous_period, current_period in tqdm(zip(periods, periods[1:]), total=len(periods) - 1):
                current_symbols = self._read_composition_symbols(current_period)

                ## Remove Dataframe
                remove_date = self.format_quarter(previous_period)
                list_remove = list(previous_symbols[~previous_symbols.isin(current_symbols)])
                df_removed = pd.DataFrame(
                    {
                        "date": len(list_remove) * [remove_date],
                        "type": len(list_remove) * ["remove"],
                        "symbol": list_remove,
                    }
                )

                ## Add Dataframe
                add_date = self.format_quarter(current_period)
                list_add = list(current_symbols[~current_symbols.isin(previous_symbols)])
                df_added = pd.DataFrame(
                    {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add}
                )

                df_changes_list.append(pd.concat([df_added, df_removed], sort=False))
                previous_symbols = current_symbols

            df = pd.concat(df_changes_list).reset_index(drop=True)
            df["symbol"] = df["symbol"].astype(str) + ".SA"
            return df
        except Exception as E:
            # Best effort, as before: the failure is logged and None is returned.
            logger.error("An error occurred while getting {} index changes - {}".format(self.index_name, E))
            return None

    def get_new_companies(self):
        """
        Get latest index composition.
        The repo indicated on README has implemented a script
        to get the latest index composition from B3 website using
        selenium. Therefore, this method will download the file
        containing such composition

        Returns
        ----------
        pd.DataFrame or None:
            symbol      start_date  end_date
            RRRP3.SA    2020-09-01  2022-01-03
            ALPA4.SA    2008-01-03  2022-01-03
            columns:
                symbol: str, with the ".SA" suffix appended
                start_date: str, "Date First Added" converted by ``format_quarter``
                end_date: str, the year joined with the current period's ``quarter_dict`` entry
            None is returned (after logging the error) when anything fails.
        """
        logger.info("Getting new companies in {} index ...".format(self.index_name))
        try:
            ## Get index composition
            current_period = self.year + "_" + self.current_4_month_period
            df_index = pd.read_csv(self.ibov_index_composition.format(current_period), on_bad_lines="skip")
            df_date_first_added = pd.read_csv(
                self.ibov_index_composition.format("date_first_added_" + current_period),
                on_bad_lines="skip",
            )
            df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]]
            df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter)

            # NOTE(review): the end date is set to the start date of the current
            # four-month period. An earlier comment stated "current quarter + 1";
            # confirm which one is intended.
            df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period]

            df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]]
            df["symbol"] = df["symbol"].astype(str) + ".SA"
            return df
        except Exception as E:
            # Best effort, as before: the failure is logged and None is returned.
            logger.error("An error occured while getting new companies - {}".format(E))
            return None

    def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` restricted to its "Código" column.

        When ``df`` has no "Código" column, None is returned.
        """
        if "Código" in df.columns:
            return df.loc[:, ["Código"]].copy()
        return None
if __name__ == "__main__":
    # Expose get_instruments on the command line with the market index already
    # bound to "br_index", so the user does not have to pass it.
    br_index_instruments = partial(get_instruments, market_index="br_index")
    fire.Fire(br_index_instruments)