mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-05 20:11:08 +08:00
change get_data url (#1558)
* change_url * fix_CI * fix_CI_2 * fix_CI_3 * fix_CI_4 * fix_CI_5 * fix_CI_6 * fix_CI_7 * fix_CI_8 * fix_CI_9 * fix_CI_10 * fix_CI_11 * fix_CI_12 * fix_CI_13 * fix_CI_13 * fix_CI_14 * fix_CI_15 * fix_CI_16 * fix_CI_17 * fix_CI_18 * fix_CI_19 * fix_CI_20 * fix_CI_21 * fix_CI_22 * fix_CI_23 * fix_CI_24 * fix_CI_25 * fix_CI_26 * fix_CI_27 * fix_get_data_error * fix_get_data_error2 * modify_get_data * modify_get_data2 * modify_get_data3 * modify_get_data4 * fix_CI_28 * fix_CI_29 * fix_CI_30 --------- Co-authored-by: Linlang <v-linlanglv@microsoft.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import qlib
|
||||
@@ -11,13 +12,15 @@ import datetime
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
from cryptography.fernet import Fernet
|
||||
from qlib.utils import exists_qlib_data
|
||||
|
||||
|
||||
class GetData:
|
||||
DATASET_VERSION = "v2"
|
||||
REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data"
|
||||
QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip"
|
||||
# "?" is not included in the token.
|
||||
TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy"
|
||||
KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA="
|
||||
|
||||
def __init__(self, delete_zip_file=False):
|
||||
"""
|
||||
@@ -29,24 +32,44 @@ class GetData:
|
||||
"""
|
||||
self.delete_zip_file = delete_zip_file
|
||||
|
||||
def normalize_dataset_version(self, dataset_version: str = None):
|
||||
if dataset_version is None:
|
||||
dataset_version = self.DATASET_VERSION
|
||||
return dataset_version
|
||||
def merge_remote_url(self, file_name: str):
|
||||
fernet = Fernet(self.KEY)
|
||||
token = fernet.decrypt(self.TOKEN).decode()
|
||||
return f"{self.REMOTE_URL}/{file_name}?{token}"
|
||||
|
||||
def merge_remote_url(self, file_name: str, dataset_version: str = None):
|
||||
return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}"
|
||||
def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True):
|
||||
"""
|
||||
Download the specified file to the target folder.
|
||||
|
||||
def _download_data(
|
||||
self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None
|
||||
):
|
||||
Parameters
|
||||
----------
|
||||
target_dir: str
|
||||
data save directory
|
||||
file_name: str
|
||||
dataset name, needs to endwith .zip, value from [rl_data.zip, csv_data_cn.zip, ...]
|
||||
may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip
|
||||
delete_old: bool
|
||||
delete an existing directory, by default True
|
||||
|
||||
Examples
|
||||
---------
|
||||
# get rl data
|
||||
python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data
|
||||
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token}
|
||||
|
||||
# get cn csv data
|
||||
python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data
|
||||
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token}
|
||||
-------
|
||||
|
||||
"""
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
target_dir.mkdir(exist_ok=True, parents=True)
|
||||
# saved file name
|
||||
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name
|
||||
_target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name)
|
||||
target_path = target_dir.joinpath(_target_file_name)
|
||||
|
||||
url = self.merge_remote_url(file_name, dataset_version)
|
||||
url = self.merge_remote_url(file_name)
|
||||
resp = requests.get(url, stream=True, timeout=60)
|
||||
resp.raise_for_status()
|
||||
if resp.status_code != 200:
|
||||
@@ -56,7 +79,7 @@ class GetData:
|
||||
logger.warning(
|
||||
f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)"
|
||||
)
|
||||
logger.info(f"{file_name} downloading......")
|
||||
logger.info(f"{os.path.basename(file_name)} downloading......")
|
||||
with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar:
|
||||
with target_path.open("wb") as fp:
|
||||
for chunk in resp.iter_content(chunk_size=chunk_size):
|
||||
@@ -67,8 +90,8 @@ class GetData:
|
||||
if self.delete_zip_file:
|
||||
target_path.unlink()
|
||||
|
||||
def check_dataset(self, file_name: str, dataset_version: str = None):
|
||||
url = self.merge_remote_url(file_name, dataset_version)
|
||||
def check_dataset(self, file_name: str):
|
||||
url = self.merge_remote_url(file_name)
|
||||
resp = requests.get(url, stream=True, timeout=60)
|
||||
status = True
|
||||
if resp.status_code == 404:
|
||||
@@ -140,9 +163,11 @@ class GetData:
|
||||
---------
|
||||
# get 1d data
|
||||
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
|
||||
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token}
|
||||
|
||||
# get 1min data
|
||||
python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn
|
||||
When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token}
|
||||
-------
|
||||
|
||||
"""
|
||||
@@ -155,29 +180,12 @@ class GetData:
|
||||
|
||||
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
|
||||
|
||||
def _get_file_name(v):
|
||||
return self.QLIB_DATA_NAME.format(
|
||||
dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v
|
||||
)
|
||||
def _get_file_name_with_version(qlib_version, dataset_version):
|
||||
dataset_version = "v2" if dataset_version is None else dataset_version
|
||||
file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip"
|
||||
return file_name_with_version
|
||||
|
||||
file_name = _get_file_name(qlib_version)
|
||||
if not self.check_dataset(file_name, version):
|
||||
file_name = _get_file_name("latest")
|
||||
self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version)
|
||||
|
||||
def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"):
|
||||
"""download cn csv data from remote
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_dir: str
|
||||
data save directory
|
||||
|
||||
Examples
|
||||
---------
|
||||
python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
|
||||
-------
|
||||
|
||||
"""
|
||||
file_name = "csv_data_cn.zip"
|
||||
self._download_data(file_name, target_dir)
|
||||
file_name = _get_file_name_with_version(qlib_version, dataset_version=version)
|
||||
if not self.check_dataset(file_name):
|
||||
file_name = _get_file_name_with_version("latest", dataset_version=version)
|
||||
self.download_data(file_name.lower(), target_dir, delete_old)
|
||||
|
||||
Reference in New Issue
Block a user