1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

feat: check lowercase naming for qlib features directories (#2087)

* feat: check lowercase naming for qlib features directories

* docs: add background reference for lowercase features dir check
This commit is contained in:
Linlang
2026-01-19 10:15:51 +08:00
committed by GitHub
parent 477160e4ac
commit 80982f8904

View File

@@ -1,12 +1,12 @@
from loguru import logger
import os
from typing import Optional
import fire
import pandas as pd
import qlib
from loguru import logger
from tqdm import tqdm
import qlib
from qlib.data import D
@@ -36,6 +36,7 @@ class DataHealthChecker:
self.large_step_threshold_price = large_step_threshold_price
self.large_step_threshold_volume = large_step_threshold_volume
self.missing_data_num = missing_data_num
self.qlib_dir = os.path.abspath(os.path.expanduser(qlib_dir))
if csv_path:
assert os.path.isdir(csv_path), f"{csv_path} should be a directory."
@@ -68,6 +69,43 @@ class DataHealthChecker:
self.data[instrument] = df
print(df)
# NOTE:
# This check is added due to a known issue in Qlib where feature paths
# are constructed using lowercased instrument names. On case-sensitive
# file systems (e.g. Linux), uppercase directory names under `features/`
# will cause data loading failures.
#
# See: https://github.com/microsoft/qlib/issues/2053
def check_features_dir_lowercase(self) -> Optional[pd.DataFrame]:
    """
    Check whether all subdirectories under `<qlib_dir>/features` are named in lowercase.

    This validation helps prevent data loading issues on case-sensitive
    file systems caused by uppercase instrument directory names.

    Only directories are inspected; regular files directly under `features`
    are ignored regardless of their names.

    Returns
    -------
    Optional[pd.DataFrame]
        A DataFrame with a single ``non_lowercase_dir`` column listing the
        offending directory names in sorted order, or ``None`` when
        ``self.qlib_dir`` is empty, the ``features`` directory does not exist,
        or every subdirectory name is already lowercase.
    """
    if not self.qlib_dir:
        return None

    features_dir = os.path.join(self.qlib_dir, "features")
    if not os.path.isdir(features_dir):
        logger.warning(f"`features` directory not found under {self.qlib_dir}")
        return None

    # os.listdir() yields entries in arbitrary order; sort the offenders so the
    # report is reproducible across runs and file systems. The cheap string
    # comparison runs first so that only suspicious names hit the file system.
    bad_dirs = sorted(
        name
        for name in os.listdir(features_dir)
        if name != name.lower() and os.path.isdir(os.path.join(features_dir, name))
    )
    if bad_dirs:
        return pd.DataFrame({"non_lowercase_dir": bad_dirs})

    logger.info(f"✅ All subdirectories under `{features_dir}` are named in lowercase.")
    return None
def check_missing_data(self) -> Optional[pd.DataFrame]:
"""Check if any data is missing in the DataFrame."""
result_dict = {
@@ -177,11 +215,13 @@ class DataHealthChecker:
check_large_step_changes_result = self.check_large_step_changes()
check_required_columns_result = self.check_required_columns()
check_missing_factor_result = self.check_missing_factor()
check_features_dir_case_result = self.check_features_dir_lowercase()
if (
check_large_step_changes_result is not None
or check_large_step_changes_result is not None
or check_required_columns_result is not None
or check_missing_factor_result is not None
or check_features_dir_case_result is not None
):
print(f"\nSummary of data health check ({len(self.data)} files checked):")
print("-------------------------------------------------")
@@ -197,6 +237,11 @@ class DataHealthChecker:
if isinstance(check_missing_factor_result, pd.DataFrame):
logger.warning(f"The factor column does not exist or is empty")
print(check_missing_factor_result)
if isinstance(check_features_dir_case_result, pd.DataFrame):
logger.warning(
f"Some subdirectories under `{os.path.join(self.qlib_dir, 'features')}` contain uppercase letters, please rename them to lowercase manually."
)
print(check_features_dir_case_result)
if __name__ == "__main__":