1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

feat: data improve, support parquet (#1966)

* refactor: relocate CLI modules to qlib.cli and update references

* refactor: introduce read_as_df and rename csv_path to data_path

* lint

* refactor: rename csv_path to data_path and use QSettings.provider_uri

* fix pylint error

* fix get_data command

* add comments to CI yaml

* update docs

---------

Co-authored-by: Linlang <Lv.Linlang@hotmail.com>
This commit is contained in:
you-n-g
2025-08-07 15:04:37 +08:00
committed by GitHub
parent 78b77e302b
commit 1b426503fc
21 changed files with 105 additions and 62 deletions

View File

@@ -60,6 +60,8 @@ jobs:
brew unlink libomp
brew install libomp.rb
# When the new version is released it should be changed to:
# python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
- name: Downloads dependencies data
run: |
cd ..

View File

@@ -104,7 +104,7 @@ jobs:
- name: Test workflow by config (install from source)
run: |
python -m pip install numba
python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
- name: Unit tests with Pytest
uses: nick-fields/retry@v2

View File

@@ -229,10 +229,10 @@ Load and prepare data by running the following code:
### Get with module
```bash
# get 1d data
python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
# get 1min data
python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
python -m qlib.cli.data qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
```
@@ -329,7 +329,7 @@ We recommend users to prepare their own data if they have a high-quality dataset
3. At this point you are in the docker environment and can run the qlib scripts. An example:
```bash
>>> python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
>>> python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
>>> python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
4. Exit the container
```bash
@@ -359,7 +359,7 @@ Qlib provides a tool named `qrun` to run the whole workflow automatically (inclu
```
If users want to use `qrun` under debug mode, please use the following command:
```bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
python -m pdb qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
```
The result of `qrun` is as follows, please refer to [docs](https://qlib.readthedocs.io/en/latest/component/strategy.html#result) for more explanations about the result.

View File

@@ -108,10 +108,10 @@ Automatic update of daily frequency data
Converting CSV Format into Qlib Format
--------------------------------------
Converting CSV and Parquet Format into Qlib Format
--------------------------------------------------
``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert **any** data in CSV format into `.bin` files (``Qlib`` format) as long as they are in the correct format.
``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert **any** data in CSV or Parquet format into `.bin` files (``Qlib`` format) as long as they are in the correct format.
Besides downloading the prepared demo data, users could download demo data directly from the Collector as follows for reference to the CSV format.
Here are some examples:
@@ -126,17 +126,17 @@ for 1min data:
python scripts/data_collector/yahoo/collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2021-05-20 --end 2021-05-23 --delay 0.1 --interval 1min --limit_nums 10
Users can also provide their own data in CSV format. However, the CSV data **must satisfies** following criterions:
Users can also provide their own data in CSV or Parquet format. However, the data **must satisfy** the following criteria:
- CSV file is named after a specific stock *or* the CSV file includes a column of the stock name
- CSV or Parquet file is named after a specific stock *or* the CSV or Parquet file includes a column of the stock name
- Name the CSV file after a stock: `SH600000.csv`, `AAPL.csv` (not case sensitive).
- Name the CSV or Parquet file after a stock: `SH600000.csv`, `AAPL.csv` or `SH600000.parquet`, `AAPL.parquet` (not case sensitive).
- CSV file includes a column of the stock name. User **must** specify the column name when dumping the data. Here is an example:
- CSV or Parquet file includes a column of the stock name. User **must** specify the column name when dumping the data. Here is an example:
.. code-block:: bash
python scripts/dump_bin.py dump_all ... --symbol_field_name symbol
python scripts/dump_bin.py dump_all ... --symbol_field_name symbol --file_suffix <.csv or .parquet>
where the data are in the following format:
@@ -146,11 +146,11 @@ Users can also provide their own data in CSV format. However, the CSV data **mus
| SH600000 | 120 |
+-----------+-------+
- CSV file **must** include a column for the date, and when dumping the data, user must specify the date column name. Here is an example:
- CSV or Parquet file **must** include a column for the date, and when dumping the data, user must specify the date column name. Here is an example:
.. code-block:: bash
python scripts/dump_bin.py dump_all ... --date_field_name date
python scripts/dump_bin.py dump_all ... --date_field_name date --file_suffix <.csv or .parquet>
where the data are in the following format:
@@ -163,23 +163,23 @@ Users can also provide their own data in CSV format. However, the CSV data **mus
+---------+------------+-------+------+----------+
Supposed that users prepare their CSV format data in the directory ``~/.qlib/csv_data/my_data``, they can run the following command to start the conversion.
Suppose that users prepare their CSV or Parquet format data in the directory ``~/.qlib/my_data``; they can then run the following command to start the conversion.
.. code-block:: bash
python scripts/dump_bin.py dump_all --csv_path ~/.qlib/csv_data/my_data --qlib_dir ~/.qlib/qlib_data/my_data --include_fields open,close,high,low,volume,factor
python scripts/dump_bin.py dump_all --data_path ~/.qlib/my_data --qlib_dir ~/.qlib/qlib_data/ --include_fields open,close,high,low,volume,factor --file_suffix <.csv or .parquet>
For other supported parameters when dumping the data into `.bin` file, users can refer to the information by running the following commands:
.. code-block:: bash
python dump_bin.py dump_all --help
python scripts/dump_bin.py dump_all --help
After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/my_data`.
After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/`.
.. note::
The arguments of `--include_fields` should correspond with the column names of CSV files. The columns names of dataset provided by ``Qlib`` should include open, close, high, low, volume and factor at least.
The arguments of `--include_fields` should correspond with the column names of the CSV or Parquet files. The column names of the dataset provided by ``Qlib`` should include at least open, close, high, low, volume and factor.
- `open`
The adjusted opening price
@@ -195,7 +195,7 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
The Restoration factor. Normally, ``factor = adjusted_price / original_price``, `adjusted price` reference: `split adjusted <https://www.investopedia.com/terms/s/splitadjusted.asp>`_
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
If you want to use your own alpha-factor which can't be calculate by OCHLV, like PE, EPS and so on, you could add it to the CSV files with OHCLV together and then dump it to the Qlib format data.
If you want to use your own alpha-factor which can't be calculated from OHLCV, like PE, EPS and so on, you could add it to the CSV or Parquet files together with OHLCV and then dump it to the Qlib format data.
Checking the health of the data
-------------------------------

View File

@@ -110,7 +110,7 @@ If users want to use ``qrun`` under debug mode, please use the following command
.. code-block:: bash
python -m pdb qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
python -m pdb qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
.. note::

View File

@@ -52,7 +52,7 @@ How to use qlib images
.. code-block:: bash
>>> python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
>>> python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
>>> python qlib/cli/run.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
3. Exit the container

View File

@@ -7,7 +7,7 @@ This folder comprises an example of Reinforcement Learning (RL) workflows for or
### Get Data
```
python -m qlib.run.get_data qlib_data qlib_data --target_dir ./data/bin --region hs300 --interval 5min
python -m qlib.cli.data qlib_data --target_dir ./data/bin --region hs300 --interval 5min
```
### Generate Pickle-Style Data

View File

@@ -103,4 +103,4 @@ packages = [
]
[project.scripts]
qrun = "qlib.workflow.cli:run"
qrun = "qlib.cli.run:run"

View File

@@ -897,6 +897,7 @@ class Exchange:
# if we don't know current position, we choose to sell all
# Otherwise, we clip the amount based on current position
if position is not None:
# TODO: make the trading shortable
current_amount = (
position.get_stock_amount(order.stock_id) if position.check_stock(order.stock_id) else 0
)

View File

@@ -87,7 +87,7 @@ def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
"""
This is a Qlib CLI entrance.
User can run the whole Quant research workflow defined by a configure file
- the code is located here ``qlib/workflow/cli.py`
- the code is located here ``qlib/cli/run.py``
User can specify a base_config file in your workflow.yml file by adding "BASE_CONFIG_PATH".
Qlib will load the configuration in BASE_CONFIG_PATH first, and the user only needs to update the custom fields

View File

@@ -49,6 +49,7 @@ class QSettings(BaseSettings):
"""
mlflow: MLflowSettings = MLflowSettings()
provider_uri: str = "~/.qlib/qlib_data/cn_data"
model_config = SettingsConfigDict(
env_prefix="QLIB_",
@@ -261,7 +262,7 @@ MODE_CONF = {
},
"client": {
# config it in user's own code
"provider_uri": "~/.qlib/qlib_data/cn_data",
"provider_uri": QSETTINGS.provider_uri,
# cache
# Using parameter 'remote' to announce the client is using server_cache, and the writing access will be disabled.
# Disable cache by default. Avoid introduce advanced features for beginners

View File

@@ -64,7 +64,7 @@
This will convert the normalized csv in `feature` directory as numpy array and store the normalized data one file per column and one symbol per directory.
- parameters:
- `csv_path`: stock data path or directory, **normalize result(normalize_dir)**
- `data_path`: stock data path or directory, **normalize result(normalize_dir)**
- `qlib_dir`: qlib(dump) data directory
- `freq`: transaction frequency, by default `day`
> `freq_map = {1d:day, 5min: 5min}`
@@ -74,8 +74,9 @@
> dump_fields = `include_fields if include_fields else set(symbol_df.columns) - set(exclude_fields) if exclude_fields else symbol_df.columns`
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
- `date_field_name`: column *name* identifying time in csv files, by default `date`
- `file_suffix`: stock data file format, by default ".csv"
- examples:
```bash
# dump 5min cn
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/hs300_5min_nor --qlib_dir ~/.qlib/qlib_data/hs300_5min_bin --freq 5min --exclude_fields date,symbol
python dump_bin.py dump_all --data_path ~/.qlib/stock_data/source/hs300_5min_nor --qlib_dir ~/.qlib/qlib_data/hs300_5min_bin --freq 5min --exclude_fields date,symbol
```

View File

@@ -28,7 +28,7 @@ python collector.py normalize_data --source_dir ~/.qlib/crypto_data/source/1d --
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/crypto_data/source/1d_nor --qlib_dir ~/.qlib/qlib_data/crypto_data --freq day --date_field_name date --include_fields prices,total_volumes,market_caps
python dump_bin.py dump_all --data_path ~/.qlib/crypto_data/source/1d_nor --qlib_dir ~/.qlib/qlib_data/crypto_data --freq day --date_field_name date --include_fields prices,total_volumes,market_caps
```

View File

@@ -25,7 +25,7 @@ python collector.py normalize_data --source_dir ~/.qlib/fund_data/source/cn_data
# dump data
cd qlib/scripts
python dump_bin.py dump_all --csv_path ~/.qlib/fund_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_fund_data --freq day --date_field_name FSRQ --include_fields DWJZ,LJJZ
python dump_bin.py dump_all --data_path ~/.qlib/fund_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_fund_data --freq day --date_field_name FSRQ --include_fields DWJZ,LJJZ
```

View File

@@ -36,5 +36,5 @@ python collector.py normalize_data --interval quarterly --source_dir ~/.qlib/sto
```bash
cd qlib/scripts
python dump_pit.py dump --csv_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly
python dump_pit.py dump --data_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly
```

View File

@@ -139,7 +139,7 @@ pip install -r requirements.txt
This will convert the normalized csv in `feature` directory as numpy array and store the normalized data one file per column and one symbol per directory.
- parameters:
- `csv_path`: stock data path or directory, **normalize result(normalize_dir)**
- `data_path`: stock data path or directory, **normalize result(normalize_dir)**
- `qlib_dir`: qlib(dump) data directory
- `freq`: transaction frequency, by default `day`
> `freq_map = {1d:day, 1min: 1min}`
@@ -149,12 +149,13 @@ pip install -r requirements.txt
> dump_fields = `include_fields if include_fields else set(symbol_df.columns) - set(exclude_fields) if exclude_fields else symbol_df.columns`
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
- `date_field_name`: column *name* identifying time in csv files, by default `date`
- `file_suffix`: stock data file format, by default ".csv"
- examples:
```bash
# dump 1d cn
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_data --freq day --exclude_fields date,symbol
python dump_bin.py dump_all --data_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_data --freq day --exclude_fields date,symbol --file_suffix .csv
# dump 1min cn
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min --exclude_fields date,symbol
python dump_bin.py dump_all --data_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min --exclude_fields date,symbol --file_suffix .csv
```
### Automatic update of daily frequency data(from yahoo finance)

View File

@@ -856,7 +856,7 @@ class Run(BaseRun):
3. normalize new source data(from step 2): python scripts/data_collector/yahoo/collector.py normalize_data_1d_extend --old_qlib_dir <dir1> --source_dir <dir2> --normalize_dir <dir3> --region CN --interval 1d
4. dump data: python scripts/dump_bin.py dump_update --csv_path <dir3> --qlib_dir <dir1> --freq day --date_field_name date --symbol_field_name symbol --exclude_fields symbol,date
4. dump data: python scripts/dump_bin.py dump_update --data_path <dir3> --qlib_dir <dir1> --freq day --date_field_name date --symbol_field_name symbol --exclude_fields symbol,date
5. update instrument(eg. csi300): python scripts/data_collector/cn_index/collector.py --index_name CSI300 --qlib_dir <dir1> --method parse_instruments
@@ -997,7 +997,7 @@ class Run(BaseRun):
# dump bin
_dump = DumpDataUpdate(
csv_path=self.normalize_dir,
data_path=self.normalize_dir,
qlib_dir=qlib_data_1d_dir,
exclude_fields="symbol,date",
max_workers=self.max_workers,

View File

@@ -17,6 +17,39 @@ from loguru import logger
from qlib.utils import fname_to_code, code_to_fname
def read_as_df(file_path: Union[str, Path], **kwargs) -> pd.DataFrame:
    """
    Read a csv or parquet file into a pandas DataFrame.

    The reader is selected from the file suffix, compared case-insensitively.

    Parameters
    ----------
    file_path : Union[str, Path]
        Path to the data file. A leading ``~`` is expanded.
    **kwargs :
        Optional reader arguments. Only the keys allow-listed for the
        detected format are forwarded to pandas (``low_memory`` for csv,
        nothing for parquet). Any other key is silently dropped, so a
        single call site can serve both formats without the parquet
        reader receiving csv-only options.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If the suffix is neither ``.csv`` nor ``.parquet``.
    """
    file_path = Path(file_path).expanduser()
    suffix = file_path.suffix.lower()

    # One table per concern: which pandas reader handles a suffix, and which
    # caller-supplied options that reader is allowed to receive.
    readers = {".csv": pd.read_csv, ".parquet": pd.read_parquet}
    forwarded_keys = {".csv": ("low_memory",), ".parquet": ()}

    reader = readers.get(suffix)
    if reader is None:
        raise ValueError(f"Unsupported file format: {suffix}")

    reader_kwargs = {key: kwargs[key] for key in forwarded_keys[suffix] if key in kwargs}
    return reader(file_path, **reader_kwargs)
class DumpDataBase:
INSTRUMENTS_START_FIELD = "start_datetime"
INSTRUMENTS_END_FIELD = "end_datetime"
@@ -34,7 +67,7 @@ class DumpDataBase:
def __init__(
self,
csv_path: str,
data_path: str,
qlib_dir: str,
backup_dir: str = None,
freq: str = "day",
@@ -50,7 +83,7 @@ class DumpDataBase:
Parameters
----------
csv_path: str
data_path: str
stock data path or directory
qlib_dir: str
qlib(dump) data director
@@ -73,7 +106,7 @@ class DumpDataBase:
limit_nums: int
Use when debugging, default None
"""
csv_path = Path(csv_path).expanduser()
data_path = Path(data_path).expanduser()
if isinstance(exclude_fields, str):
exclude_fields = exclude_fields.split(",")
if isinstance(include_fields, str):
@@ -82,9 +115,9 @@ class DumpDataBase:
self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
self.file_suffix = file_suffix
self.symbol_field_name = symbol_field_name
self.csv_files = sorted(csv_path.glob(f"*{self.file_suffix}") if csv_path.is_dir() else [csv_path])
self.df_files = sorted(data_path.glob(f"*{self.file_suffix}") if data_path.is_dir() else [data_path])
if limit_nums is not None:
self.csv_files = self.csv_files[: int(limit_nums)]
self.df_files = self.df_files[: int(limit_nums)]
self.qlib_dir = Path(qlib_dir).expanduser()
self.backup_dir = backup_dir if backup_dir is None else Path(backup_dir).expanduser()
if backup_dir is not None:
@@ -134,13 +167,14 @@ class DumpDataBase:
return _calendars.tolist()
def _get_source_data(self, file_path: Path) -> pd.DataFrame:
df = pd.read_csv(str(file_path.resolve()), low_memory=False)
df[self.date_field_name] = df[self.date_field_name].astype(str).astype("datetime64[ns]")
df = read_as_df(file_path, low_memory=False)
if self.date_field_name in df.columns:
df[self.date_field_name] = pd.to_datetime(df[self.date_field_name])
# df.drop_duplicates([self.date_field_name], inplace=True)
return df
def get_symbol_from_file(self, file_path: Path) -> str:
return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower())
return fname_to_code(file_path.stem.strip().lower())
def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]:
return (
@@ -274,10 +308,10 @@ class DumpDataAll(DumpDataBase):
all_datetime = set()
date_range_list = []
_fun = partial(self._get_date, as_set=True, is_begin_end=True)
with tqdm(total=len(self.csv_files)) as p_bar:
with tqdm(total=len(self.df_files)) as p_bar:
with ProcessPoolExecutor(max_workers=self.works) as executor:
for file_path, ((_begin_time, _end_time), _set_calendars) in zip(
self.csv_files, executor.map(_fun, self.csv_files)
self.df_files, executor.map(_fun, self.df_files)
):
all_datetime = all_datetime | _set_calendars
if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
@@ -305,9 +339,9 @@ class DumpDataAll(DumpDataBase):
def _dump_features(self):
logger.info("start dump features......")
_dump_func = partial(self._dump_bin, calendar_list=self._calendars_list)
with tqdm(total=len(self.csv_files)) as p_bar:
with tqdm(total=len(self.df_files)) as p_bar:
with ProcessPoolExecutor(max_workers=self.works) as executor:
for _ in executor.map(_dump_func, self.csv_files):
for _ in executor.map(_dump_func, self.df_files):
p_bar.update()
logger.info("end of features dump.\n")
@@ -325,16 +359,15 @@ class DumpDataFix(DumpDataAll):
_fun = partial(self._get_date, is_begin_end=True)
new_stock_files = sorted(
filter(
lambda x: fname_to_code(x.name[: -len(self.file_suffix)].strip().lower()).upper()
not in self._old_instruments,
self.csv_files,
lambda x: self.get_symbol_from_file(x).upper() not in self._old_instruments,
self.df_files,
)
)
with tqdm(total=len(new_stock_files)) as p_bar:
with ProcessPoolExecutor(max_workers=self.works) as execute:
for file_path, (_begin_time, _end_time) in zip(new_stock_files, execute.map(_fun, new_stock_files)):
if isinstance(_begin_time, pd.Timestamp) and isinstance(_end_time, pd.Timestamp):
symbol = fname_to_code(self.get_symbol_from_file(file_path).lower()).upper()
symbol = self.get_symbol_from_file(file_path).upper()
_dt_map = self._old_instruments.setdefault(symbol, dict())
_dt_map[self.INSTRUMENTS_START_FIELD] = self._format_datetime(_begin_time)
_dt_map[self.INSTRUMENTS_END_FIELD] = self._format_datetime(_end_time)
@@ -359,7 +392,7 @@ class DumpDataFix(DumpDataAll):
class DumpDataUpdate(DumpDataBase):
def __init__(
self,
csv_path: str,
data_path: str,
qlib_dir: str,
backup_dir: str = None,
freq: str = "day",
@@ -375,7 +408,7 @@ class DumpDataUpdate(DumpDataBase):
Parameters
----------
csv_path: str
data_path: str
stock data path or directory
qlib_dir: str
qlib(dump) data director
@@ -399,7 +432,7 @@ class DumpDataUpdate(DumpDataBase):
Use when debugging, default None
"""
super().__init__(
csv_path,
data_path,
qlib_dir,
backup_dir,
freq,
@@ -431,15 +464,19 @@ class DumpDataUpdate(DumpDataBase):
logger.info("start load all source data....")
all_df = []
def _read_csv(file_path: Path):
_df = pd.read_csv(file_path, parse_dates=[self.date_field_name])
def _read_df(file_path: Path):
_df = read_as_df(file_path)
if self.date_field_name in _df.columns and not np.issubdtype(
_df[self.date_field_name].dtype, np.datetime64
):
_df[self.date_field_name] = pd.to_datetime(_df[self.date_field_name])
if self.symbol_field_name not in _df.columns:
_df[self.symbol_field_name] = self.get_symbol_from_file(file_path)
return _df
with tqdm(total=len(self.csv_files)) as p_bar:
with tqdm(total=len(self.df_files)) as p_bar:
with ThreadPoolExecutor(max_workers=self.works) as executor:
for df in executor.map(_read_csv, self.csv_files):
for df in executor.map(_read_df, self.df_files):
if not df.empty:
all_df.append(df)
p_bar.update()

View File

@@ -36,7 +36,7 @@ class TestDumpData(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR)
TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS)
TestDumpData.DUMP_DATA = DumpDataAll(data_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS)
TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv")))
provider_uri = str(QLIB_DIR.resolve())
qlib.init(
@@ -76,7 +76,7 @@ class TestDumpData(unittest.TestCase):
def test_4_dump_features_simple(self):
stock = self.STOCK_NAMES[0]
dump_data = DumpDataFix(
csv_path=SOURCE_DIR.joinpath(f"{stock.lower()}.csv"), qlib_dir=QLIB_DIR, include_fields=self.FIELDS
data_path=SOURCE_DIR.joinpath(f"{stock.lower()}.csv"), qlib_dir=QLIB_DIR, include_fields=self.FIELDS
)
dump_data.dump()