1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

solve conflict

This commit is contained in:
bxdd
2021-06-01 17:46:47 +08:00
31 changed files with 606 additions and 765 deletions

View File

@@ -72,12 +72,19 @@ Converting CSV Format into Qlib Format
``Qlib`` has provided the script ``scripts/dump_bin.py`` to convert **any** data in CSV format into `.bin` files (``Qlib`` format) as long as they are in the correct format.
Users can download the demo china-stock data in CSV format as follows for reference to the CSV format.
Besides downloading the prepared demo data, users could download demo data directly from the Collector as follows for reference to the CSV format.
Here are some example:
.. code-block:: bash
for daily data:
.. code-block:: bash
python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data
for 1min data:
.. code-block:: bash
python scripts/data_collector/yahoo/collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_1min --region CN --start 2021-05-20 --end 2021-05-23 --delay 0.1 --interval 1min --limit_nums 10
Users can also provide their own data in CSV format. However, the CSV data **must satisfies** following criterions:
- CSV file is named after a specific stock *or* the CSV file includes a column of the stock name
@@ -145,6 +152,16 @@ After conversion, users can find their Qlib format data in the directory `~/.qli
In the convention of `Qlib` data processing, `open, close, high, low, volume, money and factor` will be set to NaN if the stock is suspended.
Stock Pool (Market)
--------------------------------
``Qlib`` defines `stock pool <https://github.com/microsoft/qlib/blob/main/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml#L4>`_ as stock list and their date ranges. Predefined stock pools (e.g. csi300) may be imported as follows.
.. code-block:: bash
python collector.py --index_name CSI300 --qlib_dir <user qlib data dir> --method parse_instruments
Multiple Stock Modes
--------------------------------

View File

@@ -101,7 +101,7 @@ Graphical Result
- Axis Y:
- `ic`
The `Pearson correlation coefficient` series between `label` and `prediction score`.
In the above example, the `label` is formulated as `Ref($close, -1)/$close - 1`. Please refer to `Data Featrue <data.html#feature>`_ for more details.
In the above example, the `label` is formulated as `Ref($close, -1)/$close - 1`. Please refer to `Data Feature <data.html#feature>`_ for more details.
- `rank_ic`
The `Spearman's rank correlation coefficient` series between `label` and `prediction score`.

View File

@@ -111,8 +111,6 @@ Usage & Example
pred_score, strategy=strategy, **BACKTEST_CONFIG
)
Also, the above example has been given in ``examples/train_backtest_analyze.ipynb``.
To know more about the `prediction score` `pred_score` output by ``Forecast Model``, please refer to `Forecast Model: Model Training & Prediction <model.html>`_.
To know more about ``Intraday Trading``, please refer to `Intraday Trading: Model&Strategy Testing <backtest.html>`_.

View File

@@ -82,7 +82,7 @@ The Custom models need to inherit `qlib.model.base.Model <../reference/api.html#
return pd.Series(self.model.predict(x_test.values), index=x_test.index)
- Override the `finetune` method (Optional)
- This method is optional to the users, and when users one to use this method on their own models, they should inherit the ``ModelFT`` base class, which includes the interface of `finetune`.
- This method is optional to the users. When users want to use this method on their own models, they should inherit the ``ModelFT`` base class, which includes the interface of `finetune`.
- The parameters must include the parameter `dataset`.
- Code Example: In the following example, users will use `LightGBM` as the model and finetune it.
.. code-block:: Python

View File

@@ -1,24 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import fire
from pathlib import Path
import qlib
import pickle
import numpy as np
import pandas as pd
from qlib.config import REG_CN, HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import init_instance_by_config, exists_qlib_data
from qlib.utils import init_instance_by_config
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.ops import Operators
from qlib.data.data import Cal
@@ -96,9 +85,7 @@ class HighfreqWorkflow:
# use yahoo_cn_1min data
QLIB_INIT_CONFIG = {**HIGH_FREQ_CONFIG, **self.SPEC_CONF}
provider_uri = QLIB_INIT_CONFIG.get("provider_uri")
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN)
GetData().qlib_data(target_dir=provider_uri, interval="1min", region=REG_CN, exists_skip=True)
qlib.init(**QLIB_INIT_CONFIG)
def _prepare_calender_cache(self):

View File

@@ -1,46 +1,9 @@
import qlib
from qlib.config import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
import optuna
provider_uri = "~/.qlib/qlib_data/cn_data"
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(scripts_dir))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region="cn")
qlib.init(provider_uri=provider_uri, region="cn")
market = "csi300"
benchmark = "SH000300"
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": market,
}
dataset_task = {
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
},
}
dataset = init_instance_by_config(dataset_task["dataset"])
from qlib.config import REG_CN
from qlib.utils import init_instance_by_config
from qlib.tests.config import CSI300_DATASET_CONFIG
from qlib.tests.data import GetData
def objective(trial):
@@ -65,12 +28,19 @@ def objective(trial):
},
},
}
evals_result = dict()
model = init_instance_by_config(task["model"])
model.fit(dataset, evals_result=evals_result)
return min(evals_result["valid"])
study = optuna.Study(study_name="LGBM_158", storage="sqlite:///db.sqlite3")
study.optimize(objective, n_jobs=6)
if __name__ == "__main__":
provider_uri = "~/.qlib/qlib_data/cn_data"
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region="cn")
dataset = init_instance_by_config(CSI300_DATASET_CONFIG)
study = optuna.Study(study_name="LGBM_158", storage="sqlite:///db.sqlite3")
study.optimize(objective, n_jobs=6)

View File

@@ -1,46 +1,11 @@
import qlib
from qlib.config import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
import optuna
from qlib.config import REG_CN
from qlib.utils import init_instance_by_config
from qlib.tests.data import GetData
from qlib.tests.config import get_dataset_config, CSI300_MARKET, DATASET_ALPHA360_CLASS
provider_uri = "~/.qlib/qlib_data/cn_data"
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(scripts_dir))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region="cn")
qlib.init(provider_uri=provider_uri, region="cn")
market = "csi300"
benchmark = "SH000300"
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": market,
}
dataset_task = {
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha360",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
},
}
dataset = init_instance_by_config(dataset_task["dataset"])
DATASET_CONFIG = get_dataset_config(market=CSI300_MARKET, dataset_class=DATASET_ALPHA360_CLASS)
def objective(trial):
@@ -72,5 +37,13 @@ def objective(trial):
return min(evals_result["valid"])
study = optuna.Study(study_name="LGBM_360", storage="sqlite:///db.sqlite3")
study.optimize(objective, n_jobs=6)
if __name__ == "__main__":
provider_uri = "~/.qlib/qlib_data/cn_data"
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN)
dataset = init_instance_by_config(DATASET_CONFIG)
study = optuna.Study(study_name="LGBM_360", storage="sqlite:///db.sqlite3")
study.optimize(objective, n_jobs=6)

View File

@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import qlib
from qlib.config import REG_CN
from qlib.utils import init_instance_by_config
from qlib.tests.data import GetData
from qlib.tests.config import CSI300_GBDT_TASK
if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN)
###################################
# train model
###################################
# model initialization
model = init_instance_by_config(CSI300_GBDT_TASK["model"])
dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
model.fit(dataset)
# get model feature importance
feature_importance = model.get_feature_importance()
print("feature importance:")
print(feature_importance)

View File

@@ -17,63 +17,7 @@ from qlib.workflow.task.manage import TaskManager
from qlib.workflow.task.collect import RecorderCollector
from qlib.model.ens.group import RollingGroup
from qlib.model.trainer import TrainerRM
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": "csi100",
}
dataset_config = {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
}
record_config = [
{
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
},
{
"class": "SigAnaRecord",
"module_path": "qlib.workflow.record_temp",
},
]
# use lgb
task_lgb_config = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
},
"dataset": dataset_config,
"record": record_config,
}
# use xgboost
task_xgboost_config = {
"model": {
"class": "XGBModel",
"module_path": "qlib.contrib.model.xgboost",
},
"dataset": dataset_config,
"record": record_config,
}
from qlib.tests.config import CSI100_RECORD_LGB_TASK_CONFIG, CSI100_RECORD_XGBOOST_TASK_CONFIG
class RollingTaskExample:
@@ -85,11 +29,13 @@ class RollingTaskExample:
task_db_name="rolling_db",
experiment_name="rolling_exp",
task_pool="rolling_task",
task_config=[task_xgboost_config, task_lgb_config],
task_config=None,
rolling_step=550,
rolling_type=RollingGen.ROLL_SD,
):
# TaskManager config
if task_config is None:
task_config = [CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG]
mongo_conf = {
"task_url": task_url,
"task_db_name": task_db_name,

View File

@@ -13,63 +13,7 @@ from qlib.workflow.online.manager import OnlineManager
from qlib.workflow.online.strategy import RollingStrategy
from qlib.workflow.task.gen import RollingGen
from qlib.workflow.task.manage import TaskManager
data_handler_config = {
"start_time": "2018-01-01",
"end_time": "2018-10-31",
"fit_start_time": "2018-01-01",
"fit_end_time": "2018-03-31",
"instruments": "csi100",
}
dataset_config = {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2018-01-01", "2018-03-31"),
"valid": ("2018-04-01", "2018-05-31"),
"test": ("2018-06-01", "2018-09-10"),
},
},
}
record_config = [
{
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
},
{
"class": "SigAnaRecord",
"module_path": "qlib.workflow.record_temp",
},
]
# use lgb model
task_lgb_config = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
},
"dataset": dataset_config,
"record": record_config,
}
# use xgboost model
task_xgboost_config = {
"model": {
"class": "XGBModel",
"module_path": "qlib.contrib.model.xgboost",
},
"dataset": dataset_config,
"record": record_config,
}
from qlib.tests.config import CSI100_RECORD_LGB_TASK_CONFIG, CSI100_RECORD_XGBOOST_TASK_CONFIG
class OnlineSimulationExample:
@@ -84,7 +28,7 @@ class OnlineSimulationExample:
rolling_step=80,
start_time="2018-09-10",
end_time="2018-10-31",
tasks=[task_xgboost_config, task_lgb_config],
tasks=None,
):
"""
Init OnlineManagerExample.
@@ -101,6 +45,8 @@ class OnlineSimulationExample:
end_time (str, optional): the end time of simulating. Defaults to "2018-10-31".
tasks (dict or list[dict]): a set of the task config waiting for rolling and training
"""
if tasks is None:
tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG]
self.exp_name = exp_name
self.task_pool = task_pool
self.start_time = start_time

View File

@@ -17,62 +17,7 @@ from qlib.workflow import R
from qlib.workflow.online.strategy import RollingStrategy
from qlib.workflow.task.gen import RollingGen
from qlib.workflow.online.manager import OnlineManager
data_handler_config = {
"start_time": "2013-01-01",
"end_time": "2020-09-25",
"fit_start_time": "2013-01-01",
"fit_end_time": "2014-12-31",
"instruments": "csi100",
}
dataset_config = {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2013-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2015-12-31"),
"test": ("2016-01-01", "2020-07-10"),
},
},
}
record_config = [
{
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
},
{
"class": "SigAnaRecord",
"module_path": "qlib.workflow.record_temp",
},
]
# use lgb model
task_lgb_config = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
},
"dataset": dataset_config,
"record": record_config,
}
# use xgboost model
task_xgboost_config = {
"model": {
"class": "XGBModel",
"module_path": "qlib.contrib.model.xgboost",
},
"dataset": dataset_config,
"record": record_config,
}
from qlib.tests.config import CSI100_RECORD_XGBOOST_TASK_CONFIG, CSI100_RECORD_LGB_TASK_CONFIG
class RollingOnlineExample:
@@ -83,9 +28,13 @@ class RollingOnlineExample:
task_url="mongodb://10.0.0.4:27017/",
task_db_name="rolling_db",
rolling_step=550,
tasks=[task_xgboost_config],
add_tasks=[task_lgb_config],
tasks=None,
add_tasks=None,
):
if add_tasks is None:
add_tasks = [CSI100_RECORD_LGB_TASK_CONFIG]
if tasks is None:
tasks = [CSI100_RECORD_XGBOOST_TASK_CONFIG]
mongo_conf = {
"task_url": task_url, # your MongoDB url
"task_db_name": task_db_name, # database name

View File

@@ -7,56 +7,19 @@ There are two parts including first_train and update_online_pred.
Firstly, we will finish the training and set the trained models to the `online` models.
Next, we will finish updating online predictions.
"""
import copy
import fire
import qlib
from qlib.config import REG_CN
from qlib.model.trainer import task_train
from qlib.workflow.online.utils import OnlineToolR
from qlib.tests.config import CSI300_GBDT_TASK
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": "csi100",
}
task = copy.deepcopy(CSI300_GBDT_TASK)
task = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
"kwargs": {
"loss": "mse",
"colsample_bytree": 0.8879,
"learning_rate": 0.0421,
"subsample": 0.8789,
"lambda_l1": 205.6999,
"lambda_l2": 580.9768,
"max_depth": 8,
"num_leaves": 210,
"num_threads": 20,
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
},
"record": {
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
},
task["record"] = {
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
}

View File

@@ -4,13 +4,11 @@
import qlib
import fire
import pickle
import pandas as pd
from datetime import datetime
from qlib.config import REG_CN
from qlib.data.dataset.handler import DataHandlerLP
from qlib.contrib.data.handler import Alpha158
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.utils import init_instance_by_config
from qlib.tests.data import GetData
@@ -24,9 +22,7 @@ class RollingDataWorkflow:
def _init_qlib(self):
"""initialize qlib"""
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN)
def _dump_pre_handler(self, path):

View File

@@ -5,13 +5,11 @@ import os
import sys
import fire
import time
import venv
import glob
import shutil
import signal
import inspect
import tempfile
import traceback
import functools
import statistics
import subprocess
@@ -23,8 +21,7 @@ from pprint import pprint
import qlib
from qlib.config import REG_CN
from qlib.workflow import R
from qlib.workflow.cli import workflow
from qlib.utils import exists_qlib_data
from qlib.tests.data import GetData
# init qlib
@@ -39,12 +36,8 @@ exp_manager = {
"default_exp_name": "Experiment",
},
}
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN, exp_manager=exp_manager)
# decorator to check the arguments

View File

@@ -12,10 +12,7 @@ if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True)
qlib.init(provider_uri=provider_uri, region=REG_CN)
market = "csi300"
@@ -112,7 +109,7 @@ if __name__ == "__main__":
# start exp
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
model.fit(dataset)
R.save_objects(**{"params.pkl": model})

View File

@@ -10,9 +10,10 @@ from catboost.utils import get_gpu_device_count
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...model.interpret.base import FeatureInt
class CatBoostModel(Model):
class CatBoostModel(Model, FeatureInt):
"""CatBoost Model"""
def __init__(self, loss="RMSE", **kwargs):
@@ -69,6 +70,18 @@ class CatBoostModel(Model):
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
return pd.Series(self.model.predict(x_test.values), index=x_test.index)
def get_feature_importance(self, *args, **kwargs) -> pd.Series:
"""get feature importance
Notes
-----
parameters references:
https://catboost.ai/docs/concepts/python-reference_catboost_get_feature_importance.html#python-reference_catboost_get_feature_importance
"""
return pd.Series(
data=self.model.get_feature_importance(*args, **kwargs), index=self.model.feature_names_
).sort_values(ascending=False)
if __name__ == "__main__":
cat = CatBoostModel()

View File

@@ -1,251 +1,265 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import lightgbm as lgb
import numpy as np
import pandas as pd
from typing import Text, Union
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...log import get_module_logger
class DEnsembleModel(Model):
"""Double Ensemble Model"""
def __init__(
self,
base_model="gbm",
loss="mse",
num_models=6,
enable_sr=True,
enable_fs=True,
alpha1=1.0,
alpha2=1.0,
bins_sr=10,
bins_fs=5,
decay=None,
sample_ratios=None,
sub_weights=None,
epochs=100,
**kwargs
):
self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm"
self.num_models = num_models # the number of sub-models
self.enable_sr = enable_sr
self.enable_fs = enable_fs
self.alpha1 = alpha1
self.alpha2 = alpha2
self.bins_sr = bins_sr
self.bins_fs = bins_fs
self.decay = decay
if sample_ratios is None: # the default values for sample_ratios
sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4]
if sub_weights is None: # the default values for sub_weights
sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2]
if not len(sample_ratios) == bins_fs:
raise ValueError("The length of sample_ratios should be equal to bins_fs.")
self.sample_ratios = sample_ratios
if not len(sub_weights) == num_models:
raise ValueError("The length of sub_weights should be equal to num_models.")
self.sub_weights = sub_weights
self.epochs = epochs
self.logger = get_module_logger("DEnsembleModel")
self.logger.info("Double Ensemble Model...")
self.ensemble = [] # the current ensemble model, a list contains all the sub-models
self.sub_features = [] # the features for each sub model in the form of pandas.Index
self.params = {"objective": loss}
self.params.update(kwargs)
self.loss = loss
def fit(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
x_train, y_train = df_train["feature"], df_train["label"]
# initialize the sample weights
N, F = x_train.shape
weights = pd.Series(np.ones(N, dtype=float))
# initialize the features
features = x_train.columns
pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
# train sub-models
for k in range(self.num_models):
self.sub_features.append(features)
self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
model_k = self.train_submodel(df_train, df_valid, weights, features)
self.ensemble.append(model_k)
# no further sample re-weight and feature selection needed for the last sub-model
if k + 1 == self.num_models:
break
self.logger.info("Retrieving loss curve and loss values...")
loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
pred_k = self.predict_sub(model_k, df_train, features)
pred_sub.iloc[:, k] = pred_k
pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
if self.enable_sr:
self.logger.info("Sample re-weighting...")
weights = self.sample_reweight(loss_curve, loss_values, k + 1)
if self.enable_fs:
self.logger.info("Feature selection...")
features = self.feature_selection(df_train, loss_values)
def train_submodel(self, df_train, df_valid, weights, features):
dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features)
evals_result = dict()
model = lgb.train(
self.params,
dtrain,
num_boost_round=self.epochs,
valid_sets=[dtrain, dvalid],
valid_names=["train", "valid"],
verbose_eval=20,
evals_result=evals_result,
)
evals_result["train"] = list(evals_result["train"].values())[0]
evals_result["valid"] = list(evals_result["valid"].values())[0]
return model
def _prepare_data_gbm(self, df_train, df_valid, weights, features):
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights)
dvalid = lgb.Dataset(x_valid.values, label=y_valid)
return dtrain, dvalid
def sample_reweight(self, loss_curve, loss_values, k_th):
"""
the SR module of Double Ensemble
:param loss_curve: the shape is NxT
the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample
after the t-th iteration in the training of the previous sub-model.
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:param k_th: the index of the current sub-model, starting from 1
:return: weights
the weights for all the samples.
"""
# normalize loss_curve and loss_values with ranking
loss_curve_norm = loss_curve.rank(axis=0, pct=True)
loss_values_norm = (-loss_values).rank(pct=True)
# calculate l_start and l_end from loss_curve
N, T = loss_curve.shape
part = np.maximum(int(T * 0.1), 1)
l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
# calculate h-value for each sample
h1 = loss_values_norm
h2 = (l_end / l_start).rank(pct=True)
h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
# calculate weights
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
h_avg = h.groupby("bins")["h_value"].mean()
weights = pd.Series(np.zeros(N, dtype=float))
for i_b, b in enumerate(h_avg.index):
weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1)
return weights
def feature_selection(self, df_train, loss_values):
"""
the FS module of Double Ensemble
:param df_train: the shape is NxF
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:return: res_feat: in the form of pandas.Index
"""
x_train, y_train = df_train["feature"], df_train["label"]
features = x_train.columns
N, F = x_train.shape
g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
M = len(self.ensemble)
# shuffle specific columns and calculate g-value for each feature
x_train_tmp = x_train.copy()
for i_f, feat in enumerate(features):
x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
for i_s, submodel in enumerate(self.ensemble):
pred += (
pd.Series(
submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
)
/ M
)
loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7)
x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
# one column in train features is all-nan # if g['g_value'].isna().any()
g["g_value"].replace(np.nan, 0, inplace=True)
# divide features into bins_fs bins
g["bins"] = pd.cut(g["g_value"], self.bins_fs)
# randomly sample features from bins to construct the new features
res_feat = []
sorted_bins = sorted(g["bins"].unique(), reverse=True)
for i_b, b in enumerate(sorted_bins):
b_feat = features[g["bins"] == b]
num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
return pd.Index(res_feat)
def get_loss(self, label, pred):
if self.loss == "mse":
return (label - pred) ** 2
else:
raise ValueError("not implemented yet")
def retrieve_loss_curve(self, model, df_train, features):
if self.base_model == "gbm":
num_trees = model.num_trees()
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train = np.squeeze(y_train.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
N = x_train.shape[0]
loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
pred_tree = np.zeros(N, dtype=float)
for i_tree in range(num_trees):
pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
else:
raise ValueError("not implemented yet")
return loss_curve
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
if self.ensemble is None:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
for i_sub, submodel in enumerate(self.ensemble):
feat_sub = self.sub_features[i_sub]
pred += (
pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
* self.sub_weights[i_sub]
)
return pred
def predict_sub(self, submodel, df_data, features):
x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
return pred_sub
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import lightgbm as lgb
import numpy as np
import pandas as pd
from typing import Text, Union
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...model.interpret.base import FeatureInt
from ...log import get_module_logger
class DEnsembleModel(Model, FeatureInt):
"""Double Ensemble Model"""
def __init__(
self,
base_model="gbm",
loss="mse",
num_models=6,
enable_sr=True,
enable_fs=True,
alpha1=1.0,
alpha2=1.0,
bins_sr=10,
bins_fs=5,
decay=None,
sample_ratios=None,
sub_weights=None,
epochs=100,
**kwargs
):
self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm"
self.num_models = num_models # the number of sub-models
self.enable_sr = enable_sr
self.enable_fs = enable_fs
self.alpha1 = alpha1
self.alpha2 = alpha2
self.bins_sr = bins_sr
self.bins_fs = bins_fs
self.decay = decay
if sample_ratios is None: # the default values for sample_ratios
sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4]
if sub_weights is None: # the default values for sub_weights
sub_weights = [1.0, 0.2, 0.2, 0.2, 0.2, 0.2]
if not len(sample_ratios) == bins_fs:
raise ValueError("The length of sample_ratios should be equal to bins_fs.")
self.sample_ratios = sample_ratios
if not len(sub_weights) == num_models:
raise ValueError("The length of sub_weights should be equal to num_models.")
self.sub_weights = sub_weights
self.epochs = epochs
self.logger = get_module_logger("DEnsembleModel")
self.logger.info("Double Ensemble Model...")
self.ensemble = [] # the current ensemble model, a list contains all the sub-models
self.sub_features = [] # the features for each sub model in the form of pandas.Index
self.params = {"objective": loss}
self.params.update(kwargs)
self.loss = loss
def fit(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
x_train, y_train = df_train["feature"], df_train["label"]
# initialize the sample weights
N, F = x_train.shape
weights = pd.Series(np.ones(N, dtype=float))
# initialize the features
features = x_train.columns
pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
# train sub-models
for k in range(self.num_models):
self.sub_features.append(features)
self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
model_k = self.train_submodel(df_train, df_valid, weights, features)
self.ensemble.append(model_k)
# no further sample re-weight and feature selection needed for the last sub-model
if k + 1 == self.num_models:
break
self.logger.info("Retrieving loss curve and loss values...")
loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
pred_k = self.predict_sub(model_k, df_train, features)
pred_sub.iloc[:, k] = pred_k
pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))
if self.enable_sr:
self.logger.info("Sample re-weighting...")
weights = self.sample_reweight(loss_curve, loss_values, k + 1)
if self.enable_fs:
self.logger.info("Feature selection...")
features = self.feature_selection(df_train, loss_values)
def train_submodel(self, df_train, df_valid, weights, features):
dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features)
evals_result = dict()
model = lgb.train(
self.params,
dtrain,
num_boost_round=self.epochs,
valid_sets=[dtrain, dvalid],
valid_names=["train", "valid"],
verbose_eval=20,
evals_result=evals_result,
)
evals_result["train"] = list(evals_result["train"].values())[0]
evals_result["valid"] = list(evals_result["valid"].values())[0]
return model
def _prepare_data_gbm(self, df_train, df_valid, weights, features):
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
x_valid, y_valid = df_valid["feature"].loc[:, features], df_valid["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
dtrain = lgb.Dataset(x_train, label=y_train, weight=weights)
dvalid = lgb.Dataset(x_valid, label=y_valid)
return dtrain, dvalid
def sample_reweight(self, loss_curve, loss_values, k_th):
"""
the SR module of Double Ensemble
:param loss_curve: the shape is NxT
the loss curve for the previous sub-model, where the element (i, t) if the error on the i-th sample
after the t-th iteration in the training of the previous sub-model.
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:param k_th: the index of the current sub-model, starting from 1
:return: weights
the weights for all the samples.
"""
# normalize loss_curve and loss_values with ranking
loss_curve_norm = loss_curve.rank(axis=0, pct=True)
loss_values_norm = (-loss_values).rank(pct=True)
# calculate l_start and l_end from loss_curve
N, T = loss_curve.shape
part = np.maximum(int(T * 0.1), 1)
l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
# calculate h-value for each sample
h1 = loss_values_norm
h2 = (l_end / l_start).rank(pct=True)
h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2})
# calculate weights
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
h_avg = h.groupby("bins")["h_value"].mean()
weights = pd.Series(np.zeros(N, dtype=float))
for i_b, b in enumerate(h_avg.index):
weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1)
return weights
def feature_selection(self, df_train, loss_values):
"""
the FS module of Double Ensemble
:param df_train: the shape is NxF
:param loss_values: the shape is N
the loss of the current ensemble on the i-th sample.
:return: res_feat: in the form of pandas.Index
"""
x_train, y_train = df_train["feature"], df_train["label"]
features = x_train.columns
N, F = x_train.shape
g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)})
M = len(self.ensemble)
# shuffle specific columns and calculate g-value for each feature
x_train_tmp = x_train.copy()
for i_f, feat in enumerate(features):
x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
for i_s, submodel in enumerate(self.ensemble):
pred += (
pd.Series(
submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index
)
/ M
)
loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / (np.std(loss_feat - loss_values) + 1e-7)
x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
# one column in train features is all-nan # if g['g_value'].isna().any()
g["g_value"].replace(np.nan, 0, inplace=True)
# divide features into bins_fs bins
g["bins"] = pd.cut(g["g_value"], self.bins_fs)
# randomly sample features from bins to construct the new features
res_feat = []
sorted_bins = sorted(g["bins"].unique(), reverse=True)
for i_b, b in enumerate(sorted_bins):
b_feat = features[g["bins"] == b]
num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
res_feat = res_feat + np.random.choice(b_feat, size=num_feat, replace=False).tolist()
return pd.Index(set(res_feat))
def get_loss(self, label, pred):
if self.loss == "mse":
return (label - pred) ** 2
else:
raise ValueError("not implemented yet")
def retrieve_loss_curve(self, model, df_train, features):
if self.base_model == "gbm":
num_trees = model.num_trees()
x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
# Lightgbm need 1D array as its label
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
y_train = np.squeeze(y_train.values)
else:
raise ValueError("LightGBM doesn't support multi-label training")
N = x_train.shape[0]
loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
pred_tree = np.zeros(N, dtype=float)
for i_tree in range(num_trees):
pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
else:
raise ValueError("not implemented yet")
return loss_curve
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
if self.ensemble is None:
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
for i_sub, submodel in enumerate(self.ensemble):
feat_sub = self.sub_features[i_sub]
pred += (
pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index)
* self.sub_weights[i_sub]
)
return pred
def predict_sub(self, submodel, df_data, features):
x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
return pred_sub
def get_feature_importance(self, *args, **kwargs) -> pd.Series:
"""get feature importance
Notes
-----
parameters reference:
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance
"""
res = []
for _model, _weight in zip(self.ensemble, self.sub_weights):
res.append(pd.Series(_model.feature_importance(*args, **kwargs), index=_model.feature_name()) * _weight)
return pd.concat(res, axis=1, sort=False).sum(axis=1).sort_values(ascending=False)

View File

@@ -8,9 +8,10 @@ from typing import Text, Union
from ...model.base import ModelFT
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...model.interpret.base import LightGBMFInt
class LGBModel(ModelFT):
class LGBModel(ModelFT, LightGBMFInt):
"""LightGBM Model"""
def __init__(self, loss="mse", **kwargs):
@@ -33,8 +34,8 @@ class LGBModel(ModelFT):
else:
raise ValueError("LightGBM doesn't support multi-label training")
dtrain = lgb.Dataset(x_train.values, label=y_train)
dvalid = lgb.Dataset(x_valid.values, label=y_valid)
dtrain = lgb.Dataset(x_train, label=y_train)
dvalid = lgb.Dataset(x_valid, label=y_valid)
return dtrain, dvalid
def fit(

View File

@@ -1,17 +1,18 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from qlib.model.base import ModelFT
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
import warnings
from ...model.base import ModelFT
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...model.interpret.base import LightGBMFInt
class HFLGBModel(ModelFT):
class HFLGBModel(ModelFT, LightGBMFInt):
"""LightGBM Model for high frequency prediction"""
def __init__(self, loss="mse", **kwargs):
@@ -97,8 +98,8 @@ class HFLGBModel(ModelFT):
else:
raise ValueError("LightGBM doesn't support multi-label training")
dtrain = lgb.Dataset(x_train.values, label=y_train)
dvalid = lgb.Dataset(x_valid.values, label=y_valid)
dtrain = lgb.Dataset(x_train, label=y_train)
dvalid = lgb.Dataset(x_valid, label=y_valid)
return dtrain, dvalid
def fit(

View File

@@ -8,9 +8,10 @@ from typing import Text, Union
from ...model.base import Model
from ...data.dataset import DatasetH
from ...data.dataset.handler import DataHandlerLP
from ...model.interpret.base import FeatureInt
class XGBModel(Model):
class XGBModel(Model, FeatureInt):
"""XGBModel Model"""
def __init__(self, **kwargs):
@@ -42,8 +43,8 @@ class XGBModel(Model):
else:
raise ValueError("XGBoost doesn't support multi-label training")
dtrain = xgb.DMatrix(x_train.values, label=y_train_1d)
dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d)
dtrain = xgb.DMatrix(x_train, label=y_train_1d)
dvalid = xgb.DMatrix(x_valid, label=y_valid_1d)
self.model = xgb.train(
self._params,
dtrain=dtrain,
@@ -62,3 +63,13 @@ class XGBModel(Model):
raise ValueError("model is not fitted yet!")
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
return pd.Series(self.model.predict(xgb.DMatrix(x_test.values)), index=x_test.index)
def get_feature_importance(self, *args, **kwargs) -> pd.Series:
"""get feature importance
Notes
-------
parameters reference:
https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.get_score
"""
return pd.Series(self.model.get_score(*args, **kwargs)).sort_values(ascending=False)

View File

View File

@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
Interfaces to interpret models
"""
import pandas as pd
from abc import abstractmethod
class FeatureInt:
"""Feature (Int)erpreter"""
@abstractmethod
def get_feature_importance(self) -> pd.Series:
"""get feature importance
Returns
-------
The index is the feature name.
The greater the value, the higher importance.
"""
class LightGBMFInt(FeatureInt):
"""LightGBM (F)eature (Int)erpreter"""
def get_feature_importance(self, *args, **kwargs) -> pd.Series:
"""get feature importance
Notes
-----
parameters reference:
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html?highlight=feature_importance#lightgbm.Booster.feature_importance
"""
return pd.Series(self.model.feature_importance(*args, **kwargs), index=self.model.feature_name()).sort_values(
ascending=False
)

View File

@@ -1,6 +1,4 @@
import sys
import unittest
from ..utils import exists_qlib_data
from .data import GetData
from .. import init
from ..config import REG_CN
@@ -14,14 +12,13 @@ class TestAutoData(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
# use default data
if not exists_qlib_data(cls.provider_uri):
print(f"Qlib data is not found in {cls.provider_uri}")
GetData().qlib_data(
name="qlib_data_simple",
region="cn",
interval="1d",
target_dir=cls.provider_uri,
delete_old=False,
)
GetData().qlib_data(
name="qlib_data_simple",
region=REG_CN,
interval="1d",
target_dir=cls.provider_uri,
delete_old=False,
exists_skip=True,
)
init(provider_uri=cls.provider_uri, region=REG_CN, **cls._setup_kwargs)

108
qlib/tests/config.py Normal file
View File

@@ -0,0 +1,108 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
CSI300_MARKET = "csi300"
CSI100_MARKET = "csi100"
CSI300_BENCH = "SH000300"
DATASET_ALPHA158_CLASS = "Alpha158"
DATASET_ALPHA360_CLASS = "Alpha360"
###################################
# config
###################################
GBDT_MODEL = {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
"kwargs": {
"loss": "mse",
"colsample_bytree": 0.8879,
"learning_rate": 0.0421,
"subsample": 0.8789,
"lambda_l1": 205.6999,
"lambda_l2": 580.9768,
"max_depth": 8,
"num_leaves": 210,
"num_threads": 20,
},
}
RECORD_CONFIG = [
{
"class": "SignalRecord",
"module_path": "qlib.workflow.record_temp",
},
{
"class": "SigAnaRecord",
"module_path": "qlib.workflow.record_temp",
},
]
def get_data_handler_config(market=CSI300_MARKET):
return {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": market,
}
def get_dataset_config(market=CSI300_MARKET, dataset_class=DATASET_ALPHA158_CLASS):
return {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": dataset_class,
"module_path": "qlib.contrib.data.handler",
"kwargs": get_data_handler_config(market),
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
}
def get_gbdt_task(market=CSI300_MARKET):
return {
"model": GBDT_MODEL,
"dataset": get_dataset_config(market),
}
def get_record_lgb_config(market=CSI300_MARKET):
return {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
},
"dataset": get_dataset_config(market),
"record": RECORD_CONFIG,
}
def get_record_xgboost_config(market=CSI300_MARKET):
return {
"model": {
"class": "XGBModel",
"module_path": "qlib.contrib.model.xgboost",
},
"dataset": get_dataset_config(market),
"record": RECORD_CONFIG,
}
CSI300_DATASET_CONFIG = get_dataset_config(market=CSI300_MARKET)
CSI300_GBDT_TASK = get_gbdt_task(market=CSI300_MARKET)
CSI100_RECORD_XGBOOST_TASK_CONFIG = get_record_xgboost_config(market=CSI100_MARKET)
CSI100_RECORD_LGB_TASK_CONFIG = get_record_lgb_config(market=CSI100_MARKET)

View File

@@ -10,6 +10,7 @@ import datetime
from tqdm import tqdm
from pathlib import Path
from loguru import logger
from qlib.utils import exists_qlib_data
class GetData:
@@ -112,6 +113,7 @@ class GetData:
interval="1d",
region="cn",
delete_old=True,
exists_skip=False,
):
"""download cn qlib data from remote
@@ -129,6 +131,8 @@ class GetData:
data region, value from [cn, us], by default cn
delete_old: bool
delete an existing directory, by default True
exists_skip: bool
exists skip, by default False
Examples
---------
@@ -140,6 +144,13 @@ class GetData:
-------
"""
if exists_skip and exists_qlib_data(target_dir):
logger.warning(
f"Data already exists: {target_dir}, the data download will be skipped\n"
f"\tIf downloading is required: `exists_skip=False` or `change target_dir`"
)
return
qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__))
def _get_file_name(v):

View File

@@ -5,5 +5,4 @@ numpy
pandas
tqdm
lxml
loguru
yahooquery

View File

@@ -1,26 +1,10 @@
import sys
from pathlib import Path
import qlib
from qlib.data import D
from qlib.config import REG_CN
import unittest
import numpy as np
from qlib.utils import exists_qlib_data
from qlib.data import D
from qlib.tests import TestAutoData
class TestDataset(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data(name="qlib_data_simple", target_dir=provider_uri)
qlib.init(provider_uri=provider_uri, region=REG_CN)
class TestDataset(TestAutoData):
def testCSI300(self):
close_p = D.features(D.instruments("csi300"), ["$close"])
size = close_p.groupby("datetime").size()

View File

@@ -12,55 +12,7 @@ from qlib.utils import init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, SigAnaRecord, PortAnaRecord
from qlib.tests import TestAutoData
market = "csi300"
benchmark = "SH000300"
###################################
# train model
###################################
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": market,
}
task = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
"kwargs": {
"loss": "mse",
"colsample_bytree": 0.8879,
"learning_rate": 0.0421,
"subsample": 0.8789,
"lambda_l1": 205.6999,
"lambda_l2": 580.9768,
"max_depth": 8,
"num_leaves": 210,
"num_threads": 20,
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
},
}
from qlib.tests.config import CSI300_GBDT_TASK, CSI300_BENCH
port_analysis_config = {
"strategy": {
@@ -75,7 +27,7 @@ port_analysis_config = {
"verbose": False,
"limit_threshold": 0.095,
"account": 100000000,
"benchmark": benchmark,
"benchmark": CSI300_BENCH,
"deal_price": "close",
"open_cost": 0.0005,
"close_cost": 0.0015,
@@ -96,15 +48,15 @@ def train():
"""
# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model = init_instance_by_config(CSI300_GBDT_TASK["model"])
dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
# To test __repr__
print(dataset)
print(R)
# start exp
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
model.fit(dataset)
# prediction
@@ -137,12 +89,12 @@ def train_with_sigana():
performance: dict
model performance
"""
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model = init_instance_by_config(CSI300_GBDT_TASK["model"])
dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
# start exp
with R.start(experiment_name="workflow_with_sigana"):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
model.fit(dataset)
# predict and calculate ic and ric
@@ -171,7 +123,7 @@ def fake_experiment():
default_uri = R.get_uri()
current_uri = "file:./temp-test-exp-mag"
with R.start(experiment_name="fake_workflow_for_expm", uri=current_uri):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
current_uri_to_check = R.get_uri()
default_uri_to_check = R.get_uri()

View File

@@ -1,73 +1,22 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import shutil
import unittest
from pathlib import Path
import qlib
from qlib.config import C
from qlib.contrib.workflow import MultiSegRecord, SignalMseRecord
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.tests import TestAutoData
market = "csi300"
benchmark = "SH000300"
###################################
# train model
###################################
data_handler_config = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": market,
}
task = {
"model": {
"class": "LGBModel",
"module_path": "qlib.contrib.model.gbdt",
"kwargs": {
"loss": "mse",
"colsample_bytree": 0.8879,
"learning_rate": 0.0421,
"subsample": 0.8789,
"lambda_l1": 205.6999,
"lambda_l2": 580.9768,
"max_depth": 8,
"num_leaves": 210,
"num_threads": 20,
},
},
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
"handler": {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": data_handler_config,
},
"segments": {
"train": ("2008-01-01", "2014-12-31"),
"valid": ("2015-01-01", "2016-12-31"),
"test": ("2017-01-01", "2020-08-01"),
},
},
},
}
from qlib.tests.config import CSI300_GBDT_TASK
def train_multiseg():
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model = init_instance_by_config(CSI300_GBDT_TASK["model"])
dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
model.fit(dataset)
recorder = R.get_recorder()
sr = MultiSegRecord(model, dataset, recorder)
@@ -77,10 +26,10 @@ def train_multiseg():
def train_mse():
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])
model = init_instance_by_config(CSI300_GBDT_TASK["model"])
dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"])
with R.start(experiment_name="workflow"):
R.log_params(**flatten_dict(task))
R.log_params(**flatten_dict(CSI300_GBDT_TASK))
model.fit(dataset)
recorder = R.get_recorder()
sr = SignalMseRecord(recorder, model=model, dataset=dataset)

View File

@@ -1,16 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import shutil
import unittest
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
import qlib
from qlib.data import D
from qlib.tests.data import GetData
DATA_DIR = Path(__file__).parent.joinpath("test_get_data")
SOURCE_DIR = DATA_DIR.joinpath("source")
@@ -37,7 +34,9 @@ class TestGetData(unittest.TestCase):
def test_0_qlib_data(self):
GetData().qlib_data(name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False)
GetData().qlib_data(
name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False, exists_skip=True
)
df = D.features(D.instruments("csi300"), self.FIELDS)
self.assertListEqual(list(df.columns), self.FIELDS, "get qlib data failed")
self.assertFalse(df.dropna().empty, "get qlib data failed")

View File

@@ -1,17 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import sys
import unittest
import numpy as np
import qlib
from qlib.data import D
from qlib.data.ops import ElemOperator, PairOperator
from qlib.config import REG_CN
from qlib.utils import exists_qlib_data
from qlib.tests import TestAutoData
from qlib.tests.data import GetData
class Diff(ElemOperator):