mirror of
https://github.com/microsoft/qlib.git
synced 2026-07-04 19:41:00 +08:00
@@ -1,5 +1,6 @@
|
|||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
@@ -35,6 +36,10 @@ class DDGDABench(DDGDA):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
GetData().qlib_data(exists_skip=True)
|
kwargs = {}
|
||||||
auto_init()
|
if os.environ.get("PROVIDER_URI", "") == "":
|
||||||
|
GetData().qlib_data(exists_skip=True)
|
||||||
|
else:
|
||||||
|
kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
|
||||||
|
auto_init(**kwargs)
|
||||||
fire.Fire(DDGDABench)
|
fire.Fire(DDGDABench)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
@@ -31,6 +32,10 @@ class RollingBenchmark(Rolling):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
GetData().qlib_data(exists_skip=True)
|
kwargs = {}
|
||||||
auto_init()
|
if os.environ.get("PROVIDER_URI", "") == "":
|
||||||
|
GetData().qlib_data(exists_skip=True)
|
||||||
|
else:
|
||||||
|
kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
|
||||||
|
auto_init(**kwargs)
|
||||||
fire.Fire(RollingBenchmark)
|
fire.Fire(RollingBenchmark)
|
||||||
|
|||||||
@@ -243,7 +243,7 @@ class MetaDatasetDS(MetaTaskDataset):
|
|||||||
trunc_days: int = None,
|
trunc_days: int = None,
|
||||||
rolling_ext_days: int = 0,
|
rolling_ext_days: int = 0,
|
||||||
exp_name: Union[str, InternalData],
|
exp_name: Union[str, InternalData],
|
||||||
segments: Union[Dict[Text, Tuple], float],
|
segments: Union[Dict[Text, Tuple], float, str],
|
||||||
hist_step_n: int = 10,
|
hist_step_n: int = 10,
|
||||||
task_mode: str = MetaTask.PROC_MODE_FULL,
|
task_mode: str = MetaTask.PROC_MODE_FULL,
|
||||||
fill_method: str = "max",
|
fill_method: str = "max",
|
||||||
@@ -271,12 +271,16 @@ class MetaDatasetDS(MetaTaskDataset):
|
|||||||
- str: the name of the experiment to store the performance of data
|
- str: the name of the experiment to store the performance of data
|
||||||
- InternalData: a prepared internal data
|
- InternalData: a prepared internal data
|
||||||
segments: Union[Dict[Text, Tuple], float]
|
segments: Union[Dict[Text, Tuple], float]
|
||||||
the segments to divide data
|
if the segment is a Dict
|
||||||
both left and right
|
the segments to divide data
|
||||||
|
both left and right are included
|
||||||
if segments is a float:
|
if segments is a float:
|
||||||
the float represents the percentage of data for training
|
the float represents the percentage of data for training
|
||||||
|
if segments is a string:
|
||||||
|
it will try its best to put its data in training and ensure that the date `segments` is in the test set
|
||||||
hist_step_n: int
|
hist_step_n: int
|
||||||
length of historical steps for the meta infomation
|
length of historical steps for the meta infomation
|
||||||
|
Number of steps of the data similarity information
|
||||||
task_mode : str
|
task_mode : str
|
||||||
Please refer to the docs of MetaTask
|
Please refer to the docs of MetaTask
|
||||||
"""
|
"""
|
||||||
@@ -383,10 +387,30 @@ class MetaDatasetDS(MetaTaskDataset):
|
|||||||
if isinstance(self.segments, float):
|
if isinstance(self.segments, float):
|
||||||
train_task_n = int(len(self.meta_task_l) * self.segments)
|
train_task_n = int(len(self.meta_task_l) * self.segments)
|
||||||
if segment == "train":
|
if segment == "train":
|
||||||
return self.meta_task_l[:train_task_n]
|
train_tasks = self.meta_task_l[:train_task_n]
|
||||||
|
get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
|
||||||
|
return train_tasks
|
||||||
elif segment == "test":
|
elif segment == "test":
|
||||||
return self.meta_task_l[train_task_n:]
|
test_tasks = self.meta_task_l[train_task_n:]
|
||||||
|
get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
|
||||||
|
return test_tasks
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"This type of input is not supported")
|
raise NotImplementedError(f"This type of input is not supported")
|
||||||
|
elif isinstance(self.segments, str):
|
||||||
|
train_tasks = []
|
||||||
|
test_tasks = []
|
||||||
|
for t in self.meta_task_l:
|
||||||
|
test_end = t.task["dataset"]["kwargs"]["segments"]["test"][1]
|
||||||
|
if test_end is None or pd.Timestamp(test_end) < pd.Timestamp(self.segments):
|
||||||
|
train_tasks.append(t)
|
||||||
|
else:
|
||||||
|
test_tasks.append(t)
|
||||||
|
get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
|
||||||
|
get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
|
||||||
|
if segment == "train":
|
||||||
|
return train_tasks
|
||||||
|
elif segment == "test":
|
||||||
|
return test_tasks
|
||||||
|
raise NotImplementedError(f"This type of input is not supported")
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"This type of input is not supported")
|
raise NotImplementedError(f"This type of input is not supported")
|
||||||
|
|||||||
@@ -53,7 +53,12 @@ class MetaModelDS(MetaTaskModel):
|
|||||||
max_epoch=100,
|
max_epoch=100,
|
||||||
seed=43,
|
seed=43,
|
||||||
alpha=0.0,
|
alpha=0.0,
|
||||||
|
loss_skip_thresh=50,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
loss_skip_size: int
|
||||||
|
The number of threshold to skip the loss calculation for each day.
|
||||||
|
"""
|
||||||
self.step = step
|
self.step = step
|
||||||
self.hist_step_n = hist_step_n
|
self.hist_step_n = hist_step_n
|
||||||
self.clip_method = clip_method
|
self.clip_method = clip_method
|
||||||
@@ -63,6 +68,7 @@ class MetaModelDS(MetaTaskModel):
|
|||||||
self.max_epoch = max_epoch
|
self.max_epoch = max_epoch
|
||||||
self.fitted = False
|
self.fitted = False
|
||||||
self.alpha = alpha
|
self.alpha = alpha
|
||||||
|
self.loss_skip_thresh = loss_skip_thresh
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
|
|
||||||
def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
|
def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
|
||||||
@@ -88,12 +94,14 @@ class MetaModelDS(MetaTaskModel):
|
|||||||
criterion = nn.MSELoss()
|
criterion = nn.MSELoss()
|
||||||
loss = criterion(pred, meta_input["y_test"])
|
loss = criterion(pred, meta_input["y_test"])
|
||||||
elif self.criterion == "ic_loss":
|
elif self.criterion == "ic_loss":
|
||||||
criterion = ICLoss()
|
criterion = ICLoss(self.loss_skip_thresh)
|
||||||
try:
|
try:
|
||||||
loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50)
|
loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"])
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
|
get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown criterion: {self.criterion}")
|
||||||
|
|
||||||
assert not np.isnan(loss.detach().item()), "NaN loss!"
|
assert not np.isnan(loss.detach().item()), "NaN loss!"
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,11 @@ from qlib.log import get_module_logger
|
|||||||
|
|
||||||
|
|
||||||
class ICLoss(nn.Module):
|
class ICLoss(nn.Module):
|
||||||
def forward(self, pred, y, idx, skip_size=50):
|
def __init__(self, skip_size=50):
|
||||||
|
super().__init__()
|
||||||
|
self.skip_size = skip_size
|
||||||
|
|
||||||
|
def forward(self, pred, y, idx):
|
||||||
"""forward.
|
"""forward.
|
||||||
FIXME:
|
FIXME:
|
||||||
- Some times it will be a slightly different from the result from `pandas.corr()`
|
- Some times it will be a slightly different from the result from `pandas.corr()`
|
||||||
@@ -33,7 +37,7 @@ class ICLoss(nn.Module):
|
|||||||
skip_n = 0
|
skip_n = 0
|
||||||
for start_i, end_i in zip(diff_point, diff_point[1:]):
|
for start_i, end_i in zip(diff_point, diff_point[1:]):
|
||||||
pred_focus = pred[start_i:end_i] # TODO: just for fake
|
pred_focus = pred[start_i:end_i] # TODO: just for fake
|
||||||
if pred_focus.shape[0] < skip_size:
|
if pred_focus.shape[0] < self.skip_size:
|
||||||
# skip some days which have very small amount of stock.
|
# skip some days which have very small amount of stock.
|
||||||
skip_n += 1
|
skip_n += 1
|
||||||
continue
|
continue
|
||||||
@@ -50,6 +54,7 @@ class ICLoss(nn.Module):
|
|||||||
)
|
)
|
||||||
ic_all += ic_day
|
ic_all += ic_day
|
||||||
if len(diff_point) - 1 - skip_n <= 0:
|
if len(diff_point) - 1 - skip_n <= 0:
|
||||||
|
__import__("ipdb").set_trace()
|
||||||
raise ValueError("No enough data for calculating IC")
|
raise ValueError("No enough data for calculating IC")
|
||||||
if skip_n > 0:
|
if skip_n > 0:
|
||||||
get_module_logger("ICLoss").info(
|
get_module_logger("ICLoss").info(
|
||||||
|
|||||||
@@ -63,6 +63,7 @@ class LinearModel(Model):
|
|||||||
df_train = pd.concat([df_train, df_valid])
|
df_train = pd.concat([df_train, df_valid])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
get_module_logger("LinearModel").info("include_valid=True, but valid does not exist")
|
get_module_logger("LinearModel").info("include_valid=True, but valid does not exist")
|
||||||
|
df_train = df_train.dropna()
|
||||||
if df_train.empty:
|
if df_train.empty:
|
||||||
raise ValueError("Empty data from dataset, please check your dataset config.")
|
raise ValueError("Empty data from dataset, please check your dataset config.")
|
||||||
if reweighter is not None:
|
if reweighter is not None:
|
||||||
|
|||||||
@@ -1,25 +1,25 @@
|
|||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
|
|
||||||
|
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
import copy
|
||||||
|
from typing import Text, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from typing import Text, Union
|
|
||||||
import copy
|
|
||||||
from ...utils import get_or_create_path
|
|
||||||
from ...log import get_module_logger
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.optim as optim
|
import torch.optim as optim
|
||||||
|
|
||||||
from .pytorch_utils import count_parameters
|
from qlib.workflow import R
|
||||||
from ...model.base import Model
|
|
||||||
from ...data.dataset import DatasetH
|
from ...data.dataset import DatasetH
|
||||||
from ...data.dataset.handler import DataHandlerLP
|
from ...data.dataset.handler import DataHandlerLP
|
||||||
|
from ...log import get_module_logger
|
||||||
|
from ...model.base import Model
|
||||||
|
from ...utils import get_or_create_path
|
||||||
|
from .pytorch_utils import count_parameters
|
||||||
|
|
||||||
|
|
||||||
class GRU(Model):
|
class GRU(Model):
|
||||||
@@ -212,16 +212,31 @@ class GRU(Model):
|
|||||||
evals_result=dict(),
|
evals_result=dict(),
|
||||||
save_path=None,
|
save_path=None,
|
||||||
):
|
):
|
||||||
df_train, df_valid, df_test = dataset.prepare(
|
# prepare training and validation data
|
||||||
["train", "valid", "test"],
|
dfs = {
|
||||||
col_set=["feature", "label"],
|
k: dataset.prepare(
|
||||||
data_key=DataHandlerLP.DK_L,
|
k,
|
||||||
)
|
col_set=["feature", "label"],
|
||||||
if df_train.empty or df_valid.empty:
|
data_key=DataHandlerLP.DK_L,
|
||||||
raise ValueError("Empty data from dataset, please check your dataset config.")
|
)
|
||||||
|
for k in ["train", "valid"]
|
||||||
|
if k in dataset.segments
|
||||||
|
}
|
||||||
|
df_train, df_valid = dfs.get("train", pd.DataFrame()), dfs.get("valid", pd.DataFrame())
|
||||||
|
|
||||||
|
# check if training data is empty
|
||||||
|
if df_train.empty:
|
||||||
|
raise ValueError("Empty training data from dataset, please check your dataset config.")
|
||||||
|
|
||||||
|
df_train = df_train.dropna()
|
||||||
x_train, y_train = df_train["feature"], df_train["label"]
|
x_train, y_train = df_train["feature"], df_train["label"]
|
||||||
x_valid, y_valid = df_valid["feature"], df_valid["label"]
|
|
||||||
|
# check if validation data is provided
|
||||||
|
if not df_valid.empty:
|
||||||
|
df_valid = df_valid.dropna()
|
||||||
|
x_valid, y_valid = df_valid["feature"], df_valid["label"]
|
||||||
|
else:
|
||||||
|
x_valid, y_valid = None, None
|
||||||
|
|
||||||
save_path = get_or_create_path(save_path)
|
save_path = get_or_create_path(save_path)
|
||||||
stop_steps = 0
|
stop_steps = 0
|
||||||
@@ -235,32 +250,42 @@ class GRU(Model):
|
|||||||
self.logger.info("training...")
|
self.logger.info("training...")
|
||||||
self.fitted = True
|
self.fitted = True
|
||||||
|
|
||||||
|
best_param = copy.deepcopy(self.gru_model.state_dict())
|
||||||
for step in range(self.n_epochs):
|
for step in range(self.n_epochs):
|
||||||
self.logger.info("Epoch%d:", step)
|
self.logger.info("Epoch%d:", step)
|
||||||
self.logger.info("training...")
|
self.logger.info("training...")
|
||||||
self.train_epoch(x_train, y_train)
|
self.train_epoch(x_train, y_train)
|
||||||
self.logger.info("evaluating...")
|
self.logger.info("evaluating...")
|
||||||
train_loss, train_score = self.test_epoch(x_train, y_train)
|
train_loss, train_score = self.test_epoch(x_train, y_train)
|
||||||
val_loss, val_score = self.test_epoch(x_valid, y_valid)
|
|
||||||
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
|
|
||||||
evals_result["train"].append(train_score)
|
evals_result["train"].append(train_score)
|
||||||
evals_result["valid"].append(val_score)
|
|
||||||
|
|
||||||
if val_score > best_score:
|
# evaluate on validation data if provided
|
||||||
best_score = val_score
|
if x_valid is not None and y_valid is not None:
|
||||||
stop_steps = 0
|
val_loss, val_score = self.test_epoch(x_valid, y_valid)
|
||||||
best_epoch = step
|
self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
|
||||||
best_param = copy.deepcopy(self.gru_model.state_dict())
|
evals_result["valid"].append(val_score)
|
||||||
else:
|
|
||||||
stop_steps += 1
|
if val_score > best_score:
|
||||||
if stop_steps >= self.early_stop:
|
best_score = val_score
|
||||||
self.logger.info("early stop")
|
stop_steps = 0
|
||||||
break
|
best_epoch = step
|
||||||
|
best_param = copy.deepcopy(self.gru_model.state_dict())
|
||||||
|
else:
|
||||||
|
stop_steps += 1
|
||||||
|
if stop_steps >= self.early_stop:
|
||||||
|
self.logger.info("early stop")
|
||||||
|
break
|
||||||
|
|
||||||
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
|
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
|
||||||
self.gru_model.load_state_dict(best_param)
|
self.gru_model.load_state_dict(best_param)
|
||||||
torch.save(best_param, save_path)
|
torch.save(best_param, save_path)
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
rec = R.get_recorder()
|
||||||
|
for k, v_l in evals_result.items():
|
||||||
|
for i, v in enumerate(v_l):
|
||||||
|
rec.log_metrics(step=i, **{k: v})
|
||||||
|
|
||||||
if self.use_gpu:
|
if self.use_gpu:
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
@@ -292,6 +317,7 @@ class GRU(Model):
|
|||||||
|
|
||||||
|
|
||||||
class GRUModel(nn.Module):
|
class GRUModel(nn.Module):
|
||||||
|
|
||||||
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
|
def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,17 @@
|
|||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
|
"""
|
||||||
|
Here we have a comprehensive set of analysis classes.
|
||||||
|
|
||||||
|
Here is an example.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from qlib.contrib.report.data.ana import FeaMeanStd
|
||||||
|
fa = FeaMeanStd(ret_df)
|
||||||
|
fa.plot_all(wspace=0.3, sub_figsize=(12, 3), col_n=5)
|
||||||
|
|
||||||
|
"""
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from qlib.contrib.report.data.base import FeaAnalyser
|
from qlib.contrib.report.data.base import FeaAnalyser
|
||||||
@@ -152,6 +164,7 @@ class FeaSkewTurt(NumFeaAnalyser):
|
|||||||
self._kurt[col].plot(ax=right_ax, label="kurt", color="green")
|
self._kurt[col].plot(ax=right_ax, label="kurt", color="green")
|
||||||
right_ax.set_xlabel("")
|
right_ax.set_xlabel("")
|
||||||
right_ax.set_ylabel("kurt")
|
right_ax.set_ylabel("kurt")
|
||||||
|
right_ax.grid(None) # set the grid to None to avoid two layer of grid
|
||||||
|
|
||||||
h1, l1 = ax.get_legend_handles_labels()
|
h1, l1 = ax.get_legend_handles_labels()
|
||||||
h2, l2 = right_ax.get_legend_handles_labels()
|
h2, l2 = right_ax.get_legend_handles_labels()
|
||||||
@@ -171,12 +184,15 @@ class FeaMeanStd(NumFeaAnalyser):
|
|||||||
ax.set_xlabel("")
|
ax.set_xlabel("")
|
||||||
ax.set_ylabel("mean")
|
ax.set_ylabel("mean")
|
||||||
ax.legend()
|
ax.legend()
|
||||||
|
ax.tick_params(axis="x", rotation=90)
|
||||||
|
|
||||||
right_ax = ax.twinx()
|
right_ax = ax.twinx()
|
||||||
|
|
||||||
self._std[col].plot(ax=right_ax, label="std", color="green")
|
self._std[col].plot(ax=right_ax, label="std", color="green")
|
||||||
right_ax.set_xlabel("")
|
right_ax.set_xlabel("")
|
||||||
right_ax.set_ylabel("std")
|
right_ax.set_ylabel("std")
|
||||||
|
right_ax.tick_params(axis="x", rotation=90)
|
||||||
|
right_ax.grid(None) # set the grid to None to avoid two layer of grid
|
||||||
|
|
||||||
h1, l1 = ax.get_legend_handles_labels()
|
h1, l1 = ax.get_legend_handles_labels()
|
||||||
h2, l2 = right_ax.get_legend_handles_labels()
|
h2, l2 = right_ax.get_legend_handles_labels()
|
||||||
|
|||||||
@@ -14,6 +14,24 @@ from qlib.contrib.report.utils import sub_fig_generator
|
|||||||
|
|
||||||
class FeaAnalyser:
|
class FeaAnalyser:
|
||||||
def __init__(self, dataset: pd.DataFrame):
|
def __init__(self, dataset: pd.DataFrame):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : pd.DataFrame
|
||||||
|
|
||||||
|
We often have multiple columns for dataset. Each column corresponds to one sub figure.
|
||||||
|
There will be a datatime column in the index levels.
|
||||||
|
Aggretation will be used for more summarized metrics overtime.
|
||||||
|
Here is an example of data:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
return
|
||||||
|
datetime instrument
|
||||||
|
2007-02-06 equity_tpx 0.010087
|
||||||
|
equity_spx 0.000786
|
||||||
|
"""
|
||||||
self._dataset = dataset
|
self._dataset = dataset
|
||||||
with TimeInspector.logt("calc_stat_values"):
|
with TimeInspector.logt("calc_stat_values"):
|
||||||
self.calc_stat_values()
|
self.calc_stat_values()
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def sub_fig_generator(sub_fs=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None, sharex=False, sharey=False):
|
def sub_fig_generator(sub_figsize=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None, sharex=False, sharey=False):
|
||||||
"""sub_fig_generator.
|
"""sub_fig_generator.
|
||||||
it will return a generator, each row contains <col_n> sub graph
|
it will return a generator, each row contains <col_n> sub graph
|
||||||
|
|
||||||
@@ -13,7 +13,7 @@ def sub_fig_generator(sub_fs=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None
|
|||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
sub_fs :
|
sub_figsize :
|
||||||
the figure size of each subgraph in <col_n> * <row_n> subgraphs
|
the figure size of each subgraph in <col_n> * <row_n> subgraphs
|
||||||
col_n :
|
col_n :
|
||||||
the number of subgraph in each row; It will generating a new graph after generating <col_n> of subgraphs.
|
the number of subgraph in each row; It will generating a new graph after generating <col_n> of subgraphs.
|
||||||
@@ -33,7 +33,7 @@ def sub_fig_generator(sub_fs=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
fig, axes = plt.subplots(
|
fig, axes = plt.subplots(
|
||||||
row_n, col_n, figsize=(sub_fs[0] * col_n, sub_fs[1] * row_n), sharex=sharex, sharey=sharey
|
row_n, col_n, figsize=(sub_figsize[0] * col_n, sub_figsize[1] * row_n), sharex=sharex, sharey=sharey
|
||||||
)
|
)
|
||||||
plt.subplots_adjust(wspace=wspace, hspace=hspace)
|
plt.subplots_adjust(wspace=wspace, hspace=hspace)
|
||||||
axes = axes.reshape(row_n, col_n)
|
axes = axes.reshape(row_n, col_n)
|
||||||
|
|||||||
@@ -73,8 +73,8 @@ class Rolling:
|
|||||||
The horizon of the prediction target.
|
The horizon of the prediction target.
|
||||||
This is used to override the prediction horizon of the file.
|
This is used to override the prediction horizon of the file.
|
||||||
h_path : Optional[str]
|
h_path : Optional[str]
|
||||||
the dumped data handler;
|
It is other data source that is dumped as a handler. It will override the data handler section in the config.
|
||||||
It may come from other data source. It will override the data handler in the config.
|
If it is not given, it will create a customized cache for the handler when `enable_handler_cache=True`
|
||||||
test_end : Optional[str]
|
test_end : Optional[str]
|
||||||
the test end for the data. It is typically used together with the handler
|
the test end for the data. It is typically used together with the handler
|
||||||
You can do the same thing with task_ext_conf in a more complicated way
|
You can do the same thing with task_ext_conf in a more complicated way
|
||||||
@@ -119,7 +119,7 @@ class Rolling:
|
|||||||
with self.conf_path.open("r") as f:
|
with self.conf_path.open("r") as f:
|
||||||
return yaml.safe_load(f)
|
return yaml.safe_load(f)
|
||||||
|
|
||||||
def _replace_hanler_with_cache(self, task: dict):
|
def _replace_handler_with_cache(self, task: dict):
|
||||||
"""
|
"""
|
||||||
Due to the data processing part in original rolling is slow. So we have to
|
Due to the data processing part in original rolling is slow. So we have to
|
||||||
This class tries to add more feature
|
This class tries to add more feature
|
||||||
@@ -159,13 +159,20 @@ class Rolling:
|
|||||||
# - get horizon automatically from the expression!!!!
|
# - get horizon automatically from the expression!!!!
|
||||||
raise NotImplementedError(f"This type of input is not supported")
|
raise NotImplementedError(f"This type of input is not supported")
|
||||||
else:
|
else:
|
||||||
self.logger.info("The prediction horizon is overrided")
|
if enable_handler_cache and self.h_path is not None:
|
||||||
task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
|
self.logger.info("Fail to override the horizon due to data handler cache")
|
||||||
"Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
|
else:
|
||||||
]
|
self.logger.info("The prediction horizon is overrided")
|
||||||
|
if isinstance(task["dataset"]["kwargs"]["handler"], dict):
|
||||||
|
task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
|
||||||
|
"Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
self.logger.warning("Try to automatically configure the lablel but failed.")
|
||||||
|
|
||||||
if enable_handler_cache:
|
if self.h_path is not None or enable_handler_cache:
|
||||||
task = self._replace_hanler_with_cache(task)
|
# if we already have provided data source or we want to create one
|
||||||
|
task = self._replace_handler_with_cache(task)
|
||||||
task = self._update_start_end_time(task)
|
task = self._update_start_end_time(task)
|
||||||
|
|
||||||
if self.task_ext_conf is not None:
|
if self.task_ext_conf is not None:
|
||||||
@@ -173,6 +180,16 @@ class Rolling:
|
|||||||
self.logger.info(task)
|
self.logger.info(task)
|
||||||
return task
|
return task
|
||||||
|
|
||||||
|
def run_basic_task(self):
|
||||||
|
"""
|
||||||
|
Run the basic task without rolling.
|
||||||
|
This is for fast testing for model tunning.
|
||||||
|
"""
|
||||||
|
task = self.basic_task()
|
||||||
|
print(task)
|
||||||
|
trainer = TrainerR(experiment_name=self.exp_name)
|
||||||
|
trainer([task])
|
||||||
|
|
||||||
def get_task_list(self) -> List[dict]:
|
def get_task_list(self) -> List[dict]:
|
||||||
"""return a batch of tasks for rolling."""
|
"""return a batch of tasks for rolling."""
|
||||||
task = self.basic_task()
|
task = self.basic_task()
|
||||||
|
|||||||
@@ -80,6 +80,11 @@ class DDGDA(Rolling):
|
|||||||
sim_task_model: UTIL_MODEL_TYPE = "gbdt",
|
sim_task_model: UTIL_MODEL_TYPE = "gbdt",
|
||||||
meta_1st_train_end: Optional[str] = None,
|
meta_1st_train_end: Optional[str] = None,
|
||||||
alpha: float = 0.01,
|
alpha: float = 0.01,
|
||||||
|
loss_skip_thresh: int = 50,
|
||||||
|
fea_imp_n: Optional[int] = 30,
|
||||||
|
meta_data_proc: Optional[str] = "V01",
|
||||||
|
segments: Union[float, str] = 0.62,
|
||||||
|
hist_step_n: int = 30,
|
||||||
working_dir: Optional[Union[str, Path]] = None,
|
working_dir: Optional[Union[str, Path]] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
@@ -94,6 +99,15 @@ class DDGDA(Rolling):
|
|||||||
alpha: float
|
alpha: float
|
||||||
Setting the L2 regularization for ridge
|
Setting the L2 regularization for ridge
|
||||||
The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..)
|
The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..)
|
||||||
|
loss_skip_thresh: int
|
||||||
|
The thresh to skip the loss calculation for each day. If the number of item is less than it, it will skip the loss on that day.
|
||||||
|
meta_data_proc : Optional[str]
|
||||||
|
How we process the meta dataset for learning meta model.
|
||||||
|
segments : Union[float, str]
|
||||||
|
if segments is a float:
|
||||||
|
The ratio of training data in the meta task dataset
|
||||||
|
if segments is a string:
|
||||||
|
it will try its best to put its data in training and ensure that the date `segments` is in the test set
|
||||||
"""
|
"""
|
||||||
# NOTE:
|
# NOTE:
|
||||||
# the horizon must match the meaning in the base task template
|
# the horizon must match the meaning in the base task template
|
||||||
@@ -104,14 +118,22 @@ class DDGDA(Rolling):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir)
|
self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir)
|
||||||
self.proxy_hd = self.working_dir / "handler_proxy.pkl"
|
self.proxy_hd = self.working_dir / "handler_proxy.pkl"
|
||||||
|
self.fea_imp_n = fea_imp_n
|
||||||
|
self.meta_data_proc = meta_data_proc
|
||||||
|
self.loss_skip_thresh = loss_skip_thresh
|
||||||
|
self.segments = segments
|
||||||
|
self.hist_step_n = hist_step_n
|
||||||
|
|
||||||
def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE):
|
def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE):
|
||||||
"""
|
"""
|
||||||
some task are use for special purpose.
|
Base on the original task, we need to do some extra things.
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
- GBDT for calculating feature importance
|
- GBDT for calculating feature importance
|
||||||
- Linear or GBDT for calculating similarity
|
- Linear or GBDT for calculating similarity
|
||||||
- Datset (well processed) that aligned to Linear that for meta learning
|
- Datset (well processed) that aligned to Linear that for meta learning
|
||||||
|
|
||||||
|
So we may need to change the dataset and model for the special purpose and other settings remains the same.
|
||||||
"""
|
"""
|
||||||
# NOTE: here is just for aligning with previous implementation
|
# NOTE: here is just for aligning with previous implementation
|
||||||
# It is not necessary for the current implementation
|
# It is not necessary for the current implementation
|
||||||
@@ -119,12 +141,16 @@ class DDGDA(Rolling):
|
|||||||
if astype == "gbdt":
|
if astype == "gbdt":
|
||||||
task["model"] = LGBM_MODEL
|
task["model"] = LGBM_MODEL
|
||||||
if isinstance(handler, dict):
|
if isinstance(handler, dict):
|
||||||
|
# We don't need preprocessing when using GBDT model
|
||||||
for k in ["infer_processors", "learn_processors"]:
|
for k in ["infer_processors", "learn_processors"]:
|
||||||
if k in handler.setdefault("kwargs", {}):
|
if k in handler.setdefault("kwargs", {}):
|
||||||
handler["kwargs"].pop(k)
|
handler["kwargs"].pop(k)
|
||||||
elif astype == "linear":
|
elif astype == "linear":
|
||||||
task["model"] = LINEAR_MODEL
|
task["model"] = LINEAR_MODEL
|
||||||
handler["kwargs"].update(PROC_ARGS)
|
if isinstance(handler, dict):
|
||||||
|
handler["kwargs"].update(PROC_ARGS)
|
||||||
|
else:
|
||||||
|
self.logger.warning("The handler can't be adjusted.")
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"astype not supported: {astype}")
|
raise ValueError(f"astype not supported: {astype}")
|
||||||
return task
|
return task
|
||||||
@@ -155,12 +181,15 @@ class DDGDA(Rolling):
|
|||||||
The meta model will be trained upon the proxy forecasting model.
|
The meta model will be trained upon the proxy forecasting model.
|
||||||
This dataset is for the proxy forecasting model.
|
This dataset is for the proxy forecasting model.
|
||||||
"""
|
"""
|
||||||
topk = 30
|
|
||||||
fi = self._get_feature_importance()
|
|
||||||
col_selected = fi.nlargest(topk)
|
|
||||||
# NOTE: adjusting to `self.sim_task_model` just for aligning with previous implementation.
|
# NOTE: adjusting to `self.sim_task_model` just for aligning with previous implementation.
|
||||||
|
# In previous version. The data for proxy model is using sim_task_model's way for processing
|
||||||
task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model)
|
task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model)
|
||||||
task = replace_task_handler_with_cache(task, self.working_dir)
|
task = replace_task_handler_with_cache(task, self.working_dir)
|
||||||
|
# if self.meta_data_proc is not None:
|
||||||
|
# else:
|
||||||
|
# # Otherwise, we don't need futher processing
|
||||||
|
# task = self.basic_task()
|
||||||
|
|
||||||
dataset = init_instance_by_config(task["dataset"])
|
dataset = init_instance_by_config(task["dataset"])
|
||||||
prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
|
||||||
@@ -168,12 +197,18 @@ class DDGDA(Rolling):
|
|||||||
feature_df = prep_ds["feature"]
|
feature_df = prep_ds["feature"]
|
||||||
label_df = prep_ds["label"]
|
label_df = prep_ds["label"]
|
||||||
|
|
||||||
feature_selected = feature_df.loc[:, col_selected.index]
|
if self.fea_imp_n is not None:
|
||||||
|
fi = self._get_feature_importance()
|
||||||
|
col_selected = fi.nlargest(self.fea_imp_n)
|
||||||
|
feature_selected = feature_df.loc[:, col_selected.index]
|
||||||
|
else:
|
||||||
|
feature_selected = feature_df
|
||||||
|
|
||||||
feature_selected = feature_selected.groupby("datetime", group_keys=False).apply(
|
if self.meta_data_proc == "V01":
|
||||||
lambda df: (df - df.mean()).div(df.std())
|
feature_selected = feature_selected.groupby("datetime", group_keys=False).apply(
|
||||||
)
|
lambda df: (df - df.mean()).div(df.std())
|
||||||
feature_selected = feature_selected.fillna(0.0)
|
)
|
||||||
|
feature_selected = feature_selected.fillna(0.0)
|
||||||
|
|
||||||
df_all = {
|
df_all = {
|
||||||
"label": label_df.reindex(feature_selected.index),
|
"label": label_df.reindex(feature_selected.index),
|
||||||
@@ -223,7 +258,10 @@ class DDGDA(Rolling):
|
|||||||
# 1) leverage the simplified proxy forecasting model to train meta model.
|
# 1) leverage the simplified proxy forecasting model to train meta model.
|
||||||
# - Only the dataset part is important, in current version of meta model will integrate the
|
# - Only the dataset part is important, in current version of meta model will integrate the
|
||||||
|
|
||||||
# the train_start for training meta model does not necessarily align with final rolling
|
# NOTE:
|
||||||
|
# - The train_start for training meta model does not necessarily align with final rolling
|
||||||
|
# But please select a right time to make sure the finnal rolling tasks are not leaked in the training data.
|
||||||
|
# - The test_start is automatically aligned to the next day of test_end. Validation is ignored.
|
||||||
train_start = "2008-01-01" if self.train_start is None else self.train_start
|
train_start = "2008-01-01" if self.train_start is None else self.train_start
|
||||||
train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end
|
train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end
|
||||||
test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
|
test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
|
||||||
@@ -249,9 +287,9 @@ class DDGDA(Rolling):
|
|||||||
kwargs = dict(
|
kwargs = dict(
|
||||||
task_tpl=proxy_forecast_model_task,
|
task_tpl=proxy_forecast_model_task,
|
||||||
step=self.step,
|
step=self.step,
|
||||||
segments=0.62, # keep test period consistent with the dataset yaml
|
segments=self.segments, # keep test period consistent with the dataset yaml
|
||||||
trunc_days=1 + self.horizon,
|
trunc_days=1 + self.horizon,
|
||||||
hist_step_n=30,
|
hist_step_n=self.hist_step_n,
|
||||||
fill_method=fill_method,
|
fill_method=fill_method,
|
||||||
rolling_ext_days=0,
|
rolling_ext_days=0,
|
||||||
)
|
)
|
||||||
@@ -268,7 +306,13 @@ class DDGDA(Rolling):
|
|||||||
with R.start(experiment_name=self.meta_exp_name):
|
with R.start(experiment_name=self.meta_exp_name):
|
||||||
R.log_params(**kwargs)
|
R.log_params(**kwargs)
|
||||||
mm = MetaModelDS(
|
mm = MetaModelDS(
|
||||||
step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha
|
step=self.step,
|
||||||
|
hist_step_n=kwargs["hist_step_n"],
|
||||||
|
lr=0.001,
|
||||||
|
max_epoch=30,
|
||||||
|
seed=43,
|
||||||
|
alpha=self.alpha,
|
||||||
|
loss_skip_thresh=self.loss_skip_thresh,
|
||||||
)
|
)
|
||||||
mm.fit(md)
|
mm.fit(md)
|
||||||
R.save_objects(model=mm)
|
R.save_objects(model=mm)
|
||||||
|
|||||||
@@ -51,3 +51,6 @@ class MetaTask:
|
|||||||
Return the **processed** meta_info
|
Return the **processed** meta_info
|
||||||
"""
|
"""
|
||||||
return self.meta_info
|
return self.meta_info
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"MetaTask(task={self.task}, meta_info={self.meta_info})"
|
||||||
|
|||||||
@@ -161,7 +161,13 @@ def init_instance_by_config(
|
|||||||
# path like 'file:///<path to pickle file>/obj.pkl'
|
# path like 'file:///<path to pickle file>/obj.pkl'
|
||||||
pr = urlparse(config)
|
pr = urlparse(config)
|
||||||
if pr.scheme == "file":
|
if pr.scheme == "file":
|
||||||
pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc
|
|
||||||
|
# To enable relative path like file://data/a/b/c.pkl. pr.netloc will be data
|
||||||
|
path = pr.path
|
||||||
|
if pr.netloc != "":
|
||||||
|
path = path.lstrip("/")
|
||||||
|
|
||||||
|
pr_path = os.path.join(pr.netloc, path) if bool(pr.path) else pr.netloc
|
||||||
with open(os.path.normpath(pr_path), "rb") as f:
|
with open(os.path.normpath(pr_path), "rb") as f:
|
||||||
return pickle.load(f)
|
return pickle.load(f)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -1,18 +1,20 @@
|
|||||||
# Copyright (c) Microsoft Corporation.
|
# Copyright (c) Microsoft Corporation.
|
||||||
# Licensed under the MIT License.
|
# Licensed under the MIT License.
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import fire
|
||||||
|
from jinja2 import Template, meta
|
||||||
|
import ruamel.yaml as yaml
|
||||||
|
|
||||||
import qlib
|
import qlib
|
||||||
import fire
|
|
||||||
import ruamel.yaml as yaml
|
|
||||||
from qlib.config import C
|
from qlib.config import C
|
||||||
from qlib.model.trainer import task_train
|
|
||||||
from qlib.utils.data import update_config
|
|
||||||
from qlib.log import get_module_logger
|
from qlib.log import get_module_logger
|
||||||
|
from qlib.model.trainer import task_train
|
||||||
from qlib.utils import set_log_with_config
|
from qlib.utils import set_log_with_config
|
||||||
|
from qlib.utils.data import update_config
|
||||||
|
|
||||||
set_log_with_config(C.logging_config)
|
set_log_with_config(C.logging_config)
|
||||||
logger = get_module_logger("qrun", logging.INFO)
|
logger = get_module_logger("qrun", logging.INFO)
|
||||||
@@ -47,6 +49,39 @@ def sys_config(config, config_path):
|
|||||||
sys.path.append(str(Path(config_path).parent.resolve().absolute() / p))
|
sys.path.append(str(Path(config_path).parent.resolve().absolute() / p))
|
||||||
|
|
||||||
|
|
||||||
|
def render_template(config_path: str) -> str:
|
||||||
|
"""
|
||||||
|
render the template based on the environment
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
config_path : str
|
||||||
|
configuration path
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
the rendered content
|
||||||
|
"""
|
||||||
|
with open(config_path, "r") as f:
|
||||||
|
config = f.read()
|
||||||
|
# Set up the Jinja2 environment
|
||||||
|
template = Template(config)
|
||||||
|
|
||||||
|
# Parse the template to find undeclared variables
|
||||||
|
env = template.environment
|
||||||
|
parsed_content = env.parse(config)
|
||||||
|
variables = meta.find_undeclared_variables(parsed_content)
|
||||||
|
|
||||||
|
# Get context from os.environ according to the variables
|
||||||
|
context = {var: os.getenv(var, "") for var in variables if var in os.environ}
|
||||||
|
logger.info(f"Render the template with the context: {context}")
|
||||||
|
|
||||||
|
# Render the template with the context
|
||||||
|
rendered_content = template.render(context)
|
||||||
|
return rendered_content
|
||||||
|
|
||||||
|
|
||||||
# workflow handler function
|
# workflow handler function
|
||||||
def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
|
def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
|
||||||
"""
|
"""
|
||||||
@@ -67,8 +102,9 @@ def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"):
|
|||||||
market: csi300
|
market: csi300
|
||||||
|
|
||||||
"""
|
"""
|
||||||
with open(config_path) as fp:
|
# Render the template
|
||||||
config = yaml.safe_load(fp)
|
rendered_yaml = render_template(config_path)
|
||||||
|
config = yaml.safe_load(rendered_yaml)
|
||||||
|
|
||||||
base_config_path = config.get("BASE_CONFIG_PATH", None)
|
base_config_path = config.get("BASE_CONFIG_PATH", None)
|
||||||
if base_config_path:
|
if base_config_path:
|
||||||
|
|||||||
Reference in New Issue
Block a user