1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-07-03 02:50:58 +08:00
Files
qlib/qlib/contrib/meta/data_selection/model.py
2022-03-25 16:56:02 +08:00

179 lines
6.2 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from tqdm.auto import tqdm
import copy
from typing import Union, List
from ....model.meta.dataset import MetaTaskDataset
from ....model.meta.model import MetaTaskModel
from ....workflow import R
from .utils import ICLoss
from .dataset import MetaDatasetDS
from qlib.log import get_module_logger
from qlib.model.meta.task import MetaTask
from qlib.data.dataset.weight import Reweighter
from qlib.contrib.meta.data_selection.net import PredNet
# Module-level logger; TimeReweighter.reweight uses it to report the computed sample weights.
logger = get_module_logger("data selection")
class TimeReweighter(Reweighter):
    """Reweighter that assigns sample weights by time segment."""

    def __init__(self, time_weight: pd.Series):
        """
        Parameters
        ----------
        time_weight : pd.Series
            Maps a segment key to a weight. Each key is unpacked into ``slice(*key)``
            when selecting rows, so it is presumably a ``(start, end)`` pair -- confirm
            against the caller that builds this series.
        """
        self.time_weight = time_weight

    def reweight(self, data: Union[pd.DataFrame, pd.Series]):
        """
        Build a weight series aligned with ``data.index``.

        Every row starts with weight 1.0; rows selected by a segment of ``time_weight``
        are overwritten with that segment's weight. Segments are applied in iteration
        order, so a later segment wins where selections overlap.

        Parameters
        ----------
        data : Union[pd.DataFrame, pd.Series]
            The data to be weighted; only its index is used.

        Returns
        -------
        pd.Series
            The weights, indexed like ``data``.
        """
        # TODO: handling TSDataSampler
        sample_weight = pd.Series(1.0, index=data.index)
        for segment, weight in self.time_weight.items():
            row_selector = slice(*segment)
            sample_weight.loc[row_selector] = weight
        logger.info(f"Reweighting result: {sample_weight}")
        return sample_weight
class MetaModelDS(MetaTaskModel):
    """
    The meta-model for meta-learning-based data selection.
    """

    def __init__(
        self,
        step,
        hist_step_n,
        clip_method="tanh",
        clip_weight=2.0,
        criterion="ic_loss",
        lr=0.0001,
        max_epoch=100,
        seed=43,
    ):
        """
        Parameters
        ----------
        step :
            Forwarded to ``PredNet`` as ``step``.
        hist_step_n :
            Forwarded to ``PredNet`` as ``hist_step_n``.
        clip_method : str
            Forwarded to ``PredNet`` as ``clip_method``.
        clip_weight : float
            Forwarded to ``PredNet`` as ``clip_weight``.
        criterion : str
            Loss used by ``run_epoch``; either ``"mse"`` or ``"ic_loss"``.
        lr : float
            Learning rate of the Adam optimizer created in ``fit``.
        max_epoch : int
            Number of training epochs run by ``fit``.
        seed : int
            Passed to ``torch.manual_seed``. NOTE: this seeds torch's global RNG.
        """
        self.step = step
        self.hist_step_n = hist_step_n
        self.clip_method = clip_method
        self.clip_weight = clip_weight
        self.criterion = criterion
        self.lr = lr
        self.max_epoch = max_epoch
        self.fitted = False
        torch.manual_seed(seed)

    def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
        """
        Run one pass over ``task_list`` and log the average loss and the rank IC.

        Parameters
        ----------
        phase : str
            Name of the phase. Only the exact value ``"train"`` enables gradients and
            optimizer updates; any other value (e.g. ``"test"``, ``"train_init"``) is
            evaluated without gradients. It is also used as the metric-name suffix.
        task_list : list
            Meta tasks; each must provide ``get_meta_input()``.
        epoch : int
            Logged as the ``step`` of the metrics.
        opt :
            Optimizer; only used when ``phase == "train"``.
        loss_l : dict
            The average loss of this pass is appended to ``loss_l[phase]``.
        ignore_weight : bool
            Forwarded to the network.

        Raises
        ------
        ValueError
            If ``self.criterion`` is neither ``"mse"`` nor ``"ic_loss"``.
        """
        is_train = phase == "train"

        # Build the loss module once per epoch instead of once per task, and fail
        # loudly on an unknown criterion instead of hitting an unbound `loss` later.
        if self.criterion == "mse":
            criterion = nn.MSELoss()
        elif self.criterion == "ic_loss":
            criterion = ICLoss()
        else:
            raise ValueError(f"Unsupported criterion: {self.criterion!r}")

        if is_train:
            self.tn.train()
        else:
            self.tn.eval()

        running_loss = 0.0
        pred_y_all = []
        # Use the context-manager form so that the global autograd mode is restored
        # afterwards; otherwise gradients stay disabled process-wide after a test phase.
        with torch.set_grad_enabled(is_train):
            for task in tqdm(task_list, desc=f"{phase} Task", leave=False):
                meta_input = task.get_meta_input()
                pred, _ = self.tn(
                    meta_input["X"],
                    meta_input["y"],
                    meta_input["time_perf"],
                    meta_input["time_belong"],
                    meta_input["X_test"],
                    ignore_weight=ignore_weight,
                )
                y_test = meta_input["y_test"]
                test_idx = meta_input["test_idx"]

                if self.criterion == "mse":
                    loss = criterion(pred, y_test)
                else:
                    try:
                        loss = criterion(pred, y_test, test_idx, skip_size=50)
                    except ValueError as e:
                        # The IC loss could not be computed for this task; skip the task.
                        get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
                        continue

                loss_value = loss.detach().item()
                assert not np.isnan(loss_value), "NaN loss!"

                if is_train:
                    opt.zero_grad()
                    loss.backward()
                    opt.step()

                pred_y_all.append(
                    pd.DataFrame(
                        {
                            "pred": pd.Series(pred.detach().cpu().numpy(), index=test_idx),
                            "label": pd.Series(y_test.detach().cpu().numpy(), index=test_idx),
                        }
                    )
                )
                running_loss += loss_value

        if not pred_y_all:
            # Either `task_list` is empty or every task was skipped. There is nothing to
            # average or concatenate, so skip the metrics instead of raising
            # ZeroDivisionError / "No objects to concatenate".
            get_module_logger("MetaModelDS").warning(f"No task was evaluated in phase `{phase}`; metrics are skipped")
            return

        # NOTE: skipped tasks still count in the denominator.
        running_loss = running_loss / len(task_list)
        loss_l.setdefault(phase, []).append(running_loss)

        pred_y_all = pd.concat(pred_y_all)
        # Assumes `test_idx` carries a level named "datetime" -- TODO confirm against MetaTask.
        ic = pred_y_all.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")).mean()

        R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
        R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})

    def fit(self, meta_dataset: MetaDatasetDS):
        """
        The meta-learning-based data selection interacts directly with meta-dataset due to the closed-form proxy measurement.

        Parameters
        ----------
        meta_dataset : MetaDatasetDS
            The meta-model takes the meta-dataset for its training process.
        """
        if not self.fitted:
            # A tuple (not a set) keeps the logging order deterministic.
            for k in ("lr", "step", "hist_step_n", "clip_method", "clip_weight", "criterion", "max_epoch"):
                R.log_params(**{k: getattr(self, k)})

        # FIXME: get test tasks for just checking the performance
        phases = ["train", "test"]
        meta_tasks_l = meta_dataset.prepare_tasks(phases)

        if len(meta_tasks_l[1]):
            R.log_params(
                **dict(proxy_test_begin=meta_tasks_l[1][0].task["dataset"]["kwargs"]["segments"]["test"])
            )  # debug: record when the test phase starts

        self.tn = PredNet(
            step=self.step, hist_step_n=self.hist_step_n, clip_weight=self.clip_weight, clip_method=self.clip_method
        )

        opt = optim.Adam(self.tn.parameters(), lr=self.lr)

        # Baselines before training: once ignoring the weights, once with the initial weights.
        # The phase names differ from "train", so no parameter is updated here.
        for phase, task_list in zip(phases, meta_tasks_l):
            self.run_epoch(f"{phase}_noweight", task_list, 0, opt, {}, ignore_weight=True)
            self.run_epoch(f"{phase}_init", task_list, 0, opt, {})

        # run training
        loss_l = {}
        for epoch in tqdm(range(self.max_epoch), desc="epoch"):
            for phase, task_list in zip(phases, meta_tasks_l):
                self.run_epoch(phase, task_list, epoch, opt, loss_l)
            # Saved inside the epoch loop, as in upstream: checkpoints the network every epoch.
            R.save_objects(**{"model.pkl": self.tn})
        self.fitted = True

    def _prepare_task(self, task: MetaTask) -> dict:
        """
        Return a copy of the task config of ``task`` with a ``"reweighter"`` entry attached.

        The weights come from ``self.tn.twm`` applied to the task's ``"time_perf"`` meta input
        and are indexed by ``task.meta_info.columns``.
        """
        meta_ipt = task.get_meta_input()
        weights = self.tn.twm(meta_ipt["time_perf"])

        weight_s = pd.Series(weights.detach().cpu().numpy(), index=task.meta_info.columns)
        new_task = copy.copy(task.task)  # NOTE: this is a shallow copy.
        new_task["reweighter"] = TimeReweighter(weight_s)
        return new_task

    def inference(self, meta_dataset: MetaTaskDataset) -> List[dict]:
        """
        Attach a reweighter to every test task of ``meta_dataset``.

        Parameters
        ----------
        meta_dataset : MetaTaskDataset
            Its ``"test"`` tasks are processed.

        Returns
        -------
        List[dict]
            One task config per test task, each with a ``"reweighter"`` entry.
        """
        return [self._prepare_task(mt) for mt in meta_dataset.prepare_tasks("test")]