mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
Add AdaRNN baseline. (#689)
* Update TCTS. * Update TCTS README. * Update TCTS README. * Update TCTS. * Add ADARNN. * Update README. * Reformat ADARNN. * Add README for adarnn. Co-authored-by: lewwang <lwwang@microsoft.com>
This commit is contained in:
@@ -11,6 +11,7 @@
|
||||
Recent released features
|
||||
| Feature | Status |
|
||||
| -- | ------ |
|
||||
| ADARNN model | [Released](https://github.com/microsoft/qlib/pull/689) on Nov 14, 2021 |
|
||||
| TCN model | [Released](https://github.com/microsoft/qlib/pull/668) on Nov 4, 2021 |
|
||||
|Temporal Routing Adaptor (TRA) | [Released](https://github.com/microsoft/qlib/pull/531) on July 30, 2021 |
|
||||
| Transformer & Localformer | [Released](https://github.com/microsoft/qlib/pull/508) on July 22, 2021 |
|
||||
@@ -296,6 +297,7 @@ Here is a list of models built on `Qlib`.
|
||||
- [Localformer based on pytorch (Juyong Jiang, et al.)](examples/benchmarks/Localformer/)
|
||||
- [TRA based on pytorch (Hengxu, Dong, et al. KDD 2021)](examples/benchmarks/TRA/)
|
||||
- [TCN based on pytorch (Shaojie Bai, et al. 2018)](examples/benchmarks/TCN/)
|
||||
- [ADARNN based on pytorch (YunTao Du, et al. 2021)](examples/benchmarks/ADARNN/)
|
||||
|
||||
Your PR of new Quant models is highly welcomed.
|
||||
|
||||
|
||||
4
examples/benchmarks/ADARNN/README.md
Normal file
4
examples/benchmarks/ADARNN/README.md
Normal file
@@ -0,0 +1,4 @@
|
||||
# AdaRNN
|
||||
* Code: [https://github.com/jindongwang/transferlearning/tree/master/code/deep/adarnn](https://github.com/jindongwang/transferlearning/tree/master/code/deep/adarnn)
|
||||
* Paper: [AdaRNN: Adaptive Learning and Forecasting for Time Series](https://arxiv.org/pdf/2108.04443.pdf).
|
||||
|
||||
4
examples/benchmarks/ADARNN/requirements.txt
Normal file
4
examples/benchmarks/ADARNN/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
pandas==1.1.2
|
||||
numpy==1.17.4
|
||||
scikit_learn==0.23.2
|
||||
torch==1.7.0
|
||||
@@ -0,0 +1,88 @@
|
||||
qlib_init:
|
||||
provider_uri: "~/.qlib/qlib_data/cn_data"
|
||||
region: cn
|
||||
market: &market csi300
|
||||
benchmark: &benchmark SH000300
|
||||
data_handler_config: &data_handler_config
|
||||
start_time: 2008-01-01
|
||||
end_time: 2020-08-01
|
||||
fit_start_time: 2008-01-01
|
||||
fit_end_time: 2014-12-31
|
||||
instruments: *market
|
||||
infer_processors:
|
||||
- class: RobustZScoreNorm
|
||||
kwargs:
|
||||
fields_group: feature
|
||||
clip_outlier: true
|
||||
- class: Fillna
|
||||
kwargs:
|
||||
fields_group: feature
|
||||
learn_processors:
|
||||
- class: DropnaLabel
|
||||
- class: CSRankNorm
|
||||
kwargs:
|
||||
fields_group: label
|
||||
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
|
||||
port_analysis_config: &port_analysis_config
|
||||
strategy:
|
||||
class: TopkDropoutStrategy
|
||||
module_path: qlib.contrib.strategy
|
||||
kwargs:
|
||||
model: <MODEL>
|
||||
dataset: <DATASET>
|
||||
topk: 50
|
||||
n_drop: 5
|
||||
backtest:
|
||||
start_time: 2017-01-01
|
||||
end_time: 2020-08-01
|
||||
account: 100000000
|
||||
benchmark: *benchmark
|
||||
exchange_kwargs:
|
||||
limit_threshold: 0.095
|
||||
deal_price: close
|
||||
open_cost: 0.0005
|
||||
close_cost: 0.0015
|
||||
min_cost: 5
|
||||
task:
|
||||
model:
|
||||
class: ADARNN
|
||||
module_path: qlib.contrib.model.pytorch_adarnn
|
||||
kwargs:
|
||||
d_feat: 6
|
||||
hidden_size: 64
|
||||
num_layers: 2
|
||||
dropout: 0.0
|
||||
n_epochs: 200
|
||||
lr: 1e-3
|
||||
early_stop: 20
|
||||
batch_size: 800
|
||||
metric: loss
|
||||
loss: mse
|
||||
GPU: 0
|
||||
dataset:
|
||||
class: DatasetH
|
||||
module_path: qlib.data.dataset
|
||||
kwargs:
|
||||
handler:
|
||||
class: Alpha360
|
||||
module_path: qlib.contrib.data.handler
|
||||
kwargs: *data_handler_config
|
||||
segments:
|
||||
train: [2008-01-01, 2014-12-31]
|
||||
valid: [2015-01-01, 2016-12-31]
|
||||
test: [2017-01-01, 2020-08-01]
|
||||
record:
|
||||
- class: SignalRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
model: <MODEL>
|
||||
dataset: <DATASET>
|
||||
- class: SigAnaRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
ana_long_short: False
|
||||
ann_scaler: 252
|
||||
- class: PortAnaRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
config: *port_analysis_config
|
||||
@@ -21,6 +21,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
|
||||
|
||||
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|
||||
|------------------------------------------|-------------------------------------|-------------|-------------|-------------|-------------|-------------------|-------------------|--------------|
|
||||
| TCN(Shaojie Bai, et al.) | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 |
|
||||
| TabNet(Sercan O. Arik, et al.) | Alpha158 | 0.0204±0.01 | 0.1554±0.07 | 0.0333±0.00 | 0.2552±0.05 | 0.0227±0.04 | 0.3676±0.54 | -0.1089±0.08 |
|
||||
| Transformer(Ashish Vaswani, et al.) | Alpha158 | 0.0264±0.00 | 0.2053±0.02 | 0.0407±0.00 | 0.3273±0.02 | 0.0273±0.02 | 0.3970±0.26 | -0.1101±0.02 |
|
||||
| GRU(Kyunghyun Cho, et al.) | Alpha158(with selected 20 features) | 0.0315±0.00 | 0.2450±0.04 | 0.0428±0.00 | 0.3440±0.03 | 0.0344±0.02 | 0.5160±0.25 | -0.1017±0.02 |
|
||||
@@ -38,8 +39,6 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
|
||||
| MLP | Alpha158 | 0.0376±0.00 | 0.2846±0.02 | 0.0429±0.00 | 0.3220±0.01 | 0.0895±0.02 | 1.1408±0.23 | -0.1103±0.02 |
|
||||
| LightGBM(Guolin Ke, et al.) | Alpha158 | 0.0448±0.00 | 0.3660±0.00 | 0.0469±0.00 | 0.3877±0.00 | 0.0901±0.00 | 1.0164±0.00 | -0.1038±0.00 |
|
||||
| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4340±0.00 | 0.0523±0.00 | 0.4284±0.01 | 0.1168±0.01 | 1.3384±0.12 | -0.1036±0.01 |
|
||||
| TCN | Alpha158 | 0.0275±0.00 | 0.2157±0.01 | 0.0411±0.00 | 0.3379±0.01 | 0.0190±0.02 | 0.2887±0.27 | -0.1202±0.03 |
|
||||
|
||||
|
||||
|
||||
## Alpha360 dataset
|
||||
@@ -54,13 +53,14 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
|
||||
| XGBoost(Tianqi Chen, et al.) | Alpha360 | 0.0394±0.00 | 0.2909±0.00 | 0.0448±0.00 | 0.3679±0.00 | 0.0344±0.00 | 0.4527±0.02 | -0.1004±0.00 |
|
||||
| DoubleEnsemble(Chuheng Zhang, et al.) | Alpha360 | 0.0404±0.00 | 0.3023±0.00 | 0.0495±0.00 | 0.3898±0.00 | 0.0468±0.01 | 0.6302±0.20 | -0.0860±0.01 |
|
||||
| LightGBM(Guolin Ke, et al.) | Alpha360 | 0.0400±0.00 | 0.3037±0.00 | 0.0499±0.00 | 0.4042±0.00 | 0.0558±0.00 | 0.7632±0.00 | -0.0659±0.00 |
|
||||
| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 |
|
||||
| ALSTM (Yao Qin, et al.) | Alpha360 | 0.0497±0.00 | 0.3829±0.04 | 0.0599±0.00 | 0.4736±0.03 | 0.0626±0.02 | 0.8651±0.31 | -0.0994±0.03 |
|
||||
| LSTM(Sepp Hochreiter, et al.) | Alpha360 | 0.0448±0.00 | 0.3474±0.04 | 0.0549±0.00 | 0.4366±0.03 | 0.0647±0.03 | 0.8963±0.39 | -0.0875±0.02 |
|
||||
| GRU(Kyunghyun Cho, et al.) | Alpha360 | 0.0493±0.00 | 0.3772±0.04 | 0.0584±0.00 | 0.4638±0.03 | 0.0720±0.02 | 0.9730±0.33 | -0.0821±0.02 |
|
||||
| AdaRNN(Yuntao Du, et al.) | Alpha360 | 0.0464±0.01 | 0.3619±0.08 | 0.0539±0.01 | 0.4287±0.06 | 0.0753±0.03 | 1.0200±0.40 | -0.0936±0.03 |
|
||||
| GATs (Petar Velickovic, et al.) | Alpha360 | 0.0476±0.00 | 0.3508±0.02 | 0.0598±0.00 | 0.4604±0.01 | 0.0824±0.02 | 1.1079±0.26 | -0.0894±0.03 |
|
||||
| TCTS(Xueqing Wu, et al.) | Alpha360 | 0.0508±0.00 | 0.3931±0.04 | 0.0599±0.00 | 0.4756±0.03 | 0.0893±0.03 | 1.2256±0.36 | -0.0857±0.02 |
|
||||
| TRA(Hengxu Lin, et al.) | Alpha360 | 0.0485±0.00 | 0.3787±0.03 | 0.0587±0.00 | 0.4756±0.03 | 0.0920±0.03 | 1.2789±0.42 | -0.0834±0.02 |
|
||||
| TCN(Shaojie Bai, et al.) | Alpha360 | 0.0441±0.00 | 0.3301±0.02 | 0.0519±0.00 | 0.4130±0.01 | 0.0604±0.02 | 0.8295±0.34 | -0.1018±0.03 |
|
||||
|
||||
- The selected 20 features are based on the feature importance of a lightgbm-based model.
|
||||
- The base model of DoubleEnsemble is LGBM.
|
||||
|
||||
@@ -95,4 +95,4 @@ task:
|
||||
- class: PortAnaRecord
|
||||
module_path: qlib.workflow.record_temp
|
||||
kwargs:
|
||||
config: *port_analysis_config
|
||||
config: *port_analysis_config
|
||||
789
qlib/contrib/model/pytorch_adarnn.py
Normal file
789
qlib/contrib/model/pytorch_adarnn.py
Normal file
@@ -0,0 +1,789 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
import os
|
||||
from pdb import set_trace
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
import copy
|
||||
from typing import Text, Union
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torch.autograd import Function
|
||||
from qlib.contrib.model.pytorch_utils import count_parameters
|
||||
from qlib.data.dataset import DatasetH
|
||||
from qlib.data.dataset.handler import DataHandlerLP
|
||||
from qlib.log import get_module_logger
|
||||
from qlib.model.base import Model
|
||||
from qlib.utils import get_or_create_path
|
||||
|
||||
|
||||
class ADARNN(Model):
|
||||
"""ADARNN Model
|
||||
|
||||
Parameters
|
||||
----------
|
||||
d_feat : int
|
||||
input dimension for each time step
|
||||
metric: str
|
||||
the evaluate metric used in early stop
|
||||
optimizer : str
|
||||
optimizer name
|
||||
GPU : str
|
||||
the GPU ID(s) used for training
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_feat=6,
|
||||
hidden_size=64,
|
||||
num_layers=2,
|
||||
dropout=0.0,
|
||||
n_epochs=200,
|
||||
pre_epoch=40,
|
||||
dw=0.5,
|
||||
loss_type="cosine",
|
||||
len_seq=60,
|
||||
len_win=0,
|
||||
lr=0.001,
|
||||
metric="mse",
|
||||
batch_size=2000,
|
||||
early_stop=20,
|
||||
loss="mse",
|
||||
optimizer="adam",
|
||||
n_splits=2,
|
||||
GPU=0,
|
||||
seed=None,
|
||||
**kwargs
|
||||
):
|
||||
# Set logger.
|
||||
self.logger = get_module_logger("ADARNN")
|
||||
self.logger.info("ADARNN pytorch version...")
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPU)
|
||||
|
||||
# set hyper-parameters.
|
||||
self.d_feat = d_feat
|
||||
self.hidden_size = hidden_size
|
||||
self.num_layers = num_layers
|
||||
self.dropout = dropout
|
||||
self.n_epochs = n_epochs
|
||||
self.pre_epoch = pre_epoch
|
||||
self.dw = dw
|
||||
self.loss_type = loss_type
|
||||
self.len_seq = len_seq
|
||||
self.len_win = len_win
|
||||
self.lr = lr
|
||||
self.metric = metric
|
||||
self.batch_size = batch_size
|
||||
self.early_stop = early_stop
|
||||
self.optimizer = optimizer.lower()
|
||||
self.loss = loss
|
||||
self.n_splits = n_splits
|
||||
self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
|
||||
self.seed = seed
|
||||
|
||||
self.logger.info(
|
||||
"ADARNN parameters setting:"
|
||||
"\nd_feat : {}"
|
||||
"\nhidden_size : {}"
|
||||
"\nnum_layers : {}"
|
||||
"\ndropout : {}"
|
||||
"\nn_epochs : {}"
|
||||
"\nlr : {}"
|
||||
"\nmetric : {}"
|
||||
"\nbatch_size : {}"
|
||||
"\nearly_stop : {}"
|
||||
"\noptimizer : {}"
|
||||
"\nloss_type : {}"
|
||||
"\nvisible_GPU : {}"
|
||||
"\nuse_GPU : {}"
|
||||
"\nseed : {}".format(
|
||||
d_feat,
|
||||
hidden_size,
|
||||
num_layers,
|
||||
dropout,
|
||||
n_epochs,
|
||||
lr,
|
||||
metric,
|
||||
batch_size,
|
||||
early_stop,
|
||||
optimizer.lower(),
|
||||
loss,
|
||||
GPU,
|
||||
self.use_gpu,
|
||||
seed,
|
||||
)
|
||||
)
|
||||
|
||||
if self.seed is not None:
|
||||
np.random.seed(self.seed)
|
||||
torch.manual_seed(self.seed)
|
||||
|
||||
n_hiddens = [hidden_size for _ in range(num_layers)]
|
||||
self.model = AdaRNN(
|
||||
use_bottleneck=False,
|
||||
bottleneck_width=64,
|
||||
n_input=d_feat,
|
||||
n_hiddens=n_hiddens,
|
||||
n_output=1,
|
||||
dropout=dropout,
|
||||
model_type="AdaRNN",
|
||||
len_seq=len_seq,
|
||||
trans_loss=loss_type,
|
||||
)
|
||||
self.logger.info("model:\n{:}".format(self.model))
|
||||
self.logger.info("model size: {:.4f} MB".format(count_parameters(self.model)))
|
||||
|
||||
if optimizer.lower() == "adam":
|
||||
self.train_optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
|
||||
elif optimizer.lower() == "gd":
|
||||
self.train_optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
|
||||
else:
|
||||
raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
|
||||
|
||||
self.fitted = False
|
||||
self.model.cuda()
|
||||
|
||||
@property
|
||||
def use_gpu(self):
|
||||
return self.device != torch.device("cpu")
|
||||
|
||||
def train_AdaRNN(self, train_loader_list, epoch, dist_old=None, weight_mat=None):
|
||||
self.model.train()
|
||||
criterion = nn.MSELoss()
|
||||
dist_mat = torch.zeros(self.num_layers, self.len_seq).cuda()
|
||||
len_loader = np.inf
|
||||
for loader in train_loader_list:
|
||||
if len(loader) < len_loader:
|
||||
len_loader = len(loader)
|
||||
for data_all in zip(*train_loader_list):
|
||||
# for data_all in zip(*train_loader_list):
|
||||
self.train_optimizer.zero_grad()
|
||||
list_feat = []
|
||||
list_label = []
|
||||
for data in data_all:
|
||||
# feature :[36, 24, 6]
|
||||
feature, label_reg = data[0].cuda().float(), data[1].cuda().float()
|
||||
list_feat.append(feature)
|
||||
list_label.append(label_reg)
|
||||
flag = False
|
||||
index = get_index(len(data_all) - 1)
|
||||
for temp_index in index:
|
||||
s1 = temp_index[0]
|
||||
s2 = temp_index[1]
|
||||
if list_feat[s1].shape[0] != list_feat[s2].shape[0]:
|
||||
flag = True
|
||||
break
|
||||
if flag:
|
||||
continue
|
||||
|
||||
total_loss = torch.zeros(1).cuda()
|
||||
for i in range(len(index)):
|
||||
feature_s = list_feat[index[i][0]]
|
||||
feature_t = list_feat[index[i][1]]
|
||||
label_reg_s = list_label[index[i][0]]
|
||||
label_reg_t = list_label[index[i][1]]
|
||||
feature_all = torch.cat((feature_s, feature_t), 0)
|
||||
|
||||
if epoch < self.pre_epoch:
|
||||
pred_all, loss_transfer, out_weight_list = self.model.forward_pre_train(
|
||||
feature_all, len_win=self.len_win
|
||||
)
|
||||
else:
|
||||
pred_all, loss_transfer, dist, weight_mat = self.model.forward_Boosting(feature_all, weight_mat)
|
||||
dist_mat = dist_mat + dist
|
||||
pred_s = pred_all[0 : feature_s.size(0)]
|
||||
pred_t = pred_all[feature_s.size(0) :]
|
||||
|
||||
loss_s = criterion(pred_s, label_reg_s)
|
||||
loss_t = criterion(pred_t, label_reg_t)
|
||||
|
||||
total_loss = total_loss + loss_s + loss_t + self.dw * loss_transfer
|
||||
self.train_optimizer.zero_grad()
|
||||
total_loss.backward()
|
||||
torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0)
|
||||
self.train_optimizer.step()
|
||||
if epoch >= self.pre_epoch:
|
||||
if epoch > self.pre_epoch:
|
||||
weight_mat = self.model.update_weight_Boosting(weight_mat, dist_old, dist_mat)
|
||||
return weight_mat, dist_mat
|
||||
else:
|
||||
weight_mat = self.transform_type(out_weight_list)
|
||||
return weight_mat, None
|
||||
|
||||
def calc_all_metrics(self, pred):
|
||||
"""pred is a pandas dataframe that has two attributes: score (pred) and label (real)"""
|
||||
res = {}
|
||||
ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score))
|
||||
rank_ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score, method="spearman"))
|
||||
res["ic"] = ic.mean()
|
||||
res["icir"] = ic.mean() / ic.std()
|
||||
res["ric"] = rank_ic.mean()
|
||||
res["ricir"] = rank_ic.mean() / rank_ic.std()
|
||||
res["mse"] = -(pred["label"] - pred["score"]).mean()
|
||||
res["loss"] = res["mse"]
|
||||
return res
|
||||
|
||||
def test_epoch(self, df):
|
||||
self.model.eval()
|
||||
preds = self.infer(df["feature"])
|
||||
label = df["label"].squeeze()
|
||||
preds = pd.DataFrame({"label": label, "score": preds}, index=df.index)
|
||||
metrics = self.calc_all_metrics(preds)
|
||||
return metrics
|
||||
|
||||
def log_metrics(self, mode, metrics):
|
||||
metrics = ["{}/{}: {:.6f}".format(k, mode, v) for k, v in metrics.items()]
|
||||
metrics = ", ".join(metrics)
|
||||
self.logger.info(metrics)
|
||||
|
||||
def fit(
|
||||
self,
|
||||
dataset: DatasetH,
|
||||
evals_result=dict(),
|
||||
save_path=None,
|
||||
):
|
||||
|
||||
df_train, df_valid = dataset.prepare(
|
||||
["train", "valid"],
|
||||
col_set=["feature", "label"],
|
||||
data_key=DataHandlerLP.DK_L,
|
||||
)
|
||||
# splits = ['2011-06-30']
|
||||
days = df_train.index.get_level_values(level=0).unique()
|
||||
train_splits = np.array_split(days, self.n_splits)
|
||||
train_splits = [df_train[s[0] : s[-1]] for s in train_splits]
|
||||
train_loader_list = [get_stock_loader(df, self.batch_size) for df in train_splits]
|
||||
|
||||
save_path = get_or_create_path(save_path)
|
||||
stop_steps = 0
|
||||
best_score = -np.inf
|
||||
best_epoch = 0
|
||||
evals_result["train"] = []
|
||||
evals_result["valid"] = []
|
||||
|
||||
# train
|
||||
self.logger.info("training...")
|
||||
self.fitted = True
|
||||
best_score = -np.inf
|
||||
best_epoch = 0
|
||||
weight_mat, dist_mat = None, None
|
||||
|
||||
for step in range(self.n_epochs):
|
||||
self.logger.info("Epoch%d:", step)
|
||||
self.logger.info("training...")
|
||||
weight_mat, dist_mat = self.train_AdaRNN(train_loader_list, step, dist_mat, weight_mat)
|
||||
self.logger.info("evaluating...")
|
||||
train_metrics = self.test_epoch(df_train)
|
||||
valid_metrics = self.test_epoch(df_valid)
|
||||
self.log_metrics("train: ", train_metrics)
|
||||
self.log_metrics("valid: ", valid_metrics)
|
||||
|
||||
valid_score = valid_metrics[self.metric]
|
||||
train_score = train_metrics[self.metric]
|
||||
evals_result["train"].append(train_score)
|
||||
evals_result["valid"].append(valid_score)
|
||||
if valid_score > best_score:
|
||||
best_score = valid_score
|
||||
stop_steps = 0
|
||||
best_epoch = step
|
||||
best_param = copy.deepcopy(self.model.state_dict())
|
||||
else:
|
||||
stop_steps += 1
|
||||
if stop_steps >= self.early_stop:
|
||||
self.logger.info("early stop")
|
||||
break
|
||||
|
||||
self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
|
||||
self.model.load_state_dict(best_param)
|
||||
torch.save(best_param, save_path)
|
||||
|
||||
if self.use_gpu:
|
||||
torch.cuda.empty_cache()
|
||||
return best_score
|
||||
|
||||
def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
|
||||
if not self.fitted:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I)
|
||||
return self.infer(x_test)
|
||||
|
||||
def infer(self, x_test):
|
||||
index = x_test.index
|
||||
self.model.eval()
|
||||
x_values = x_test.values
|
||||
sample_num = x_values.shape[0]
|
||||
x_values = x_values.reshape(sample_num, self.d_feat, -1).transpose(0, 2, 1)
|
||||
preds = []
|
||||
|
||||
for begin in range(sample_num)[:: self.batch_size]:
|
||||
|
||||
if sample_num - begin < self.batch_size:
|
||||
end = sample_num
|
||||
else:
|
||||
end = begin + self.batch_size
|
||||
|
||||
x_batch = torch.from_numpy(x_values[begin:end]).float().cuda()
|
||||
|
||||
with torch.no_grad():
|
||||
pred = self.model.predict(x_batch).detach().cpu().numpy()
|
||||
|
||||
preds.append(pred)
|
||||
|
||||
return pd.Series(np.concatenate(preds), index=index)
|
||||
|
||||
def transform_type(self, init_weight):
|
||||
weight = torch.ones(self.num_layers, self.len_seq).cuda()
|
||||
for i in range(self.num_layers):
|
||||
for j in range(self.len_seq):
|
||||
weight[i, j] = init_weight[i][j].item()
|
||||
return weight
|
||||
|
||||
|
||||
class data_loader(Dataset):
|
||||
def __init__(self, df):
|
||||
self.df_feature = df["feature"]
|
||||
self.df_label_reg = df["label"]
|
||||
self.df_index = df.index
|
||||
self.df_feature = torch.tensor(
|
||||
self.df_feature.values.reshape(-1, 6, 60).transpose(0, 2, 1), dtype=torch.float32
|
||||
)
|
||||
self.df_label_reg = torch.tensor(self.df_label_reg.values.reshape(-1), dtype=torch.float32)
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample, label_reg = self.df_feature[index], self.df_label_reg[index]
|
||||
return sample, label_reg
|
||||
|
||||
def __len__(self):
|
||||
return len(self.df_feature)
|
||||
|
||||
|
||||
def get_stock_loader(df, batch_size, shuffle=True):
|
||||
train_loader = DataLoader(data_loader(df), batch_size=batch_size, shuffle=shuffle)
|
||||
return train_loader
|
||||
|
||||
|
||||
def get_index(num_domain=2):
|
||||
index = []
|
||||
for i in range(num_domain):
|
||||
for j in range(i + 1, num_domain + 1):
|
||||
index.append((i, j))
|
||||
return index
|
||||
|
||||
|
||||
class AdaRNN(nn.Module):
|
||||
"""
|
||||
model_type: 'Boosting', 'AdaRNN'
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
use_bottleneck=False,
|
||||
bottleneck_width=256,
|
||||
n_input=128,
|
||||
n_hiddens=[64, 64],
|
||||
n_output=6,
|
||||
dropout=0.0,
|
||||
len_seq=9,
|
||||
model_type="AdaRNN",
|
||||
trans_loss="mmd",
|
||||
):
|
||||
super(AdaRNN, self).__init__()
|
||||
self.use_bottleneck = use_bottleneck
|
||||
self.n_input = n_input
|
||||
self.num_layers = len(n_hiddens)
|
||||
self.hiddens = n_hiddens
|
||||
self.n_output = n_output
|
||||
self.model_type = model_type
|
||||
self.trans_loss = trans_loss
|
||||
self.len_seq = len_seq
|
||||
in_size = self.n_input
|
||||
|
||||
features = nn.ModuleList()
|
||||
for hidden in n_hiddens:
|
||||
rnn = nn.GRU(input_size=in_size, num_layers=1, hidden_size=hidden, batch_first=True, dropout=dropout)
|
||||
features.append(rnn)
|
||||
in_size = hidden
|
||||
self.features = nn.Sequential(*features)
|
||||
|
||||
if use_bottleneck == True: # finance
|
||||
self.bottleneck = nn.Sequential(
|
||||
nn.Linear(n_hiddens[-1], bottleneck_width),
|
||||
nn.Linear(bottleneck_width, bottleneck_width),
|
||||
nn.BatchNorm1d(bottleneck_width),
|
||||
nn.ReLU(),
|
||||
nn.Dropout(),
|
||||
)
|
||||
self.bottleneck[0].weight.data.normal_(0, 0.005)
|
||||
self.bottleneck[0].bias.data.fill_(0.1)
|
||||
self.bottleneck[1].weight.data.normal_(0, 0.005)
|
||||
self.bottleneck[1].bias.data.fill_(0.1)
|
||||
self.fc = nn.Linear(bottleneck_width, n_output)
|
||||
torch.nn.init.xavier_normal_(self.fc.weight)
|
||||
else:
|
||||
self.fc_out = nn.Linear(n_hiddens[-1], self.n_output)
|
||||
|
||||
if self.model_type == "AdaRNN":
|
||||
gate = nn.ModuleList()
|
||||
for i in range(len(n_hiddens)):
|
||||
gate_weight = nn.Linear(len_seq * self.hiddens[i] * 2, len_seq)
|
||||
gate.append(gate_weight)
|
||||
self.gate = gate
|
||||
|
||||
bnlst = nn.ModuleList()
|
||||
for i in range(len(n_hiddens)):
|
||||
bnlst.append(nn.BatchNorm1d(len_seq))
|
||||
self.bn_lst = bnlst
|
||||
self.softmax = torch.nn.Softmax(dim=0)
|
||||
self.init_layers()
|
||||
|
||||
def init_layers(self):
|
||||
for i in range(len(self.hiddens)):
|
||||
self.gate[i].weight.data.normal_(0, 0.05)
|
||||
self.gate[i].bias.data.fill_(0.0)
|
||||
|
||||
def forward_pre_train(self, x, len_win=0):
|
||||
out = self.gru_features(x)
|
||||
fea = out[0] # [2N,L,H]
|
||||
if self.use_bottleneck == True:
|
||||
fea_bottleneck = self.bottleneck(fea[:, -1, :])
|
||||
fc_out = self.fc(fea_bottleneck).squeeze()
|
||||
else:
|
||||
fc_out = self.fc_out(fea[:, -1, :]).squeeze() # [N,]
|
||||
|
||||
out_list_all, out_weight_list = out[1], out[2]
|
||||
out_list_s, out_list_t = self.get_features(out_list_all)
|
||||
loss_transfer = torch.zeros((1,)).cuda()
|
||||
for i in range(len(out_list_s)):
|
||||
criterion_transder = TransferLoss(loss_type=self.trans_loss, input_dim=out_list_s[i].shape[2])
|
||||
h_start = 0
|
||||
for j in range(h_start, self.len_seq, 1):
|
||||
i_start = j - len_win if j - len_win >= 0 else 0
|
||||
i_end = j + len_win if j + len_win < self.len_seq else self.len_seq - 1
|
||||
for k in range(i_start, i_end + 1):
|
||||
weight = (
|
||||
out_weight_list[i][j]
|
||||
if self.model_type == "AdaRNN"
|
||||
else 1 / (self.len_seq - h_start) * (2 * len_win + 1)
|
||||
)
|
||||
loss_transfer = loss_transfer + weight * criterion_transder.compute(
|
||||
out_list_s[i][:, j, :], out_list_t[i][:, k, :]
|
||||
)
|
||||
return fc_out, loss_transfer, out_weight_list
|
||||
|
||||
def gru_features(self, x, predict=False):
|
||||
x_input = x
|
||||
out = None
|
||||
out_lis = []
|
||||
out_weight_list = [] if (self.model_type == "AdaRNN") else None
|
||||
for i in range(self.num_layers):
|
||||
out, _ = self.features[i](x_input.float())
|
||||
x_input = out
|
||||
out_lis.append(out)
|
||||
if self.model_type == "AdaRNN" and predict == False:
|
||||
out_gate = self.process_gate_weight(x_input, i)
|
||||
out_weight_list.append(out_gate)
|
||||
return out, out_lis, out_weight_list
|
||||
|
||||
def process_gate_weight(self, out, index):
|
||||
x_s = out[0 : int(out.shape[0] // 2)]
|
||||
x_t = out[out.shape[0] // 2 : out.shape[0]]
|
||||
x_all = torch.cat((x_s, x_t), 2)
|
||||
x_all = x_all.view(x_all.shape[0], -1)
|
||||
weight = torch.sigmoid(self.bn_lst[index](self.gate[index](x_all.float())))
|
||||
weight = torch.mean(weight, dim=0)
|
||||
res = self.softmax(weight).squeeze()
|
||||
return res
|
||||
|
||||
def get_features(self, output_list):
|
||||
fea_list_src, fea_list_tar = [], []
|
||||
for fea in output_list:
|
||||
fea_list_src.append(fea[0 : fea.size(0) // 2])
|
||||
fea_list_tar.append(fea[fea.size(0) // 2 :])
|
||||
return fea_list_src, fea_list_tar
|
||||
|
||||
# For Boosting-based
|
||||
def forward_Boosting(self, x, weight_mat=None):
|
||||
out = self.gru_features(x)
|
||||
fea = out[0]
|
||||
if self.use_bottleneck:
|
||||
fea_bottleneck = self.bottleneck(fea[:, -1, :])
|
||||
fc_out = self.fc(fea_bottleneck).squeeze()
|
||||
else:
|
||||
fc_out = self.fc_out(fea[:, -1, :]).squeeze()
|
||||
|
||||
out_list_all = out[1]
|
||||
out_list_s, out_list_t = self.get_features(out_list_all)
|
||||
loss_transfer = torch.zeros((1,)).cuda()
|
||||
if weight_mat is None:
|
||||
weight = (1.0 / self.len_seq * torch.ones(self.num_layers, self.len_seq)).cuda()
|
||||
else:
|
||||
weight = weight_mat
|
||||
dist_mat = torch.zeros(self.num_layers, self.len_seq).cuda()
|
||||
for i in range(len(out_list_s)):
|
||||
criterion_transder = TransferLoss(loss_type=self.trans_loss, input_dim=out_list_s[i].shape[2])
|
||||
for j in range(self.len_seq):
|
||||
loss_trans = criterion_transder.compute(out_list_s[i][:, j, :], out_list_t[i][:, j, :])
|
||||
loss_transfer = loss_transfer + weight[i, j] * loss_trans
|
||||
dist_mat[i, j] = loss_trans
|
||||
return fc_out, loss_transfer, dist_mat, weight
|
||||
|
||||
# For Boosting-based
|
||||
def update_weight_Boosting(self, weight_mat, dist_old, dist_new):
|
||||
epsilon = 1e-5
|
||||
dist_old = dist_old.detach()
|
||||
dist_new = dist_new.detach()
|
||||
ind = dist_new > dist_old + epsilon
|
||||
weight_mat[ind] = weight_mat[ind] * (1 + torch.sigmoid(dist_new[ind] - dist_old[ind]))
|
||||
weight_norm = torch.norm(weight_mat, dim=1, p=1)
|
||||
weight_mat = weight_mat / weight_norm.t().unsqueeze(1).repeat(1, self.len_seq)
|
||||
return weight_mat
|
||||
|
||||
def predict(self, x):
|
||||
out = self.gru_features(x, predict=True)
|
||||
fea = out[0]
|
||||
if self.use_bottleneck == True:
|
||||
fea_bottleneck = self.bottleneck(fea[:, -1, :])
|
||||
fc_out = self.fc(fea_bottleneck).squeeze()
|
||||
else:
|
||||
fc_out = self.fc_out(fea[:, -1, :]).squeeze()
|
||||
return fc_out
|
||||
|
||||
|
||||
class TransferLoss(object):
|
||||
def __init__(self, loss_type="cosine", input_dim=512):
|
||||
"""
|
||||
Supported loss_type: mmd(mmd_lin), mmd_rbf, coral, cosine, kl, js, mine, adv
|
||||
"""
|
||||
self.loss_type = loss_type
|
||||
self.input_dim = input_dim
|
||||
|
||||
def compute(self, X, Y):
|
||||
"""Compute adaptation loss
|
||||
|
||||
Arguments:
|
||||
X {tensor} -- source matrix
|
||||
Y {tensor} -- target matrix
|
||||
|
||||
Returns:
|
||||
[tensor] -- transfer loss
|
||||
"""
|
||||
if self.loss_type == "mmd_lin" or self.loss_type == "mmd":
|
||||
mmdloss = MMD_loss(kernel_type="linear")
|
||||
loss = mmdloss(X, Y)
|
||||
elif self.loss_type == "coral":
|
||||
loss = CORAL(X, Y)
|
||||
elif self.loss_type == "cosine" or self.loss_type == "cos":
|
||||
loss = 1 - cosine(X, Y)
|
||||
elif self.loss_type == "kl":
|
||||
loss = kl_div(X, Y)
|
||||
elif self.loss_type == "js":
|
||||
loss = js(X, Y)
|
||||
elif self.loss_type == "mine":
|
||||
mine_model = Mine_estimator(input_dim=self.input_dim, hidden_dim=60).cuda()
|
||||
loss = mine_model(X, Y)
|
||||
elif self.loss_type == "adv":
|
||||
loss = adv(X, Y, input_dim=self.input_dim, hidden_dim=32)
|
||||
elif self.loss_type == "mmd_rbf":
|
||||
mmdloss = MMD_loss(kernel_type="rbf")
|
||||
loss = mmdloss(X, Y)
|
||||
elif self.loss_type == "pairwise":
|
||||
pair_mat = pairwise_dist(X, Y)
|
||||
loss = torch.norm(pair_mat)
|
||||
|
||||
return loss
|
||||
|
||||
|
||||
def cosine(source, target):
|
||||
source, target = source.mean(), target.mean()
|
||||
cos = nn.CosineSimilarity(dim=0)
|
||||
loss = cos(source, target)
|
||||
return loss.mean()
|
||||
|
||||
|
||||
class ReverseLayerF(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, alpha):
|
||||
ctx.alpha = alpha
|
||||
return x.view_as(x)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
output = grad_output.neg() * ctx.alpha
|
||||
return output, None
|
||||
|
||||
|
||||
class Discriminator(nn.Module):
|
||||
def __init__(self, input_dim=256, hidden_dim=256):
|
||||
super(Discriminator, self).__init__()
|
||||
self.input_dim = input_dim
|
||||
self.hidden_dim = hidden_dim
|
||||
self.dis1 = nn.Linear(input_dim, hidden_dim)
|
||||
self.dis2 = nn.Linear(hidden_dim, 1)
|
||||
|
||||
def forward(self, x):
|
||||
x = F.relu(self.dis1(x))
|
||||
x = self.dis2(x)
|
||||
x = torch.sigmoid(x)
|
||||
return x
|
||||
|
||||
|
||||
def adv(source, target, input_dim=256, hidden_dim=512):
|
||||
domain_loss = nn.BCELoss()
|
||||
# !!! Pay attention to .cuda !!!
|
||||
adv_net = Discriminator(input_dim, hidden_dim).cuda()
|
||||
domain_src = torch.ones(len(source)).cuda()
|
||||
domain_tar = torch.zeros(len(target)).cuda()
|
||||
domain_src, domain_tar = domain_src.view(domain_src.shape[0], 1), domain_tar.view(domain_tar.shape[0], 1)
|
||||
reverse_src = ReverseLayerF.apply(source, 1)
|
||||
reverse_tar = ReverseLayerF.apply(target, 1)
|
||||
pred_src = adv_net(reverse_src)
|
||||
pred_tar = adv_net(reverse_tar)
|
||||
loss_s, loss_t = domain_loss(pred_src, domain_src), domain_loss(pred_tar, domain_tar)
|
||||
loss = loss_s + loss_t
|
||||
return loss
|
||||
|
||||
|
||||
def CORAL(source, target):
|
||||
d = source.size(1)
|
||||
ns, nt = source.size(0), target.size(0)
|
||||
|
||||
# source covariance
|
||||
tmp_s = torch.ones((1, ns)).cuda() @ source
|
||||
cs = (source.t() @ source - (tmp_s.t() @ tmp_s) / ns) / (ns - 1)
|
||||
|
||||
# target covariance
|
||||
tmp_t = torch.ones((1, nt)).cuda() @ target
|
||||
ct = (target.t() @ target - (tmp_t.t() @ tmp_t) / nt) / (nt - 1)
|
||||
|
||||
# frobenius norm
|
||||
loss = (cs - ct).pow(2).sum()
|
||||
loss = loss / (4 * d * d)
|
||||
|
||||
return loss
|
||||
|
||||
|
||||
class MMD_loss(nn.Module):
|
||||
def __init__(self, kernel_type="linear", kernel_mul=2.0, kernel_num=5):
|
||||
super(MMD_loss, self).__init__()
|
||||
self.kernel_num = kernel_num
|
||||
self.kernel_mul = kernel_mul
|
||||
self.fix_sigma = None
|
||||
self.kernel_type = kernel_type
|
||||
|
||||
def guassian_kernel(self, source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
|
||||
n_samples = int(source.size()[0]) + int(target.size()[0])
|
||||
total = torch.cat([source, target], dim=0)
|
||||
total0 = total.unsqueeze(0).expand(int(total.size(0)), int(total.size(0)), int(total.size(1)))
|
||||
total1 = total.unsqueeze(1).expand(int(total.size(0)), int(total.size(0)), int(total.size(1)))
|
||||
L2_distance = ((total0 - total1) ** 2).sum(2)
|
||||
if fix_sigma:
|
||||
bandwidth = fix_sigma
|
||||
else:
|
||||
bandwidth = torch.sum(L2_distance.data) / (n_samples ** 2 - n_samples)
|
||||
bandwidth /= kernel_mul ** (kernel_num // 2)
|
||||
bandwidth_list = [bandwidth * (kernel_mul ** i) for i in range(kernel_num)]
|
||||
kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list]
|
||||
return sum(kernel_val)
|
||||
|
||||
def linear_mmd(self, X, Y):
|
||||
delta = X.mean(axis=0) - Y.mean(axis=0)
|
||||
loss = delta.dot(delta.T)
|
||||
return loss
|
||||
|
||||
def forward(self, source, target):
|
||||
if self.kernel_type == "linear":
|
||||
return self.linear_mmd(source, target)
|
||||
elif self.kernel_type == "rbf":
|
||||
batch_size = int(source.size()[0])
|
||||
kernels = self.guassian_kernel(
|
||||
source, target, kernel_mul=self.kernel_mul, kernel_num=self.kernel_num, fix_sigma=self.fix_sigma
|
||||
)
|
||||
with torch.no_grad():
|
||||
XX = torch.mean(kernels[:batch_size, :batch_size])
|
||||
YY = torch.mean(kernels[batch_size:, batch_size:])
|
||||
XY = torch.mean(kernels[:batch_size, batch_size:])
|
||||
YX = torch.mean(kernels[batch_size:, :batch_size])
|
||||
loss = torch.mean(XX + YY - XY - YX)
|
||||
return loss
|
||||
|
||||
|
||||
class Mine_estimator(nn.Module):
|
||||
def __init__(self, input_dim=2048, hidden_dim=512):
|
||||
super(Mine_estimator, self).__init__()
|
||||
self.mine_model = Mine(input_dim, hidden_dim)
|
||||
|
||||
def forward(self, X, Y):
|
||||
Y_shffle = Y[torch.randperm(len(Y))]
|
||||
loss_joint = self.mine_model(X, Y)
|
||||
loss_marginal = self.mine_model(X, Y_shffle)
|
||||
ret = torch.mean(loss_joint) - torch.log(torch.mean(torch.exp(loss_marginal)))
|
||||
loss = -ret
|
||||
return loss
|
||||
|
||||
|
||||
class Mine(nn.Module):
|
||||
def __init__(self, input_dim=2048, hidden_dim=512):
|
||||
super(Mine, self).__init__()
|
||||
self.fc1_x = nn.Linear(input_dim, hidden_dim)
|
||||
self.fc1_y = nn.Linear(input_dim, hidden_dim)
|
||||
self.fc2 = nn.Linear(hidden_dim, 1)
|
||||
|
||||
def forward(self, x, y):
|
||||
h1 = F.leaky_relu(self.fc1_x(x) + self.fc1_y(y))
|
||||
h2 = self.fc2(h1)
|
||||
return h2
|
||||
|
||||
|
||||
def pairwise_dist(X, Y):
|
||||
n, d = X.shape
|
||||
m, _ = Y.shape
|
||||
assert d == Y.shape[1]
|
||||
a = X.unsqueeze(1).expand(n, m, d)
|
||||
b = Y.unsqueeze(0).expand(n, m, d)
|
||||
return torch.pow(a - b, 2).sum(2)
|
||||
|
||||
|
||||
def pairwise_dist_np(X, Y):
|
||||
n, d = X.shape
|
||||
m, _ = Y.shape
|
||||
assert d == Y.shape[1]
|
||||
a = np.expand_dims(X, 1)
|
||||
b = np.expand_dims(Y, 0)
|
||||
a = np.tile(a, (1, m, 1))
|
||||
b = np.tile(b, (n, 1, 1))
|
||||
return np.power(a - b, 2).sum(2)
|
||||
|
||||
|
||||
def pa(X, Y):
|
||||
XY = np.dot(X, Y.T)
|
||||
XX = np.sum(np.square(X), axis=1)
|
||||
XX = np.transpose([XX])
|
||||
YY = np.sum(np.square(Y), axis=1)
|
||||
dist = XX + YY - 2 * XY
|
||||
|
||||
return dist
|
||||
|
||||
|
||||
def kl_div(source, target):
|
||||
if len(source) < len(target):
|
||||
target = target[: len(source)]
|
||||
elif len(source) > len(target):
|
||||
source = source[: len(target)]
|
||||
criterion = nn.KLDivLoss(reduction="batchmean")
|
||||
loss = criterion(source.log(), target)
|
||||
return loss
|
||||
|
||||
|
||||
def js(source, target):
|
||||
if len(source) < len(target):
|
||||
target = target[: len(source)]
|
||||
elif len(source) > len(target):
|
||||
source = source[: len(target)]
|
||||
M = 0.5 * (source + target)
|
||||
loss_1, loss_2 = kl_div(source, M), kl_div(target, M)
|
||||
return 0.5 * (loss_1 + loss_2)
|
||||
Reference in New Issue
Block a user