diff --git a/examples/benchmarks/GATs/workflow_config_gats.yaml b/examples/benchmarks/GATs/workflow_config_gats.yaml index 37bced99d..33aa0fe8d 100644 --- a/examples/benchmarks/GATs/workflow_config_gats.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats.yaml @@ -36,7 +36,6 @@ task: n_epochs: 200 lr: 1e-3 early_stop: 20 - batch_size: 800 metric: loss loss: mse base_model: LSTM diff --git a/examples/workflow_by_code_gats.py b/examples/workflow_by_code_gats.py index b5bad31ec..4bb8b30dd 100644 --- a/examples/workflow_by_code_gats.py +++ b/examples/workflow_by_code_gats.py @@ -66,13 +66,12 @@ if __name__ == "__main__": "n_epochs": 200, "lr": 1e-3, "early_stop": 20, - "batch_size": 800, "metric": "loss", "loss": "mse", "base_model": "LSTM", "with_pretrain": True, "seed": 0, - "GPU": 0, + "GPU": "0", }, }, "dataset": { @@ -95,7 +94,6 @@ if __name__ == "__main__": # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'], } - # model = train_model(task) model = init_instance_by_config(task["model"]) dataset = init_instance_by_config(task["dataset"]) model.fit(dataset) diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 7cdfb571a..ef1ce5cc1 100755 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -9,10 +9,8 @@ import os import numpy as np import pandas as pd import copy -from sklearn.metrics import roc_auc_score, mean_squared_error -import logging -from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index -from ...log import get_module_logger, TimeInspector +from ...utils import create_save_path +from ...log import get_module_logger import torch import torch.nn as nn @@ -49,7 +47,6 @@ class GAT(Model): n_epochs=200, lr=0.001, metric="IC", - batch_size=2000, early_stop=20, loss="mse", base_model="GRU", @@ -71,7 +68,6 @@ class GAT(Model): self.n_epochs = n_epochs self.lr = lr self.metric = metric - self.batch_size = batch_size self.early_stop = early_stop self.optimizer = optimizer.lower() self.loss = loss @@ -90,7 +86,6 @@ class GAT(Model): "\nn_epochs : {}" "\nlr : {}" "\nmetric : {}" - "\nbatch_size : {}" "\nearly_stop : {}" "\noptimizer : {}" "\nloss_type : {}" @@ -106,7 +101,6 @@ class GAT(Model): n_epochs, lr, metric, - batch_size, early_stop, optimizer.lower(), loss, @@ -165,23 +159,31 @@ class GAT(Model): def cal_ic(self, pred, label): return torch.mean(pred * label) + def get_daily_inter(self, df, shuffle=False): + # organize the train data into daily inter as daily batches + daily_count = df.groupby(level=0).size().values + daily_index = np.roll(np.cumsum(daily_count), 1) + daily_index[0] = 0 + if shuffle: + # shuffle the daily inter data + daily_shuffle = list(zip(daily_index, daily_count)) + np.random.shuffle(daily_shuffle) + daily_index, daily_count = zip(*daily_shuffle) + return daily_index, daily_count + def train_epoch(self, x_train, y_train): x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) * 100 - self.GAT_model.train() - indices = np.arange(len(x_train_values)) - np.random.shuffle(indices) + # organize the train data into daily inter as daily batches + daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True) - for i in range(len(indices))[:: self.batch_size]: - - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float() - label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float() + for idx, count in zip(daily_index, daily_count): + batch = slice(idx, idx + count) + feature = torch.from_numpy(x_train_values[batch]).float() + label = torch.from_numpy(y_train_values[batch]).float() if self.use_gpu: feature = feature.cuda() @@ -206,15 +208,13 @@ class GAT(Model): scores = [] losses = [] - indices = np.arange(len(x_values)) + # organize the test data into daily inter as daily batches + daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False) - for i in range(len(indices))[:: self.batch_size]: - - if len(indices) - i < self.batch_size: - break - - feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float() - label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float() + for idx, count in zip(daily_index, daily_count): + batch = slice(idx, idx + count) + feature = torch.from_numpy(x_values[batch]).float() + label = torch.from_numpy(y_values[batch]).float() if self.use_gpu: feature = feature.cuda() @@ -247,7 +247,6 @@ class GAT(Model): if save_path == None: save_path = create_save_path(save_path) stop_steps = 0 - train_loss = 0 best_score = -np.inf best_epoch = 0 evals_result["train"] = [] @@ -314,17 +313,14 @@ class GAT(Model): index = x_test.index self.GAT_model.eval() x_values = x_test.values - sample_num = x_values.shape[0] preds = [] - for begin in range(sample_num)[:: self.batch_size]: + # organize the data into daily inter as daily batches + daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False) - if sample_num - begin < self.batch_size: - end = sample_num - else: - end = begin + self.batch_size - - x_batch = torch.from_numpy(x_values[begin:end]).float() + for idx, count in zip(daily_index, daily_count): + batch = slice(idx, idx + count) + x_batch = torch.from_numpy(x_values[batch]).float() if self.use_gpu: x_batch = x_batch.cuda()