Remove batchsize and add daily-batch mode

2026-07-21 11:17:34 +08:00 · 2020-11-26 20:19:23 +08:00
parent a108f753d5
commit 99efaadd38
3 changed files with 32 additions and 39 deletions
--- a/examples/benchmarks/GATs/workflow_config_gats.yaml
+++ b/examples/benchmarks/GATs/workflow_config_gats.yaml
@@ -36,7 +36,6 @@ task:
            n_epochs: 200
            lr: 1e-3
            early_stop: 20
-            batch_size: 800
            metric: loss
            loss: mse
            base_model: LSTM
--- a/examples/workflow_by_code_gats.py
+++ b/examples/workflow_by_code_gats.py
@@ -66,13 +66,12 @@ if __name__ == "__main__":
                "n_epochs": 200,
                "lr": 1e-3,
                "early_stop": 20,
-                "batch_size": 800,
                "metric": "loss",
                "loss": "mse",
                "base_model": "LSTM",
                "with_pretrain": True,
                "seed": 0,
-                "GPU": 0,
+                "GPU": "0",
            },
        },
        "dataset": {
@@ -95,7 +94,6 @@ if __name__ == "__main__":
        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
    }

-    # model = train_model(task)
    model = init_instance_by_config(task["model"])
    dataset = init_instance_by_config(task["dataset"])
    model.fit(dataset)
--- a/qlib/contrib/model/pytorch_gats.py
+++ b/qlib/contrib/model/pytorch_gats.py
@@ -9,10 +9,8 @@ import os
 import numpy as np
 import pandas as pd
 import copy
-from sklearn.metrics import roc_auc_score, mean_squared_error
-import logging
-from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, create_save_path, drop_nan_by_y_index
-from ...log import get_module_logger, TimeInspector
+from ...utils import create_save_path
+from ...log import get_module_logger

 import torch
 import torch.nn as nn
@@ -49,7 +47,6 @@ class GAT(Model):
        n_epochs=200,
        lr=0.001,
        metric="IC",
-        batch_size=2000,
        early_stop=20,
        loss="mse",
        base_model="GRU",
@@ -71,7 +68,6 @@ class GAT(Model):
        self.n_epochs = n_epochs
        self.lr = lr
        self.metric = metric
-        self.batch_size = batch_size
        self.early_stop = early_stop
        self.optimizer = optimizer.lower()
        self.loss = loss
@@ -90,7 +86,6 @@ class GAT(Model):
            "\nn_epochs : {}"
            "\nlr : {}"
            "\nmetric : {}"
-            "\nbatch_size : {}"
            "\nearly_stop : {}"
            "\noptimizer : {}"
            "\nloss_type : {}"
@@ -106,7 +101,6 @@ class GAT(Model):
                n_epochs,
                lr,
                metric,
-                batch_size,
                early_stop,
                optimizer.lower(),
                loss,
@@ -165,23 +159,31 @@ class GAT(Model):
    def cal_ic(self, pred, label):
        return torch.mean(pred * label)

+    def get_daily_inter(self, df, shuffle=False):
+        # organize the train data into daily inter as daily batches
+        daily_count = df.groupby(level=0).size().values
+        daily_index = np.roll(np.cumsum(daily_count), 1)
+        daily_index[0] = 0
+        if shuffle:
+            # shuffle the daily inter data
+            daily_shuffle = list(zip(daily_index, daily_count))
+            np.random.shuffle(daily_shuffle)
+            daily_index, daily_count = zip(*daily_shuffle)
+        return  daily_index, daily_count
+
    def train_epoch(self, x_train, y_train):

        x_train_values = x_train.values
        y_train_values = np.squeeze(y_train.values) * 100
-
        self.GAT_model.train()

-        indices = np.arange(len(x_train_values))
-        np.random.shuffle(indices)
+        # organize the train data into daily inter as daily batches
+        daily_index, daily_count = self.get_daily_inter(x_train, shuffle=True)

-        for i in range(len(indices))[:: self.batch_size]:
-
-            if len(indices) - i < self.batch_size:
-                break
-
-            feature = torch.from_numpy(x_train_values[indices[i : i + self.batch_size]]).float()
-            label = torch.from_numpy(y_train_values[indices[i : i + self.batch_size]]).float()
+        for idx, count in zip(daily_index, daily_count):
+            batch = slice(idx, idx + count)
+            feature = torch.from_numpy(x_train_values[batch]).float()
+            label = torch.from_numpy(y_train_values[batch]).float()

            if self.use_gpu:
                feature = feature.cuda()
@@ -206,15 +208,13 @@ class GAT(Model):
        scores = []
        losses = []

-        indices = np.arange(len(x_values))
+        # organize the test data into daily inter as daily batches
+        daily_index, daily_count = self.get_daily_inter(data_x, shuffle=False)

-        for i in range(len(indices))[:: self.batch_size]:
-
-            if len(indices) - i < self.batch_size:
-                break
-
-            feature = torch.from_numpy(x_values[indices[i : i + self.batch_size]]).float()
-            label = torch.from_numpy(y_values[indices[i : i + self.batch_size]]).float()
+        for idx, count in zip(daily_index, daily_count):
+            batch = slice(idx, idx + count)
+            feature = torch.from_numpy(x_values[batch]).float()
+            label = torch.from_numpy(y_values[batch]).float()

            if self.use_gpu:
                feature = feature.cuda()
@@ -247,7 +247,6 @@ class GAT(Model):
        if save_path == None:
            save_path = create_save_path(save_path)
        stop_steps = 0
-        train_loss = 0
        best_score = -np.inf
        best_epoch = 0
        evals_result["train"] = []
@@ -314,17 +313,14 @@ class GAT(Model):
        index = x_test.index
        self.GAT_model.eval()
        x_values = x_test.values
-        sample_num = x_values.shape[0]
        preds = []

-        for begin in range(sample_num)[:: self.batch_size]:
+        # organize the data into daily inter as daily batches
+        daily_index, daily_count = self.get_daily_inter(x_test, shuffle=False)

-            if sample_num - begin < self.batch_size:
-                end = sample_num
-            else:
-                end = begin + self.batch_size
-
-            x_batch = torch.from_numpy(x_values[begin:end]).float()
+        for idx, count in zip(daily_index, daily_count):
+            batch = slice(idx, idx + count)
+            x_batch = torch.from_numpy(x_values[batch]).float()

            if self.use_gpu:
                x_batch = x_batch.cuda()