From c2c96a817f3ccd59f100c7a9f6c4dba8a3f2959e Mon Sep 17 00:00:00 2001
From: Wendi Li <wendili.academic@qq.com>
Date: Mon, 23 Nov 2020 16:09:03 +0800
Subject: [PATCH] Format TFT

---
 .../TFT/data_formatters/__init__.py           |   29 +-
 .../benchmarks/TFT/data_formatters/base.py    |  458 ++-
 .../TFT/data_formatters/electricity.py        |  515 ++--
 .../TFT/data_formatters/favorita.py           |  660 ++--
 .../TFT/data_formatters/qlib_Alpha158.py      |  439 ++-
 .../benchmarks/TFT/data_formatters/traffic.py |  234 +-
 .../TFT/data_formatters/volatility.py         |  426 ++-
 .../benchmarks/TFT/expt_settings/__init__.py  |   29 +-
 .../benchmarks/TFT/expt_settings/configs.py   |  218 +-
 examples/benchmarks/TFT/libs/__init__.py      |   29 +-
 .../benchmarks/TFT/libs/hyperparam_opt.py     |  868 +++---
 examples/benchmarks/TFT/libs/tft_model.py     | 2671 ++++++++---------
 examples/benchmarks/TFT/libs/utils.py         |  460 ++-
 examples/benchmarks/TFT/tft.py                |  494 +--
 .../benchmarks/TFT/workflow_by_code_tft.py    |  262 +-
 15 files changed, 3821 insertions(+), 3971 deletions(-)

diff --git a/examples/benchmarks/TFT/data_formatters/__init__.py b/examples/benchmarks/TFT/data_formatters/__init__.py
index 9a1980462..87ec3284f 100644
--- a/examples/benchmarks/TFT/data_formatters/__init__.py
+++ b/examples/benchmarks/TFT/data_formatters/__init__.py
@@ -1,15 +1,14 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/benchmarks/TFT/data_formatters/base.py b/examples/benchmarks/TFT/data_formatters/base.py
index f4ce2764f..c68a192ba 100644
--- a/examples/benchmarks/TFT/data_formatters/base.py
+++ b/examples/benchmarks/TFT/data_formatters/base.py
@@ -1,235 +1,223 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Default data formatting functions for experiments.
-
-For new datasets, inherit form GenericDataFormatter and implement
-all abstract functions.
-
-These dataset-specific methods:
-1) Define the column and input types for tabular dataframes used by model
-2) Perform the necessary input feature engineering & normalisation steps
-3) Reverts the normalisation for predictions
-4) Are responsible for train, validation and test splits
-
-
-"""
-
-import abc
-import enum
-
-
-# Type defintions
-class DataTypes(enum.IntEnum):
-  """Defines numerical types of each column."""
-  REAL_VALUED = 0
-  CATEGORICAL = 1
-  DATE = 2
-
-
-class InputTypes(enum.IntEnum):
-  """Defines input types of each column."""
-  TARGET = 0
-  OBSERVED_INPUT = 1
-  KNOWN_INPUT = 2
-  STATIC_INPUT = 3
-  ID = 4  # Single column used as an entity identifier
-  TIME = 5  # Single column exclusively used as a time index
-
-
-class GenericDataFormatter(abc.ABC):
-  """Abstract base class for all data formatters.
-
-  User can implement the abstract methods below to perform dataset-specific
-  manipulations.
-
-  """
-
-  @abc.abstractmethod
-  def set_scalers(self, df):
-    """Calibrates scalers using the data supplied."""
-    raise NotImplementedError()
-
-  @abc.abstractmethod
-  def transform_inputs(self, df):
-    """Performs feature transformation."""
-    raise NotImplementedError()
-
-  @abc.abstractmethod
-  def format_predictions(self, df):
-    """Reverts any normalisation to give predictions in original scale."""
-    raise NotImplementedError()
-
-  @abc.abstractmethod
-  def split_data(self, df):
-    """Performs the default train, validation and test splits."""
-    raise NotImplementedError()
-
-  @property
-  @abc.abstractmethod
-  def _column_definition(self):
-    """Defines order, input type and data type of each column."""
-    raise NotImplementedError()
-
-  @abc.abstractmethod
-  def get_fixed_params(self):
-    """Defines the fixed parameters used by the model for training.
-
-    Requires the following keys:
-      'total_time_steps': Defines the total number of time steps used by TFT
-      'num_encoder_steps': Determines length of LSTM encoder (i.e. history)
-      'num_epochs': Maximum number of epochs for training
-      'early_stopping_patience': Early stopping param for keras
-      'multiprocessing_workers': # of cpus for data processing
-
-
-    Returns:
-      A dictionary of fixed parameters, e.g.:
-
-      fixed_params = {
-          'total_time_steps': 252 + 5,
-          'num_encoder_steps': 252,
-          'num_epochs': 100,
-          'early_stopping_patience': 5,
-          'multiprocessing_workers': 5,
-      }
-    """
-    raise NotImplementedError
-
-  # Shared functions across data-formatters
-  @property
-  def num_classes_per_cat_input(self):
-    """Returns number of categories per relevant input.
-
-    This is seqeuently required for keras embedding layers.
-    """
-    return self._num_classes_per_cat_input
-
-  def get_num_samples_for_calibration(self):
-    """Gets the default number of training and validation samples.
-
-    Use to sub-sample the data for network calibration and a value of -1 uses
-    all available samples.
-
-    Returns:
-      Tuple of (training samples, validation samples)
-    """
-    return -1, -1
-
-  def get_column_definition(self):
-    """"Returns formatted column definition in order expected by the TFT."""
-
-    column_definition = self._column_definition
-
-    # Sanity checks first.
-    # Ensure only one ID and time column exist
-    def _check_single_column(input_type):
-
-      length = len([tup for tup in column_definition if tup[2] == input_type])
-
-      if length != 1:
-        raise ValueError('Illegal number of inputs ({}) of type {}'.format(
-            length, input_type))
-
-    _check_single_column(InputTypes.ID)
-    _check_single_column(InputTypes.TIME)
-
-    identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
-    time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
-    real_inputs = [
-        tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
-        tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-    categorical_inputs = [
-        tup for tup in column_definition if tup[1] == DataTypes.CATEGORICAL and
-        tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-    return identifier + time + real_inputs + categorical_inputs
-
-  def _get_input_columns(self):
-    """Returns names of all input columns."""
-    return [
-        tup[0]
-        for tup in self.get_column_definition()
-        if tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-  def _get_tft_input_indices(self):
-    """Returns the relevant indexes and input sizes required by TFT."""
-
-    # Functions
-    def _extract_tuples_from_data_type(data_type, defn):
-      return [
-          tup for tup in defn if tup[1] == data_type and
-          tup[2] not in {InputTypes.ID, InputTypes.TIME}
-      ]
-
-    def _get_locations(input_types, defn):
-      return [i for i, tup in enumerate(defn) if tup[2] in input_types]
-
-    # Start extraction
-    column_definition = [
-        tup for tup in self.get_column_definition()
-        if tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-    categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL,
-                                                        column_definition)
-    real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED,
-                                                 column_definition)
-
-    locations = {
-        'input_size':
-            len(self._get_input_columns()),
-        'output_size':
-            len(_get_locations({InputTypes.TARGET}, column_definition)),
-        'category_counts':
-            self.num_classes_per_cat_input,
-        'input_obs_loc':
-            _get_locations({InputTypes.TARGET}, column_definition),
-        'static_input_loc':
-            _get_locations({InputTypes.STATIC_INPUT}, column_definition),
-        'known_regular_inputs':
-            _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
-                           real_inputs),
-        'known_categorical_inputs':
-            _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
-                           categorical_inputs),
-    }
-
-    return locations
-
-  def get_experiment_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    required_keys = [
-        'total_time_steps', 'num_encoder_steps', 'num_epochs',
-        'early_stopping_patience', 'multiprocessing_workers'
-    ]
-
-    fixed_params = self.get_fixed_params()
-
-    for k in required_keys:
-      if k not in fixed_params:
-        raise ValueError('Field {}'.format(k) +
-                         ' missing from fixed parameter definitions!')
-
-    fixed_params['column_definition'] = self.get_column_definition()
-
-    fixed_params.update(self._get_tft_input_indices())
-
-    return fixed_params
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Default data formatting functions for experiments.
+
+For new datasets, inherit form GenericDataFormatter and implement
+all abstract functions.
+
+These dataset-specific methods:
+1) Define the column and input types for tabular dataframes used by model
+2) Perform the necessary input feature engineering & normalisation steps
+3) Reverts the normalisation for predictions
+4) Are responsible for train, validation and test splits
+
+
+"""
+
+import abc
+import enum
+
+
+# Type defintions
+class DataTypes(enum.IntEnum):
+    """Defines numerical types of each column."""
+
+    REAL_VALUED = 0
+    CATEGORICAL = 1
+    DATE = 2
+
+
+class InputTypes(enum.IntEnum):
+    """Defines input types of each column."""
+
+    TARGET = 0
+    OBSERVED_INPUT = 1
+    KNOWN_INPUT = 2
+    STATIC_INPUT = 3
+    ID = 4  # Single column used as an entity identifier
+    TIME = 5  # Single column exclusively used as a time index
+
+
+class GenericDataFormatter(abc.ABC):
+    """Abstract base class for all data formatters.
+
+    User can implement the abstract methods below to perform dataset-specific
+    manipulations.
+
+    """
+
+    @abc.abstractmethod
+    def set_scalers(self, df):
+        """Calibrates scalers using the data supplied."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def transform_inputs(self, df):
+        """Performs feature transformation."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def format_predictions(self, df):
+        """Reverts any normalisation to give predictions in original scale."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def split_data(self, df):
+        """Performs the default train, validation and test splits."""
+        raise NotImplementedError()
+
+    @property
+    @abc.abstractmethod
+    def _column_definition(self):
+        """Defines order, input type and data type of each column."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_fixed_params(self):
+        """Defines the fixed parameters used by the model for training.
+
+        Requires the following keys:
+          'total_time_steps': Defines the total number of time steps used by TFT
+          'num_encoder_steps': Determines length of LSTM encoder (i.e. history)
+          'num_epochs': Maximum number of epochs for training
+          'early_stopping_patience': Early stopping param for keras
+          'multiprocessing_workers': # of cpus for data processing
+
+
+        Returns:
+          A dictionary of fixed parameters, e.g.:
+
+          fixed_params = {
+              'total_time_steps': 252 + 5,
+              'num_encoder_steps': 252,
+              'num_epochs': 100,
+              'early_stopping_patience': 5,
+              'multiprocessing_workers': 5,
+          }
+        """
+        raise NotImplementedError
+
+    # Shared functions across data-formatters
+    @property
+    def num_classes_per_cat_input(self):
+        """Returns number of categories per relevant input.
+
+        This is seqeuently required for keras embedding layers.
+        """
+        return self._num_classes_per_cat_input
+
+    def get_num_samples_for_calibration(self):
+        """Gets the default number of training and validation samples.
+
+        Use to sub-sample the data for network calibration and a value of -1 uses
+        all available samples.
+
+        Returns:
+          Tuple of (training samples, validation samples)
+        """
+        return -1, -1
+
+    def get_column_definition(self):
+        """"Returns formatted column definition in order expected by the TFT."""
+
+        column_definition = self._column_definition
+
+        # Sanity checks first.
+        # Ensure only one ID and time column exist
+        def _check_single_column(input_type):
+
+            length = len([tup for tup in column_definition if tup[2] == input_type])
+
+            if length != 1:
+                raise ValueError("Illegal number of inputs ({}) of type {}".format(length, input_type))
+
+        _check_single_column(InputTypes.ID)
+        _check_single_column(InputTypes.TIME)
+
+        identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
+        time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
+        real_inputs = [
+            tup
+            for tup in column_definition
+            if tup[1] == DataTypes.REAL_VALUED and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+        ]
+        categorical_inputs = [
+            tup
+            for tup in column_definition
+            if tup[1] == DataTypes.CATEGORICAL and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+        ]
+
+        return identifier + time + real_inputs + categorical_inputs
+
+    def _get_input_columns(self):
+        """Returns names of all input columns."""
+        return [tup[0] for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+
+    def _get_tft_input_indices(self):
+        """Returns the relevant indexes and input sizes required by TFT."""
+
+        # Functions
+        def _extract_tuples_from_data_type(data_type, defn):
+            return [tup for tup in defn if tup[1] == data_type and tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+
+        def _get_locations(input_types, defn):
+            return [i for i, tup in enumerate(defn) if tup[2] in input_types]
+
+        # Start extraction
+        column_definition = [
+            tup for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}
+        ]
+
+        categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL, column_definition)
+        real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED, column_definition)
+
+        locations = {
+            "input_size": len(self._get_input_columns()),
+            "output_size": len(_get_locations({InputTypes.TARGET}, column_definition)),
+            "category_counts": self.num_classes_per_cat_input,
+            "input_obs_loc": _get_locations({InputTypes.TARGET}, column_definition),
+            "static_input_loc": _get_locations({InputTypes.STATIC_INPUT}, column_definition),
+            "known_regular_inputs": _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, real_inputs),
+            "known_categorical_inputs": _get_locations(
+                {InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, categorical_inputs
+            ),
+        }
+
+        return locations
+
+    def get_experiment_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        required_keys = [
+            "total_time_steps",
+            "num_encoder_steps",
+            "num_epochs",
+            "early_stopping_patience",
+            "multiprocessing_workers",
+        ]
+
+        fixed_params = self.get_fixed_params()
+
+        for k in required_keys:
+            if k not in fixed_params:
+                raise ValueError("Field {}".format(k) + " missing from fixed parameter definitions!")
+
+        fixed_params["column_definition"] = self.get_column_definition()
+
+        fixed_params.update(self._get_tft_input_indices())
+
+        return fixed_params
diff --git a/examples/benchmarks/TFT/data_formatters/electricity.py b/examples/benchmarks/TFT/data_formatters/electricity.py
index 062a77eb2..366954a71 100644
--- a/examples/benchmarks/TFT/data_formatters/electricity.py
+++ b/examples/benchmarks/TFT/data_formatters/electricity.py
@@ -1,261 +1,254 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Custom formatting functions for Electricity dataset.
-
-Defines dataset specific column definitions and data transformations. Uses
-entity specific z-score normalization.
-"""
-
-import data_formatters.base
-import libs.utils as utils
-import pandas as pd
-import sklearn.preprocessing
-
-GenericDataFormatter = data_formatters.base.GenericDataFormatter
-DataTypes = data_formatters.base.DataTypes
-InputTypes = data_formatters.base.InputTypes
-
-
-class ElectricityFormatter(GenericDataFormatter):
-  """Defines and formats data for the electricity dataset.
-
-  Note that per-entity z-score normalization is used here, and is implemented
-  across functions.
-
-  Attributes:
-    column_definition: Defines input and data type of column used in the
-      experiment.
-    identifiers: Entity identifiers used in experiments.
-  """
-
-  _column_definition = [
-      ('id', DataTypes.REAL_VALUED, InputTypes.ID),
-      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
-      ('power_usage', DataTypes.REAL_VALUED, InputTypes.TARGET),
-      ('hour', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-  ]
-
-  def __init__(self):
-    """Initialises formatter."""
-
-    self.identifiers = None
-    self._real_scalers = None
-    self._cat_scalers = None
-    self._target_scaler = None
-    self._num_classes_per_cat_input = None
-    self._time_steps = self.get_fixed_params()['total_time_steps']
-
-  def split_data(self, df, valid_boundary=1315, test_boundary=1339):
-    """Splits data frame into training-validation-test data frames.
-
-    This also calibrates scaling object, and transforms data for each split.
-
-    Args:
-      df: Source data frame to split.
-      valid_boundary: Starting year for validation data
-      test_boundary: Starting year for test data
-
-    Returns:
-      Tuple of transformed (train, valid, test) data.
-    """
-
-    print('Formatting train-valid-test splits.')
-
-    index = df['days_from_start']
-    train = df.loc[index < valid_boundary]
-    valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
-    test = df.loc[index >= test_boundary - 7]
-
-    self.set_scalers(train)
-
-    return (self.transform_inputs(data) for data in [train, valid, test])
-
-  def set_scalers(self, df):
-    """Calibrates scalers using the data supplied.
-
-    Args:
-      df: Data to use to calibrate scalers.
-    """
-    print('Setting scalers with training data...')
-
-    column_definitions = self.get_column_definition()
-    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
-                                                   column_definitions)
-    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
-                                                       column_definitions)
-
-    # Format real scalers
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    # Initialise scaler caches
-    self._real_scalers = {}
-    self._target_scaler = {}
-    identifiers = []
-    for identifier, sliced in df.groupby(id_column):
-
-      if len(sliced) >= self._time_steps:
-
-        data = sliced[real_inputs].values
-        targets = sliced[[target_column]].values
-        self._real_scalers[identifier] \
-      = sklearn.preprocessing.StandardScaler().fit(data)
-
-        self._target_scaler[identifier] \
-      = sklearn.preprocessing.StandardScaler().fit(targets)
-      identifiers.append(identifier)
-
-    # Format categorical scalers
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    categorical_scalers = {}
-    num_classes = []
-    for col in categorical_inputs:
-      # Set all to str so that we don't have mixed integer/string columns
-      srs = df[col].apply(str)
-      categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
-          srs.values)
-      num_classes.append(srs.nunique())
-
-    # Set categorical scaler outputs
-    self._cat_scalers = categorical_scalers
-    self._num_classes_per_cat_input = num_classes
-
-    # Extract identifiers in case required
-    self.identifiers = identifiers
-
-  def transform_inputs(self, df):
-    """Performs feature transformations.
-
-    This includes both feature engineering, preprocessing and normalisation.
-
-    Args:
-      df: Data frame to transform.
-
-    Returns:
-      Transformed data frame.
-
-    """
-
-    if self._real_scalers is None and self._cat_scalers is None:
-      raise ValueError('Scalers have not been set!')
-
-    # Extract relevant columns
-    column_definitions = self.get_column_definition()
-    id_col = utils.get_single_col_by_input_type(InputTypes.ID,
-                                                column_definitions)
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    # Transform real inputs per entity
-    df_list = []
-    for identifier, sliced in df.groupby(id_col):
-
-      # Filter out any trajectories that are too short
-      if len(sliced) >= self._time_steps:
-        sliced_copy = sliced.copy()
-        sliced_copy[real_inputs] = self._real_scalers[identifier].transform(
-            sliced_copy[real_inputs].values)
-        df_list.append(sliced_copy)
-
-    output = pd.concat(df_list, axis=0)
-
-    # Format categorical inputs
-    for col in categorical_inputs:
-      string_df = df[col].apply(str)
-      output[col] = self._cat_scalers[col].transform(string_df)
-
-    return output
-
-  def format_predictions(self, predictions):
-    """Reverts any normalisation to give predictions in original scale.
-
-    Args:
-      predictions: Dataframe of model predictions.
-
-    Returns:
-      Data frame of unnormalised predictions.
-    """
-
-    if self._target_scaler is None:
-      raise ValueError('Scalers have not been set!')
-
-    column_names = predictions.columns
-
-    df_list = []
-    for identifier, sliced in predictions.groupby('identifier'):
-      sliced_copy = sliced.copy()
-      target_scaler = self._target_scaler[identifier]
-
-      for col in column_names:
-        if col not in {'forecast_time', 'identifier'}:
-          sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
-      df_list.append(sliced_copy)
-
-    output = pd.concat(df_list, axis=0)
-
-    return output
-
-  # Default params
-  def get_fixed_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    fixed_params = {
-        'total_time_steps': 8 * 24,
-        'num_encoder_steps': 7 * 24,
-        'num_epochs': 100,
-        'early_stopping_patience': 5,
-        'multiprocessing_workers': 5
-    }
-
-    return fixed_params
-
-  def get_default_model_params(self):
-    """Returns default optimised model parameters."""
-
-    model_params = {
-        'dropout_rate': 0.1,
-        'hidden_layer_size': 160,
-        'learning_rate': 0.001,
-        'minibatch_size': 64,
-        'max_gradient_norm': 0.01,
-        'num_heads': 4,
-        'stack_size': 1
-    }
-
-    return model_params
-
-  def get_num_samples_for_calibration(self):
-    """Gets the default number of training and validation samples.
-
-    Use to sub-sample the data for network calibration and a value of -1 uses
-    all available samples.
-
-    Returns:
-      Tuple of (training samples, validation samples)
-    """
-    return 450000, 50000
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Custom formatting functions for Electricity dataset.
+
+Defines dataset specific column definitions and data transformations. Uses
+entity specific z-score normalization.
+"""
+
+import data_formatters.base
+import libs.utils as utils
+import pandas as pd
+import sklearn.preprocessing
+
+GenericDataFormatter = data_formatters.base.GenericDataFormatter
+DataTypes = data_formatters.base.DataTypes
+InputTypes = data_formatters.base.InputTypes
+
+
+class ElectricityFormatter(GenericDataFormatter):
+    """Defines and formats data for the electricity dataset.
+
+    Note that per-entity z-score normalization is used here, and is implemented
+    across functions.
+
+    Attributes:
+      column_definition: Defines input and data type of column used in the
+        experiment.
+      identifiers: Entity identifiers used in experiments.
+    """
+
+    _column_definition = [
+        ("id", DataTypes.REAL_VALUED, InputTypes.ID),
+        ("hours_from_start", DataTypes.REAL_VALUED, InputTypes.TIME),
+        ("power_usage", DataTypes.REAL_VALUED, InputTypes.TARGET),
+        ("hour", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("day_of_week", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("hours_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("categorical_id", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+    ]
+
+    def __init__(self):
+        """Initialises formatter."""
+
+        self.identifiers = None
+        self._real_scalers = None
+        self._cat_scalers = None
+        self._target_scaler = None
+        self._num_classes_per_cat_input = None
+        self._time_steps = self.get_fixed_params()["total_time_steps"]
+
+    def split_data(self, df, valid_boundary=1315, test_boundary=1339):
+        """Splits data frame into training-validation-test data frames.
+
+        This also calibrates scaling object, and transforms data for each split.
+
+        Args:
+          df: Source data frame to split.
+          valid_boundary: Starting year for validation data
+          test_boundary: Starting year for test data
+
+        Returns:
+          Tuple of transformed (train, valid, test) data.
+        """
+
+        print("Formatting train-valid-test splits.")
+
+        index = df["days_from_start"]
+        train = df.loc[index < valid_boundary]
+        valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
+        test = df.loc[index >= test_boundary - 7]
+
+        self.set_scalers(train)
+
+        return (self.transform_inputs(data) for data in [train, valid, test])
+
+    def set_scalers(self, df):
+        """Calibrates scalers using the data supplied.
+
+        Args:
+          df: Data to use to calibrate scalers.
+        """
+        print("Setting scalers with training data...")
+
+        column_definitions = self.get_column_definition()
+        id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
+        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
+
+        # Format real scalers
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        # Initialise scaler caches
+        self._real_scalers = {}
+        self._target_scaler = {}
+        identifiers = []
+        for identifier, sliced in df.groupby(id_column):
+
+            if len(sliced) >= self._time_steps:
+
+                data = sliced[real_inputs].values
+                targets = sliced[[target_column]].values
+                self._real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
+
+                self._target_scaler[identifier] = sklearn.preprocessing.StandardScaler().fit(targets)
+            identifiers.append(identifier)
+
+        # Format categorical scalers
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        categorical_scalers = {}
+        num_classes = []
+        for col in categorical_inputs:
+            # Set all to str so that we don't have mixed integer/string columns
+            srs = df[col].apply(str)
+            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
+            num_classes.append(srs.nunique())
+
+        # Set categorical scaler outputs
+        self._cat_scalers = categorical_scalers
+        self._num_classes_per_cat_input = num_classes
+
+        # Extract identifiers in case required
+        self.identifiers = identifiers
+
+    def transform_inputs(self, df):
+        """Performs feature transformations.
+
+        This includes both feature engineering, preprocessing and normalisation.
+
+        Args:
+          df: Data frame to transform.
+
+        Returns:
+          Transformed data frame.
+
+        """
+
+        if self._real_scalers is None and self._cat_scalers is None:
+            raise ValueError("Scalers have not been set!")
+
+        # Extract relevant columns
+        column_definitions = self.get_column_definition()
+        id_col = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        # Transform real inputs per entity
+        df_list = []
+        for identifier, sliced in df.groupby(id_col):
+
+            # Filter out any trajectories that are too short
+            if len(sliced) >= self._time_steps:
+                sliced_copy = sliced.copy()
+                sliced_copy[real_inputs] = self._real_scalers[identifier].transform(sliced_copy[real_inputs].values)
+                df_list.append(sliced_copy)
+
+        output = pd.concat(df_list, axis=0)
+
+        # Format categorical inputs
+        for col in categorical_inputs:
+            string_df = df[col].apply(str)
+            output[col] = self._cat_scalers[col].transform(string_df)
+
+        return output
+
+    def format_predictions(self, predictions):
+        """Reverts any normalisation to give predictions in original scale.
+
+        Args:
+          predictions: Dataframe of model predictions.
+
+        Returns:
+          Data frame of unnormalised predictions.
+        """
+
+        if self._target_scaler is None:
+            raise ValueError("Scalers have not been set!")
+
+        column_names = predictions.columns
+
+        df_list = []
+        for identifier, sliced in predictions.groupby("identifier"):
+            sliced_copy = sliced.copy()
+            target_scaler = self._target_scaler[identifier]
+
+            for col in column_names:
+                if col not in {"forecast_time", "identifier"}:
+                    sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
+            df_list.append(sliced_copy)
+
+        output = pd.concat(df_list, axis=0)
+
+        return output
+
+    # Default params
+    def get_fixed_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        fixed_params = {
+            "total_time_steps": 8 * 24,
+            "num_encoder_steps": 7 * 24,
+            "num_epochs": 100,
+            "early_stopping_patience": 5,
+            "multiprocessing_workers": 5,
+        }
+
+        return fixed_params
+
+    def get_default_model_params(self):
+        """Returns default optimised model parameters."""
+
+        model_params = {
+            "dropout_rate": 0.1,
+            "hidden_layer_size": 160,
+            "learning_rate": 0.001,
+            "minibatch_size": 64,
+            "max_gradient_norm": 0.01,
+            "num_heads": 4,
+            "stack_size": 1,
+        }
+
+        return model_params
+
+    def get_num_samples_for_calibration(self):
+        """Gets the default number of training and validation samples.
+
+        Use to sub-sample the data for network calibration and a value of -1 uses
+        all available samples.
+
+        Returns:
+          Tuple of (training samples, validation samples)
+        """
+        return 450000, 50000
diff --git a/examples/benchmarks/TFT/data_formatters/favorita.py b/examples/benchmarks/TFT/data_formatters/favorita.py
index 26fae632c..bc7a24140 100644
--- a/examples/benchmarks/TFT/data_formatters/favorita.py
+++ b/examples/benchmarks/TFT/data_formatters/favorita.py
@@ -1,327 +1,333 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Custom formatting functions for Favorita dataset.
-
-Defines dataset specific column definitions and data transformations.
-"""
-
-import data_formatters.base
-import libs.utils as utils
-import pandas as pd
-import sklearn.preprocessing
-
-DataTypes = data_formatters.base.DataTypes
-InputTypes = data_formatters.base.InputTypes
-
-
-class FavoritaFormatter(data_formatters.base.GenericDataFormatter):
-  """Defines and formats data for the Favorita dataset.
-
-  Attributes:
-    column_definition: Defines input and data type of column used in the
-      experiment.
-    identifiers: Entity identifiers used in experiments.
-  """
-
-  _column_definition = [
-      ('traj_id', DataTypes.REAL_VALUED, InputTypes.ID),
-      ('date', DataTypes.DATE, InputTypes.TIME),
-      ('log_sales', DataTypes.REAL_VALUED, InputTypes.TARGET),
-      ('onpromotion', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('transactions', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('oil', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('day_of_month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('national_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('regional_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('local_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('open', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('item_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('store_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('city', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('state', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('type', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('cluster', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('family', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('class', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-      ('perishable', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT)
-  ]
-
-  def __init__(self):
-    """Initialises formatter."""
-
-    self.identifiers = None
-    self._real_scalers = None
-    self._cat_scalers = None
-    self._target_scaler = None
-    self._num_classes_per_cat_input = None
-
-  def split_data(self, df, valid_boundary=None, test_boundary=None):
-    """Splits data frame into training-validation-test data frames.
-
-    This also calibrates scaling object, and transforms data for each split.
-
-    Args:
-      df: Source data frame to split.
-      valid_boundary: Starting year for validation data
-      test_boundary: Starting year for test data
-
-    Returns:
-      Tuple of transformed (train, valid, test) data.
-    """
-
-    print('Formatting train-valid-test splits.')
-
-    if valid_boundary is None:
-      valid_boundary = pd.datetime(2015, 12, 1)
-
-    fixed_params = self.get_fixed_params()
-    time_steps = fixed_params['total_time_steps']
-    lookback = fixed_params['num_encoder_steps']
-    forecast_horizon = time_steps - lookback
-
-    df['date'] = pd.to_datetime(df['date'])
-    df_lists = {'train': [], 'valid': [], 'test': []}
-    for _, sliced in df.groupby('traj_id'):
-      index = sliced['date']
-      train = sliced.loc[index < valid_boundary]
-      train_len = len(train)
-      valid_len = train_len + forecast_horizon
-      valid = sliced.iloc[train_len - lookback:valid_len, :]
-      test = sliced.iloc[valid_len - lookback:valid_len + forecast_horizon, :]
-
-      sliced_map = {'train': train, 'valid': valid, 'test': test}
-
-      for k in sliced_map:
-        item = sliced_map[k]
-
-        if len(item) >= time_steps:
-          df_lists[k].append(item)
-
-    dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists}
-
-    train = dfs['train']
-    self.set_scalers(train, set_real=True)
-
-    # Use all data for label encoding  to handle labels not present in training.
-    self.set_scalers(df, set_real=False)
-
-    # Filter out identifiers not present in training (i.e. cold-started items).
-    def filter_ids(frame):
-      identifiers = set(self.identifiers)
-      index = frame['traj_id']
-      return frame.loc[index.apply(lambda x: x in identifiers)]
-
-    valid = filter_ids(dfs['valid'])
-    test = filter_ids(dfs['test'])
-
-    return (self.transform_inputs(data) for data in [train, valid, test])
-
-  def set_scalers(self, df, set_real=True):
-    """Calibrates scalers using the data supplied.
-
-    Label encoding is applied to the entire dataset (i.e. including test),
-    so that unseen labels can be handled at run-time.
-
-    Args:
-      df: Data to use to calibrate scalers.
-      set_real: Whether to fit set real-valued or categorical scalers
-    """
-    print('Setting scalers with training data...')
-
-    column_definitions = self.get_column_definition()
-    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
-                                                   column_definitions)
-    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
-                                                       column_definitions)
-
-    if set_real:
-
-      # Extract identifiers in case required
-      self.identifiers = list(df[id_column].unique())
-
-      # Format real scalers
-      self._real_scalers = {}
-      for col in ['oil', 'transactions', 'log_sales']:
-        self._real_scalers[col] = (df[col].mean(), df[col].std())
-
-      self._target_scaler = (df[target_column].mean(), df[target_column].std())
-
-    else:
-      # Format categorical scalers
-      categorical_inputs = utils.extract_cols_from_data_type(
-          DataTypes.CATEGORICAL, column_definitions,
-          {InputTypes.ID, InputTypes.TIME})
-
-      categorical_scalers = {}
-      num_classes = []
-      if self.identifiers is None:
-        raise ValueError('Scale real-valued inputs first!')
-      id_set = set(self.identifiers)
-      valid_idx = df['traj_id'].apply(lambda x: x in id_set)
-      for col in categorical_inputs:
-        # Set all to str so that we don't have mixed integer/string columns
-        srs = df[col].apply(str).loc[valid_idx]
-        categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
-            srs.values)
-
-        num_classes.append(srs.nunique())
-
-      # Set categorical scaler outputs
-      self._cat_scalers = categorical_scalers
-      self._num_classes_per_cat_input = num_classes
-
-  def transform_inputs(self, df):
-    """Performs feature transformations.
-
-    This includes both feature engineering, preprocessing and normalisation.
-
-    Args:
-      df: Data frame to transform.
-
-    Returns:
-      Transformed data frame.
-
-    """
-    output = df.copy()
-
-    if self._real_scalers is None and self._cat_scalers is None:
-      raise ValueError('Scalers have not been set!')
-
-    column_definitions = self.get_column_definition()
-
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    # Format real inputs
-    for col in ['log_sales', 'oil', 'transactions']:
-      mean, std = self._real_scalers[col]
-      output[col] = (df[col] - mean) / std
-
-      if col == 'log_sales':
-        output[col] = output[col].fillna(0.)  # mean imputation
-
-    # Format categorical inputs
-    for col in categorical_inputs:
-      string_df = df[col].apply(str)
-      output[col] = self._cat_scalers[col].transform(string_df)
-
-    return output
-
-  def format_predictions(self, predictions):
-    """Reverts any normalisation to give predictions in original scale.
-
-    Args:
-      predictions: Dataframe of model predictions.
-
-    Returns:
-      Data frame of unnormalised predictions.
-    """
-    output = predictions.copy()
-
-    column_names = predictions.columns
-    mean, std = self._target_scaler
-    for col in column_names:
-      if col not in {'forecast_time', 'identifier'}:
-        output[col] = (predictions[col] * std) + mean
-
-    return output
-
-  # Default params
-  def get_fixed_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    fixed_params = {
-        'total_time_steps': 120,
-        'num_encoder_steps': 90,
-        'num_epochs': 100,
-        'early_stopping_patience': 5,
-        'multiprocessing_workers': 5
-    }
-
-    return fixed_params
-
-  def get_default_model_params(self):
-    """Returns default optimised model parameters."""
-
-    model_params = {
-        'dropout_rate': 0.1,
-        'hidden_layer_size': 240,
-        'learning_rate': 0.001,
-        'minibatch_size': 128,
-        'max_gradient_norm': 100.,
-        'num_heads': 4,
-        'stack_size': 1
-    }
-
-    return model_params
-
-  def get_num_samples_for_calibration(self):
-    """Gets the default number of training and validation samples.
-
-    Use to sub-sample the data for network calibration and a value of -1 uses
-    all available samples.
-
-    Returns:
-      Tuple of (training samples, validation samples)
-    """
-    return 450000, 50000
-
-  def get_column_definition(self):
-    """"Formats column definition in order expected by the TFT.
-
-    Modified for Favorita to match column order of original experiment.
-
-    Returns:
-      Favorita-specific column definition
-    """
-
-    column_definition = self._column_definition
-
-    # Sanity checks first.
-    # Ensure only one ID and time column exist
-    def _check_single_column(input_type):
-
-      length = len([tup for tup in column_definition if tup[2] == input_type])
-
-      if length != 1:
-        raise ValueError('Illegal number of inputs ({}) of type {}'.format(
-            length, input_type))
-
-    _check_single_column(InputTypes.ID)
-    _check_single_column(InputTypes.TIME)
-
-    identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
-    time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
-    real_inputs = [
-        tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
-        tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-    col_definition_map = {tup[0]: tup for tup in column_definition}
-    col_order = [
-        'item_nbr', 'store_nbr', 'city', 'state', 'type', 'cluster', 'family',
-        'class', 'perishable', 'onpromotion', 'day_of_week', 'national_hol',
-        'regional_hol', 'local_hol'
-    ]
-    categorical_inputs = [
-        col_definition_map[k] for k in col_order if k in col_definition_map
-    ]
-
-    return identifier + time + real_inputs + categorical_inputs
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Custom formatting functions for Favorita dataset.
+
+Defines dataset specific column definitions and data transformations.
+"""
+
+import data_formatters.base
+import libs.utils as utils
+import pandas as pd
+import sklearn.preprocessing
+
+DataTypes = data_formatters.base.DataTypes
+InputTypes = data_formatters.base.InputTypes
+
+
+class FavoritaFormatter(data_formatters.base.GenericDataFormatter):
+    """Defines and formats data for the Favorita dataset.
+
+    Attributes:
+      column_definition: Defines input and data type of column used in the
+        experiment.
+      identifiers: Entity identifiers used in experiments.
+    """
+
+    _column_definition = [
+        ("traj_id", DataTypes.REAL_VALUED, InputTypes.ID),
+        ("date", DataTypes.DATE, InputTypes.TIME),
+        ("log_sales", DataTypes.REAL_VALUED, InputTypes.TARGET),
+        ("onpromotion", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("transactions", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("oil", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("day_of_month", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("month", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("national_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("regional_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("local_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("open", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("item_nbr", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("store_nbr", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("city", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("state", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("type", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("cluster", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("family", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("class", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+        ("perishable", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+    ]
+
+    def __init__(self):
+        """Initialises formatter."""
+
+        self.identifiers = None
+        self._real_scalers = None
+        self._cat_scalers = None
+        self._target_scaler = None
+        self._num_classes_per_cat_input = None
+
+    def split_data(self, df, valid_boundary=None, test_boundary=None):
+        """Splits data frame into training-validation-test data frames.
+
+        This also calibrates scaling object, and transforms data for each split.
+
+        Args:
+          df: Source data frame to split.
+          valid_boundary: Starting year for validation data
+          test_boundary: Starting year for test data
+
+        Returns:
+          Tuple of transformed (train, valid, test) data.
+        """
+
+        print("Formatting train-valid-test splits.")
+
+        if valid_boundary is None:
+            valid_boundary = pd.datetime(2015, 12, 1)
+
+        fixed_params = self.get_fixed_params()
+        time_steps = fixed_params["total_time_steps"]
+        lookback = fixed_params["num_encoder_steps"]
+        forecast_horizon = time_steps - lookback
+
+        df["date"] = pd.to_datetime(df["date"])
+        df_lists = {"train": [], "valid": [], "test": []}
+        for _, sliced in df.groupby("traj_id"):
+            index = sliced["date"]
+            train = sliced.loc[index < valid_boundary]
+            train_len = len(train)
+            valid_len = train_len + forecast_horizon
+            valid = sliced.iloc[train_len - lookback : valid_len, :]
+            test = sliced.iloc[valid_len - lookback : valid_len + forecast_horizon, :]
+
+            sliced_map = {"train": train, "valid": valid, "test": test}
+
+            for k in sliced_map:
+                item = sliced_map[k]
+
+                if len(item) >= time_steps:
+                    df_lists[k].append(item)
+
+        dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists}
+
+        train = dfs["train"]
+        self.set_scalers(train, set_real=True)
+
+        # Use all data for label encoding  to handle labels not present in training.
+        self.set_scalers(df, set_real=False)
+
+        # Filter out identifiers not present in training (i.e. cold-started items).
+        def filter_ids(frame):
+            identifiers = set(self.identifiers)
+            index = frame["traj_id"]
+            return frame.loc[index.apply(lambda x: x in identifiers)]
+
+        valid = filter_ids(dfs["valid"])
+        test = filter_ids(dfs["test"])
+
+        return (self.transform_inputs(data) for data in [train, valid, test])
+
+    def set_scalers(self, df, set_real=True):
+        """Calibrates scalers using the data supplied.
+
+        Label encoding is applied to the entire dataset (i.e. including test),
+        so that unseen labels can be handled at run-time.
+
+        Args:
+          df: Data to use to calibrate scalers.
+          set_real: Whether to fit set real-valued or categorical scalers
+        """
+        print("Setting scalers with training data...")
+
+        column_definitions = self.get_column_definition()
+        id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
+        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
+
+        if set_real:
+
+            # Extract identifiers in case required
+            self.identifiers = list(df[id_column].unique())
+
+            # Format real scalers
+            self._real_scalers = {}
+            for col in ["oil", "transactions", "log_sales"]:
+                self._real_scalers[col] = (df[col].mean(), df[col].std())
+
+            self._target_scaler = (df[target_column].mean(), df[target_column].std())
+
+        else:
+            # Format categorical scalers
+            categorical_inputs = utils.extract_cols_from_data_type(
+                DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+            )
+
+            categorical_scalers = {}
+            num_classes = []
+            if self.identifiers is None:
+                raise ValueError("Scale real-valued inputs first!")
+            id_set = set(self.identifiers)
+            valid_idx = df["traj_id"].apply(lambda x: x in id_set)
+            for col in categorical_inputs:
+                # Set all to str so that we don't have mixed integer/string columns
+                srs = df[col].apply(str).loc[valid_idx]
+                categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
+
+                num_classes.append(srs.nunique())
+
+            # Set categorical scaler outputs
+            self._cat_scalers = categorical_scalers
+            self._num_classes_per_cat_input = num_classes
+
+    def transform_inputs(self, df):
+        """Performs feature transformations.
+
+        This includes both feature engineering, preprocessing and normalisation.
+
+        Args:
+          df: Data frame to transform.
+
+        Returns:
+          Transformed data frame.
+
+        """
+        output = df.copy()
+
+        if self._real_scalers is None and self._cat_scalers is None:
+            raise ValueError("Scalers have not been set!")
+
+        column_definitions = self.get_column_definition()
+
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        # Format real inputs
+        for col in ["log_sales", "oil", "transactions"]:
+            mean, std = self._real_scalers[col]
+            output[col] = (df[col] - mean) / std
+
+            if col == "log_sales":
+                output[col] = output[col].fillna(0.0)  # mean imputation
+
+        # Format categorical inputs
+        for col in categorical_inputs:
+            string_df = df[col].apply(str)
+            output[col] = self._cat_scalers[col].transform(string_df)
+
+        return output
+
+    def format_predictions(self, predictions):
+        """Reverts any normalisation to give predictions in original scale.
+
+        Args:
+          predictions: Dataframe of model predictions.
+
+        Returns:
+          Data frame of unnormalised predictions.
+        """
+        output = predictions.copy()
+
+        column_names = predictions.columns
+        mean, std = self._target_scaler
+        for col in column_names:
+            if col not in {"forecast_time", "identifier"}:
+                output[col] = (predictions[col] * std) + mean
+
+        return output
+
+    # Default params
+    def get_fixed_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        fixed_params = {
+            "total_time_steps": 120,
+            "num_encoder_steps": 90,
+            "num_epochs": 100,
+            "early_stopping_patience": 5,
+            "multiprocessing_workers": 5,
+        }
+
+        return fixed_params
+
+    def get_default_model_params(self):
+        """Returns default optimised model parameters."""
+
+        model_params = {
+            "dropout_rate": 0.1,
+            "hidden_layer_size": 240,
+            "learning_rate": 0.001,
+            "minibatch_size": 128,
+            "max_gradient_norm": 100.0,
+            "num_heads": 4,
+            "stack_size": 1,
+        }
+
+        return model_params
+
+    def get_num_samples_for_calibration(self):
+        """Gets the default number of training and validation samples.
+
+        Use to sub-sample the data for network calibration and a value of -1 uses
+        all available samples.
+
+        Returns:
+          Tuple of (training samples, validation samples)
+        """
+        return 450000, 50000
+
+    def get_column_definition(self):
+        """ "Formats column definition in order expected by the TFT.
+
+        Modified for Favorita to match column order of original experiment.
+
+        Returns:
+          Favorita-specific column definition
+        """
+
+        column_definition = self._column_definition
+
+        # Sanity checks first.
+        # Ensure only one ID and time column exist
+        def _check_single_column(input_type):
+
+            length = len([tup for tup in column_definition if tup[2] == input_type])
+
+            if length != 1:
+                raise ValueError("Illegal number of inputs ({}) of type {}".format(length, input_type))
+
+        _check_single_column(InputTypes.ID)
+        _check_single_column(InputTypes.TIME)
+
+        identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
+        time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
+        real_inputs = [
+            tup
+            for tup in column_definition
+            if tup[1] == DataTypes.REAL_VALUED and tup[2] not in {InputTypes.ID, InputTypes.TIME}
+        ]
+
+        col_definition_map = {tup[0]: tup for tup in column_definition}
+        col_order = [
+            "item_nbr",
+            "store_nbr",
+            "city",
+            "state",
+            "type",
+            "cluster",
+            "family",
+            "class",
+            "perishable",
+            "onpromotion",
+            "day_of_week",
+            "national_hol",
+            "regional_hol",
+            "local_hol",
+        ]
+        categorical_inputs = [col_definition_map[k] for k in col_order if k in col_definition_map]
+
+        return identifier + time + real_inputs + categorical_inputs
diff --git a/examples/benchmarks/TFT/data_formatters/qlib_Alpha158.py b/examples/benchmarks/TFT/data_formatters/qlib_Alpha158.py
index aa081fb17..e9236d041 100644
--- a/examples/benchmarks/TFT/data_formatters/qlib_Alpha158.py
+++ b/examples/benchmarks/TFT/data_formatters/qlib_Alpha158.py
@@ -1,220 +1,219 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Custom formatting functions for Alpha158 dataset.
-
-Defines dataset specific column definitions and data transformations.
-"""
-
-import data_formatters.base
-import libs.utils as utils
-import sklearn.preprocessing
-
-GenericDataFormatter = data_formatters.base.GenericDataFormatter
-DataTypes = data_formatters.base.DataTypes
-InputTypes = data_formatters.base.InputTypes
-
-class Alpha158Formatter(GenericDataFormatter):
-  """Defines and formats data for the Alpha158 dataset.
-
-  Attributes:
-    column_definition: Defines input and data type of column used in the
-      experiment.
-    identifiers: Entity identifiers used in experiments.
-  """
-
-  _column_definition = [
-      ('instrument', DataTypes.CATEGORICAL, InputTypes.ID),
-      ('LABEL0', DataTypes.REAL_VALUED, InputTypes.TARGET),
-      ('date', DataTypes.DATE, InputTypes.TIME),
-      ('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      # Selected 10 features
-      ('RESI5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('WVMA5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('RSQR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('KLEN', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('RSQR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('CORR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('CORD5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('CORR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('ROC60', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('RESI10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('const', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-    ]
-
-  def __init__(self):
-    """Initialises formatter."""
-
-    self.identifiers = None
-    self._real_scalers = None
-    self._cat_scalers = None
-    self._target_scaler = None
-    self._num_classes_per_cat_input = None
-
-  def split_data(self, df, valid_boundary=2016, test_boundary=2018):
-    """Splits data frame into training-validation-test data frames.
-
-    This also calibrates scaling object, and transforms data for each split.
-
-    Args:
-      df: Source data frame to split.
-      valid_boundary: Starting year for validation data
-      test_boundary: Starting year for test data
-
-    Returns:
-      Tuple of transformed (train, valid, test) data.
-    """
-
-    print('Formatting train-valid-test splits.')
-
-    index = df['year']
-    train = df.loc[index < valid_boundary]
-    valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
-    test = df.loc[index >= test_boundary]
-
-    self.set_scalers(train)
-
-    return (self.transform_inputs(data) for data in [train, valid, test])
-
-  def set_scalers(self, df):
-    """Calibrates scalers using the data supplied.
-
-    Args:
-      df: Data to use to calibrate scalers.
-    """
-    print('Setting scalers with training data...')
-
-    column_definitions = self.get_column_definition()
-    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
-                                                   column_definitions)
-    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
-                                                       column_definitions)
-
-    # Extract identifiers in case required
-    self.identifiers = list(df[id_column].unique())
-
-    # Format real scalers
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    data = df[real_inputs].values
-    self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
-    self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
-        df[[target_column]].values)  # used for predictions
-
-    # Format categorical scalers
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    categorical_scalers = {}
-    num_classes = []
-    for col in categorical_inputs:
-      # Set all to str so that we don't have mixed integer/string columns
-      srs = df[col].apply(str)
-      categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
-          srs.values)
-      num_classes.append(srs.nunique())
-
-    # Set categorical scaler outputs
-    self._cat_scalers = categorical_scalers
-    self._num_classes_per_cat_input = num_classes
-
-  def transform_inputs(self, df):
-    """Performs feature transformations.
-
-    This includes both feature engineering, preprocessing and normalisation.
-
-    Args:
-      df: Data frame to transform.
-
-    Returns:
-      Transformed data frame.
-
-    """
-    output = df.copy()
-
-    if self._real_scalers is None and self._cat_scalers is None:
-      raise ValueError('Scalers have not been set!')
-
-    column_definitions = self.get_column_definition()
-
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    # Format real inputs
-    output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
-
-    # Format categorical inputs
-    for col in categorical_inputs:
-      string_df = df[col].apply(str)
-      output[col] = self._cat_scalers[col].transform(string_df)
-
-    return output
-
-  def format_predictions(self, predictions):
-    """Reverts any normalisation to give predictions in original scale.
-
-    Args:
-      predictions: Dataframe of model predictions.
-
-    Returns:
-      Data frame of unnormalised predictions.
-    """
-    output = predictions.copy()
-
-    column_names = predictions.columns
-
-    for col in column_names:
-      if col not in {'forecast_time', 'identifier'}:
-        output[col] = self._target_scaler.inverse_transform(predictions[col])
-
-    return output
-
-  # Default params
-  def get_fixed_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    fixed_params = {
-        'total_time_steps': 16 + 6,
-        'num_encoder_steps': 16,
-        'num_epochs': 100,
-        'early_stopping_patience': 5,
-        'multiprocessing_workers': 5,
-    }
-
-    return fixed_params
-
-  def get_default_model_params(self):
-    """Returns default optimised model parameters."""
-
-    model_params = {
-        'dropout_rate': 0.3,
-        'hidden_layer_size': 160,
-        'learning_rate': 0.01,
-        'minibatch_size': 64,
-        'max_gradient_norm': 0.01,
-        'num_heads': 1,
-        'stack_size': 1
-    }
-
-    return model_params
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Custom formatting functions for Alpha158 dataset.
+
+Defines dataset specific column definitions and data transformations.
+"""
+
+import data_formatters.base
+import libs.utils as utils
+import sklearn.preprocessing
+
+GenericDataFormatter = data_formatters.base.GenericDataFormatter
+DataTypes = data_formatters.base.DataTypes
+InputTypes = data_formatters.base.InputTypes
+
+
+class Alpha158Formatter(GenericDataFormatter):
+    """Defines and formats data for the Alpha158 dataset.
+
+    Attributes:
+      column_definition: Defines input and data type of column used in the
+        experiment.
+      identifiers: Entity identifiers used in experiments.
+    """
+
+    _column_definition = [
+        ("instrument", DataTypes.CATEGORICAL, InputTypes.ID),
+        ("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET),
+        ("date", DataTypes.DATE, InputTypes.TIME),
+        ("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        # Selected 10 features
+        ("RESI5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("WVMA5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("RSQR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("KLEN", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("RSQR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("CORR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("CORD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("CORR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("ROC60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("RESI10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+    ]
+
+    def __init__(self):
+        """Initialises formatter."""
+
+        self.identifiers = None
+        self._real_scalers = None
+        self._cat_scalers = None
+        self._target_scaler = None
+        self._num_classes_per_cat_input = None
+
+    def split_data(self, df, valid_boundary=2016, test_boundary=2018):
+        """Splits data frame into training-validation-test data frames.
+
+        This also calibrates scaling object, and transforms data for each split.
+
+        Args:
+          df: Source data frame to split.
+          valid_boundary: Starting year for validation data
+          test_boundary: Starting year for test data
+
+        Returns:
+          Tuple of transformed (train, valid, test) data.
+        """
+
+        print("Formatting train-valid-test splits.")
+
+        index = df["year"]
+        train = df.loc[index < valid_boundary]
+        valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
+        test = df.loc[index >= test_boundary]
+
+        self.set_scalers(train)
+
+        return (self.transform_inputs(data) for data in [train, valid, test])
+
+    def set_scalers(self, df):
+        """Calibrates scalers using the data supplied.
+
+        Args:
+          df: Data to use to calibrate scalers.
+        """
+        print("Setting scalers with training data...")
+
+        column_definitions = self.get_column_definition()
+        id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
+        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
+
+        # Extract identifiers in case required
+        self.identifiers = list(df[id_column].unique())
+
+        # Format real scalers
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        data = df[real_inputs].values
+        self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
+        self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
+            df[[target_column]].values
+        )  # used for predictions
+
+        # Format categorical scalers
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        categorical_scalers = {}
+        num_classes = []
+        for col in categorical_inputs:
+            # Set all to str so that we don't have mixed integer/string columns
+            srs = df[col].apply(str)
+            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
+            num_classes.append(srs.nunique())
+
+        # Set categorical scaler outputs
+        self._cat_scalers = categorical_scalers
+        self._num_classes_per_cat_input = num_classes
+
+    def transform_inputs(self, df):
+        """Performs feature transformations.
+
+        This includes both feature engineering, preprocessing and normalisation.
+
+        Args:
+          df: Data frame to transform.
+
+        Returns:
+          Transformed data frame.
+
+        """
+        output = df.copy()
+
+        if self._real_scalers is None and self._cat_scalers is None:
+            raise ValueError("Scalers have not been set!")
+
+        column_definitions = self.get_column_definition()
+
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        # Format real inputs
+        output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
+
+        # Format categorical inputs
+        for col in categorical_inputs:
+            string_df = df[col].apply(str)
+            output[col] = self._cat_scalers[col].transform(string_df)
+
+        return output
+
+    def format_predictions(self, predictions):
+        """Reverts any normalisation to give predictions in original scale.
+
+        Args:
+          predictions: Dataframe of model predictions.
+
+        Returns:
+          Data frame of unnormalised predictions.
+        """
+        output = predictions.copy()
+
+        column_names = predictions.columns
+
+        for col in column_names:
+            if col not in {"forecast_time", "identifier"}:
+                output[col] = self._target_scaler.inverse_transform(predictions[col])
+
+        return output
+
+    # Default params
+    def get_fixed_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        fixed_params = {
+            "total_time_steps": 16 + 6,
+            "num_encoder_steps": 16,
+            "num_epochs": 100,
+            "early_stopping_patience": 5,
+            "multiprocessing_workers": 5,
+        }
+
+        return fixed_params
+
+    def get_default_model_params(self):
+        """Returns default optimised model parameters."""
+
+        model_params = {
+            "dropout_rate": 0.3,
+            "hidden_layer_size": 160,
+            "learning_rate": 0.01,
+            "minibatch_size": 64,
+            "max_gradient_norm": 0.01,
+            "num_heads": 1,
+            "stack_size": 1,
+        }
+
+        return model_params
diff --git a/examples/benchmarks/TFT/data_formatters/traffic.py b/examples/benchmarks/TFT/data_formatters/traffic.py
index 49401e5cc..ee8ef2e5d 100644
--- a/examples/benchmarks/TFT/data_formatters/traffic.py
+++ b/examples/benchmarks/TFT/data_formatters/traffic.py
@@ -1,117 +1,117 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Custom formatting functions for Traffic dataset.
-
-Defines dataset specific column definitions and data transformations. This also
-performs z-score normalization across the entire dataset, hence re-uses most of
-the same functions as volatility.
-"""
-
-import data_formatters.base
-import data_formatters.volatility
-
-VolatilityFormatter = data_formatters.volatility.VolatilityFormatter
-DataTypes = data_formatters.base.DataTypes
-InputTypes = data_formatters.base.InputTypes
-
-
-class TrafficFormatter(VolatilityFormatter):
-  """Defines and formats data for the traffic dataset.
-
-  This also performs z-score normalization across the entire dataset, hence
-  re-uses most of the same functions as volatility.
-
-  Attributes:
-    column_definition: Defines input and data type of column used in the
-      experiment.
-    identifiers: Entity identifiers used in experiments.
-  """
-
-  _column_definition = [
-      ('id', DataTypes.REAL_VALUED, InputTypes.ID),
-      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
-      ('values', DataTypes.REAL_VALUED, InputTypes.TARGET),
-      ('time_on_day', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-  ]
-
-  def split_data(self, df, valid_boundary=151, test_boundary=166):
-    """Splits data frame into training-validation-test data frames.
-
-    This also calibrates scaling object, and transforms data for each split.
-
-    Args:
-      df: Source data frame to split.
-      valid_boundary: Starting year for validation data
-      test_boundary: Starting year for test data
-
-    Returns:
-      Tuple of transformed (train, valid, test) data.
-    """
-
-    print('Formatting train-valid-test splits.')
-
-    index = df['sensor_day']
-    train = df.loc[index < valid_boundary]
-    valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
-    test = df.loc[index >= test_boundary - 7]
-
-    self.set_scalers(train)
-
-    return (self.transform_inputs(data) for data in [train, valid, test])
-
-  # Default params
-  def get_fixed_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    fixed_params = {
-        'total_time_steps': 8 * 24,
-        'num_encoder_steps': 7 * 24,
-        'num_epochs': 100,
-        'early_stopping_patience': 5,
-        'multiprocessing_workers': 5
-    }
-
-    return fixed_params
-
-  def get_default_model_params(self):
-    """Returns default optimised model parameters."""
-
-    model_params = {
-        'dropout_rate': 0.3,
-        'hidden_layer_size': 320,
-        'learning_rate': 0.001,
-        'minibatch_size': 128,
-        'max_gradient_norm': 100.,
-        'num_heads': 4,
-        'stack_size': 1
-    }
-
-    return model_params
-
-  def get_num_samples_for_calibration(self):
-    """Gets the default number of training and validation samples.
-
-    Use to sub-sample the data for network calibration and a value of -1 uses
-    all available samples.
-
-    Returns:
-      Tuple of (training samples, validation samples)
-    """
-    return 450000, 50000
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Custom formatting functions for Traffic dataset.
+
+Defines dataset specific column definitions and data transformations. This also
+performs z-score normalization across the entire dataset, hence re-uses most of
+the same functions as volatility.
+"""
+
+import data_formatters.base
+import data_formatters.volatility
+
+VolatilityFormatter = data_formatters.volatility.VolatilityFormatter
+DataTypes = data_formatters.base.DataTypes
+InputTypes = data_formatters.base.InputTypes
+
+
+class TrafficFormatter(VolatilityFormatter):
+    """Defines and formats data for the traffic dataset.
+
+    This also performs z-score normalization across the entire dataset, hence
+    re-uses most of the same functions as volatility.
+
+    Attributes:
+      column_definition: Defines input and data type of column used in the
+        experiment.
+      identifiers: Entity identifiers used in experiments.
+    """
+
+    _column_definition = [
+        ("id", DataTypes.REAL_VALUED, InputTypes.ID),
+        ("hours_from_start", DataTypes.REAL_VALUED, InputTypes.TIME),
+        ("values", DataTypes.REAL_VALUED, InputTypes.TARGET),
+        ("time_on_day", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("day_of_week", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("hours_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("categorical_id", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+    ]
+
+    def split_data(self, df, valid_boundary=151, test_boundary=166):
+        """Splits data frame into training-validation-test data frames.
+
+        This also calibrates scaling object, and transforms data for each split.
+
+        Args:
+          df: Source data frame to split.
+          valid_boundary: Starting year for validation data
+          test_boundary: Starting year for test data
+
+        Returns:
+          Tuple of transformed (train, valid, test) data.
+        """
+
+        print("Formatting train-valid-test splits.")
+
+        index = df["sensor_day"]
+        train = df.loc[index < valid_boundary]
+        valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
+        test = df.loc[index >= test_boundary - 7]
+
+        self.set_scalers(train)
+
+        return (self.transform_inputs(data) for data in [train, valid, test])
+
+    # Default params
+    def get_fixed_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        fixed_params = {
+            "total_time_steps": 8 * 24,
+            "num_encoder_steps": 7 * 24,
+            "num_epochs": 100,
+            "early_stopping_patience": 5,
+            "multiprocessing_workers": 5,
+        }
+
+        return fixed_params
+
+    def get_default_model_params(self):
+        """Returns default optimised model parameters."""
+
+        model_params = {
+            "dropout_rate": 0.3,
+            "hidden_layer_size": 320,
+            "learning_rate": 0.001,
+            "minibatch_size": 128,
+            "max_gradient_norm": 100.0,
+            "num_heads": 4,
+            "stack_size": 1,
+        }
+
+        return model_params
+
+    def get_num_samples_for_calibration(self):
+        """Gets the default number of training and validation samples.
+
+        Use to sub-sample the data for network calibration and a value of -1 uses
+        all available samples.
+
+        Returns:
+          Tuple of (training samples, validation samples)
+        """
+        return 450000, 50000
diff --git a/examples/benchmarks/TFT/data_formatters/volatility.py b/examples/benchmarks/TFT/data_formatters/volatility.py
index 37923a275..b3ddf09fd 100644
--- a/examples/benchmarks/TFT/data_formatters/volatility.py
+++ b/examples/benchmarks/TFT/data_formatters/volatility.py
@@ -1,214 +1,212 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Custom formatting functions for Volatility dataset.
-
-Defines dataset specific column definitions and data transformations.
-"""
-
-import data_formatters.base
-import libs.utils as utils
-import sklearn.preprocessing
-
-GenericDataFormatter = data_formatters.base.GenericDataFormatter
-DataTypes = data_formatters.base.DataTypes
-InputTypes = data_formatters.base.InputTypes
-
-
-class VolatilityFormatter(GenericDataFormatter):
-  """Defines and formats data for the volatility dataset.
-
-  Attributes:
-    column_definition: Defines input and data type of column used in the
-      experiment.
-    identifiers: Entity identifiers used in experiments.
-  """
-
-  _column_definition = [
-      ('Symbol', DataTypes.CATEGORICAL, InputTypes.ID),
-      ('date', DataTypes.DATE, InputTypes.TIME),
-      ('log_vol', DataTypes.REAL_VALUED, InputTypes.TARGET),
-      ('open_to_close', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
-      ('days_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
-      ('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('day_of_month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('week_of_year', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
-      ('Region', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
-  ]
-
-  def __init__(self):
-    """Initialises formatter."""
-
-    self.identifiers = None
-    self._real_scalers = None
-    self._cat_scalers = None
-    self._target_scaler = None
-    self._num_classes_per_cat_input = None
-
-  def split_data(self, df, valid_boundary=2016, test_boundary=2018):
-    """Splits data frame into training-validation-test data frames.
-
-    This also calibrates scaling object, and transforms data for each split.
-
-    Args:
-      df: Source data frame to split.
-      valid_boundary: Starting year for validation data
-      test_boundary: Starting year for test data
-
-    Returns:
-      Tuple of transformed (train, valid, test) data.
-    """
-
-    print('Formatting train-valid-test splits.')
-
-    index = df['year']
-    train = df.loc[index < valid_boundary]
-    valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
-    test = df.loc[index >= test_boundary]
-
-    self.set_scalers(train)
-
-    return (self.transform_inputs(data) for data in [train, valid, test])
-
-  def set_scalers(self, df):
-    """Calibrates scalers using the data supplied.
-
-    Args:
-      df: Data to use to calibrate scalers.
-    """
-    print('Setting scalers with training data...')
-
-    column_definitions = self.get_column_definition()
-    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
-                                                   column_definitions)
-    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
-                                                       column_definitions)
-
-    # Extract identifiers in case required
-    self.identifiers = list(df[id_column].unique())
-
-    # Format real scalers
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    data = df[real_inputs].values
-    self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
-    self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
-        df[[target_column]].values)  # used for predictions
-
-    # Format categorical scalers
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    categorical_scalers = {}
-    num_classes = []
-    for col in categorical_inputs:
-      # Set all to str so that we don't have mixed integer/string columns
-      srs = df[col].apply(str)
-      categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
-          srs.values)
-      num_classes.append(srs.nunique())
-
-    # Set categorical scaler outputs
-    self._cat_scalers = categorical_scalers
-    self._num_classes_per_cat_input = num_classes
-
-  def transform_inputs(self, df):
-    """Performs feature transformations.
-
-    This includes both feature engineering, preprocessing and normalisation.
-
-    Args:
-      df: Data frame to transform.
-
-    Returns:
-      Transformed data frame.
-
-    """
-    output = df.copy()
-
-    if self._real_scalers is None and self._cat_scalers is None:
-      raise ValueError('Scalers have not been set!')
-
-    column_definitions = self.get_column_definition()
-
-    real_inputs = utils.extract_cols_from_data_type(
-        DataTypes.REAL_VALUED, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-    categorical_inputs = utils.extract_cols_from_data_type(
-        DataTypes.CATEGORICAL, column_definitions,
-        {InputTypes.ID, InputTypes.TIME})
-
-    # Format real inputs
-    output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
-
-    # Format categorical inputs
-    for col in categorical_inputs:
-      string_df = df[col].apply(str)
-      output[col] = self._cat_scalers[col].transform(string_df)
-
-    return output
-
-  def format_predictions(self, predictions):
-    """Reverts any normalisation to give predictions in original scale.
-
-    Args:
-      predictions: Dataframe of model predictions.
-
-    Returns:
-      Data frame of unnormalised predictions.
-    """
-    output = predictions.copy()
-
-    column_names = predictions.columns
-
-    for col in column_names:
-      if col not in {'forecast_time', 'identifier'}:
-        output[col] = self._target_scaler.inverse_transform(predictions[col])
-
-    return output
-
-  # Default params
-  def get_fixed_params(self):
-    """Returns fixed model parameters for experiments."""
-
-    fixed_params = {
-        'total_time_steps': 252 + 5,
-        'num_encoder_steps': 252,
-        'num_epochs': 100,
-        'early_stopping_patience': 5,
-        'multiprocessing_workers': 5,
-    }
-
-    return fixed_params
-
-  def get_default_model_params(self):
-    """Returns default optimised model parameters."""
-
-    model_params = {
-        'dropout_rate': 0.3,
-        'hidden_layer_size': 160,
-        'learning_rate': 0.01,
-        'minibatch_size': 64,
-        'max_gradient_norm': 0.01,
-        'num_heads': 1,
-        'stack_size': 1
-    }
-
-    return model_params
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Custom formatting functions for Volatility dataset.
+
+Defines dataset specific column definitions and data transformations.
+"""
+
+import data_formatters.base
+import libs.utils as utils
+import sklearn.preprocessing
+
+GenericDataFormatter = data_formatters.base.GenericDataFormatter
+DataTypes = data_formatters.base.DataTypes
+InputTypes = data_formatters.base.InputTypes
+
+
+class VolatilityFormatter(GenericDataFormatter):
+    """Defines and formats data for the volatility dataset.
+
+    Attributes:
+      column_definition: Defines input and data type of column used in the
+        experiment.
+      identifiers: Entity identifiers used in experiments.
+    """
+
+    _column_definition = [
+        ("Symbol", DataTypes.CATEGORICAL, InputTypes.ID),
+        ("date", DataTypes.DATE, InputTypes.TIME),
+        ("log_vol", DataTypes.REAL_VALUED, InputTypes.TARGET),
+        ("open_to_close", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
+        ("days_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
+        ("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("day_of_month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("week_of_year", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
+        ("Region", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
+    ]
+
+    def __init__(self):
+        """Initialises formatter."""
+
+        self.identifiers = None
+        self._real_scalers = None
+        self._cat_scalers = None
+        self._target_scaler = None
+        self._num_classes_per_cat_input = None
+
+    def split_data(self, df, valid_boundary=2016, test_boundary=2018):
+        """Splits data frame into training-validation-test data frames.
+
+        This also calibrates scaling object, and transforms data for each split.
+
+        Args:
+          df: Source data frame to split.
+          valid_boundary: Starting year for validation data
+          test_boundary: Starting year for test data
+
+        Returns:
+          Tuple of transformed (train, valid, test) data.
+        """
+
+        print("Formatting train-valid-test splits.")
+
+        index = df["year"]
+        train = df.loc[index < valid_boundary]
+        valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
+        test = df.loc[index >= test_boundary]
+
+        self.set_scalers(train)
+
+        return (self.transform_inputs(data) for data in [train, valid, test])
+
+    def set_scalers(self, df):
+        """Calibrates scalers using the data supplied.
+
+        Args:
+          df: Data to use to calibrate scalers.
+        """
+        print("Setting scalers with training data...")
+
+        column_definitions = self.get_column_definition()
+        id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
+        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
+
+        # Extract identifiers in case required
+        self.identifiers = list(df[id_column].unique())
+
+        # Format real scalers
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        data = df[real_inputs].values
+        self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
+        self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
+            df[[target_column]].values
+        )  # used for predictions
+
+        # Format categorical scalers
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        categorical_scalers = {}
+        num_classes = []
+        for col in categorical_inputs:
+            # Set all to str so that we don't have mixed integer/string columns
+            srs = df[col].apply(str)
+            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
+            num_classes.append(srs.nunique())
+
+        # Set categorical scaler outputs
+        self._cat_scalers = categorical_scalers
+        self._num_classes_per_cat_input = num_classes
+
+    def transform_inputs(self, df):
+        """Performs feature transformations.
+
+        This includes both feature engineering, preprocessing and normalisation.
+
+        Args:
+          df: Data frame to transform.
+
+        Returns:
+          Transformed data frame.
+
+        """
+        output = df.copy()
+
+        if self._real_scalers is None and self._cat_scalers is None:
+            raise ValueError("Scalers have not been set!")
+
+        column_definitions = self.get_column_definition()
+
+        real_inputs = utils.extract_cols_from_data_type(
+            DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+        categorical_inputs = utils.extract_cols_from_data_type(
+            DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
+        )
+
+        # Format real inputs
+        output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
+
+        # Format categorical inputs
+        for col in categorical_inputs:
+            string_df = df[col].apply(str)
+            output[col] = self._cat_scalers[col].transform(string_df)
+
+        return output
+
+    def format_predictions(self, predictions):
+        """Reverts any normalisation to give predictions in original scale.
+
+        Args:
+          predictions: Dataframe of model predictions.
+
+        Returns:
+          Data frame of unnormalised predictions.
+        """
+        output = predictions.copy()
+
+        column_names = predictions.columns
+
+        for col in column_names:
+            if col not in {"forecast_time", "identifier"}:
+                output[col] = self._target_scaler.inverse_transform(predictions[col])
+
+        return output
+
+    # Default params
+    def get_fixed_params(self):
+        """Returns fixed model parameters for experiments."""
+
+        fixed_params = {
+            "total_time_steps": 252 + 5,
+            "num_encoder_steps": 252,
+            "num_epochs": 100,
+            "early_stopping_patience": 5,
+            "multiprocessing_workers": 5,
+        }
+
+        return fixed_params
+
+    def get_default_model_params(self):
+        """Returns default optimised model parameters."""
+
+        model_params = {
+            "dropout_rate": 0.3,
+            "hidden_layer_size": 160,
+            "learning_rate": 0.01,
+            "minibatch_size": 64,
+            "max_gradient_norm": 0.01,
+            "num_heads": 1,
+            "stack_size": 1,
+        }
+
+        return model_params
diff --git a/examples/benchmarks/TFT/expt_settings/__init__.py b/examples/benchmarks/TFT/expt_settings/__init__.py
index 9a1980462..87ec3284f 100644
--- a/examples/benchmarks/TFT/expt_settings/__init__.py
+++ b/examples/benchmarks/TFT/expt_settings/__init__.py
@@ -1,15 +1,14 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/benchmarks/TFT/expt_settings/configs.py b/examples/benchmarks/TFT/expt_settings/configs.py
index d28a39bb0..d1891a002 100644
--- a/examples/benchmarks/TFT/expt_settings/configs.py
+++ b/examples/benchmarks/TFT/expt_settings/configs.py
@@ -1,111 +1,107 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Default configs for TFT experiments.
-
-Contains the default output paths for data, serialised models and predictions
-for the main experiments used in the publication.
-"""
-
-import os
-
-import data_formatters.electricity
-import data_formatters.favorita
-import data_formatters.traffic
-import data_formatters.volatility
-import data_formatters.qlib_Alpha158
-
-
-class ExperimentConfig(object):
-  """Defines experiment configs and paths to outputs.
-
-  Attributes:
-    root_folder: Root folder to contain all experimental outputs.
-    experiment: Name of experiment to run.
-    data_folder: Folder to store data for experiment.
-    model_folder: Folder to store serialised models.
-    results_folder: Folder to store results.
-    data_csv_path: Path to primary data csv file used in experiment.
-    hyperparam_iterations: Default number of random search iterations for
-      experiment.
-  """
-
-  default_experiments = ['volatility', 'electricity', 'traffic', 'favorita', 'Alpha158']
-
-  def __init__(self, experiment='volatility', root_folder=None):
-    """Creates configs based on default experiment chosen.
-
-    Args:
-      experiment: Name of experiment.
-      root_folder: Root folder to save all outputs of training.
-    """
-
-    if experiment not in self.default_experiments:
-      raise ValueError('Unrecognised experiment={}'.format(experiment))
-
-    # Defines all relevant paths
-    if root_folder is None:
-      root_folder = os.path.join(
-          os.path.dirname(os.path.realpath(__file__)), '..', 'outputs')
-      print('Using root folder {}'.format(root_folder))
-
-    self.root_folder = root_folder
-    self.experiment = experiment
-    self.data_folder = os.path.join(root_folder, 'data', experiment)
-    self.model_folder = os.path.join(root_folder, 'saved_models', experiment)
-    self.results_folder = os.path.join(root_folder, 'results', experiment)
-
-    # Creates folders if they don't exist
-    for relevant_directory in [
-        self.root_folder, self.data_folder, self.model_folder,
-        self.results_folder
-    ]:
-      if not os.path.exists(relevant_directory):
-        os.makedirs(relevant_directory)
-
-  @property
-  def data_csv_path(self):
-    csv_map = {
-        'volatility': 'formatted_omi_vol.csv',
-        'electricity': 'hourly_electricity.csv',
-        'traffic': 'hourly_data.csv',
-        'favorita': 'favorita_consolidated.csv',
-        'Alpha158': 'Alpha158.csv',
-    }
-
-    return os.path.join(self.data_folder, csv_map[self.experiment])
-
-  @property
-  def hyperparam_iterations(self):
-
-    return 240 if self.experiment == 'volatility' else 60
-
-  def make_data_formatter(self):
-    """Gets a data formatter object for experiment.
-
-    Returns:
-      Default DataFormatter per experiment.
-    """
-
-    data_formatter_class = {
-        'volatility': data_formatters.volatility.VolatilityFormatter,
-        'electricity': data_formatters.electricity.ElectricityFormatter,
-        'traffic': data_formatters.traffic.TrafficFormatter,
-        'favorita': data_formatters.favorita.FavoritaFormatter,
-        'Alpha158': data_formatters.qlib_Alpha158.Alpha158Formatter,
-    }
-
-    return data_formatter_class[self.experiment]()
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Default configs for TFT experiments.
+
+Contains the default output paths for data, serialised models and predictions
+for the main experiments used in the publication.
+"""
+
+import os
+
+import data_formatters.electricity
+import data_formatters.favorita
+import data_formatters.traffic
+import data_formatters.volatility
+import data_formatters.qlib_Alpha158
+
+
+class ExperimentConfig(object):
+    """Defines experiment configs and paths to outputs.
+
+    Attributes:
+      root_folder: Root folder to contain all experimental outputs.
+      experiment: Name of experiment to run.
+      data_folder: Folder to store data for experiment.
+      model_folder: Folder to store serialised models.
+      results_folder: Folder to store results.
+      data_csv_path: Path to primary data csv file used in experiment.
+      hyperparam_iterations: Default number of random search iterations for
+        experiment.
+    """
+
+    default_experiments = ["volatility", "electricity", "traffic", "favorita", "Alpha158"]
+
+    def __init__(self, experiment="volatility", root_folder=None):
+        """Creates configs based on default experiment chosen.
+
+        Args:
+          experiment: Name of experiment.
+          root_folder: Root folder to save all outputs of training.
+        """
+
+        if experiment not in self.default_experiments:
+            raise ValueError("Unrecognised experiment={}".format(experiment))
+
+        # Defines all relevant paths
+        if root_folder is None:
+            root_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "outputs")
+            print("Using root folder {}".format(root_folder))
+
+        self.root_folder = root_folder
+        self.experiment = experiment
+        self.data_folder = os.path.join(root_folder, "data", experiment)
+        self.model_folder = os.path.join(root_folder, "saved_models", experiment)
+        self.results_folder = os.path.join(root_folder, "results", experiment)
+
+        # Creates folders if they don't exist
+        for relevant_directory in [self.root_folder, self.data_folder, self.model_folder, self.results_folder]:
+            if not os.path.exists(relevant_directory):
+                os.makedirs(relevant_directory)
+
+    @property
+    def data_csv_path(self):
+        csv_map = {
+            "volatility": "formatted_omi_vol.csv",
+            "electricity": "hourly_electricity.csv",
+            "traffic": "hourly_data.csv",
+            "favorita": "favorita_consolidated.csv",
+            "Alpha158": "Alpha158.csv",
+        }
+
+        return os.path.join(self.data_folder, csv_map[self.experiment])
+
+    @property
+    def hyperparam_iterations(self):
+
+        return 240 if self.experiment == "volatility" else 60
+
+    def make_data_formatter(self):
+        """Gets a data formatter object for experiment.
+
+        Returns:
+          Default DataFormatter per experiment.
+        """
+
+        data_formatter_class = {
+            "volatility": data_formatters.volatility.VolatilityFormatter,
+            "electricity": data_formatters.electricity.ElectricityFormatter,
+            "traffic": data_formatters.traffic.TrafficFormatter,
+            "favorita": data_formatters.favorita.FavoritaFormatter,
+            "Alpha158": data_formatters.qlib_Alpha158.Alpha158Formatter,
+        }
+
+        return data_formatter_class[self.experiment]()
diff --git a/examples/benchmarks/TFT/libs/__init__.py b/examples/benchmarks/TFT/libs/__init__.py
index 9a1980462..87ec3284f 100644
--- a/examples/benchmarks/TFT/libs/__init__.py
+++ b/examples/benchmarks/TFT/libs/__init__.py
@@ -1,15 +1,14 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/benchmarks/TFT/libs/hyperparam_opt.py b/examples/benchmarks/TFT/libs/hyperparam_opt.py
index c9bc19e7c..750fdf2c1 100644
--- a/examples/benchmarks/TFT/libs/hyperparam_opt.py
+++ b/examples/benchmarks/TFT/libs/hyperparam_opt.py
@@ -1,438 +1,430 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Classes used for hyperparameter optimisation.
-
-Two main classes exist:
-1) HyperparamOptManager used for optimisation on a single machine/GPU.
-2) DistributedHyperparamOptManager for multiple GPUs on different machines.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import os
-import shutil
-import libs.utils as utils
-import numpy as np
-import pandas as pd
-
-Deque = collections.deque
-
-
-class HyperparamOptManager:
-  """Manages hyperparameter optimisation using random search for a single GPU.
-
-  Attributes:
-    param_ranges: Discrete hyperparameter range for random search.
-    results: Dataframe of validation results.
-    fixed_params: Fixed model parameters per experiment.
-    saved_params: Dataframe of parameters trained.
-    best_score: Minimum validation loss observed thus far.
-    optimal_name: Key to best configuration.
-    hyperparam_folder: Where to save optimisation outputs.
-  """
-
-  def __init__(self,
-               param_ranges,
-               fixed_params,
-               model_folder,
-               override_w_fixed_params=True):
-    """Instantiates model.
-
-    Args:
-      param_ranges: Discrete hyperparameter range for random search.
-      fixed_params: Fixed model parameters per experiment.
-      model_folder: Folder to store optimisation artifacts.
-      override_w_fixed_params: Whether to override serialsed fixed model
-        parameters with new supplied values.
-    """
-
-    self.param_ranges = param_ranges
-
-    self._max_tries = 1000
-    self.results = pd.DataFrame()
-    self.fixed_params = fixed_params
-    self.saved_params = pd.DataFrame()
-
-    self.best_score = np.Inf
-    self.optimal_name = ""
-
-    # Setup
-    # Create folder for saving if its not there
-    self.hyperparam_folder = model_folder
-    utils.create_folder_if_not_exist(self.hyperparam_folder)
-
-    self._override_w_fixed_params = override_w_fixed_params
-
-  def load_results(self):
-    """Loads results from previous hyperparameter optimisation.
-
-    Returns:
-      A boolean indicating if previous results can be loaded.
-    """
-    print("Loading results from", self.hyperparam_folder)
-
-    results_file = os.path.join(self.hyperparam_folder, "results.csv")
-    params_file = os.path.join(self.hyperparam_folder, "params.csv")
-
-    if os.path.exists(results_file) and os.path.exists(params_file):
-
-      self.results = pd.read_csv(results_file, index_col=0)
-      self.saved_params = pd.read_csv(params_file, index_col=0)
-
-      if not self.results.empty:
-        self.results.at["loss"] = self.results.loc["loss"].apply(float)
-        self.best_score = self.results.loc["loss"].min()
-
-        is_optimal = self.results.loc["loss"] == self.best_score
-        self.optimal_name = self.results.T[is_optimal].index[0]
-
-        return True
-
-    return False
-
-  def _get_params_from_name(self, name):
-    """Returns previously saved parameters given a key."""
-    params = self.saved_params
-
-    selected_params = dict(params[name])
-
-    if self._override_w_fixed_params:
-      for k in self.fixed_params:
-        selected_params[k] = self.fixed_params[k]
-
-    return selected_params
-
-  def get_best_params(self):
-    """Returns the optimal hyperparameters thus far."""
-
-    optimal_name = self.optimal_name
-
-    return self._get_params_from_name(optimal_name)
-
-  def clear(self):
-    """Clears all previous results and saved parameters."""
-    shutil.rmtree(self.hyperparam_folder)
-    os.makedirs(self.hyperparam_folder)
-    self.results = pd.DataFrame()
-    self.saved_params = pd.DataFrame()
-
-  def _check_params(self, params):
-    """Checks that parameter map is properly defined."""
-
-    valid_fields = list(self.param_ranges.keys()) + list(
-        self.fixed_params.keys())
-    invalid_fields = [k for k in params if k not in valid_fields]
-    missing_fields = [k for k in valid_fields if k not in params]
-
-    if invalid_fields:
-      raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(
-          invalid_fields, valid_fields))
-    if missing_fields:
-      raise ValueError("Missing Fields Found {} - Valid ones are {}".format(
-          missing_fields, valid_fields))
-
-  def _get_name(self, params):
-    """Returns a unique key for the supplied set of params."""
-
-    self._check_params(params)
-
-    fields = list(params.keys())
-    fields.sort()
-
-    return "_".join([str(params[k]) for k in fields])
-
-  def get_next_parameters(self, ranges_to_skip=None):
-    """Returns the next set of parameters to optimise.
-
-    Args:
-      ranges_to_skip: Explicitly defines a set of keys to skip.
-    """
-    if ranges_to_skip is None:
-      ranges_to_skip = set(self.results.index)
-
-    if not isinstance(self.param_ranges, dict):
-      raise ValueError("Only works for random search!")
-
-    param_range_keys = list(self.param_ranges.keys())
-    param_range_keys.sort()
-
-    def _get_next():
-      """Returns next hyperparameter set per try."""
-
-      parameters = {
-          k: np.random.choice(self.param_ranges[k]) for k in param_range_keys
-      }
-
-      # Adds fixed params
-      for k in self.fixed_params:
-        parameters[k] = self.fixed_params[k]
-
-      return parameters
-
-    for _ in range(self._max_tries):
-
-      parameters = _get_next()
-      name = self._get_name(parameters)
-
-      if name not in ranges_to_skip:
-        return parameters
-
-    raise ValueError("Exceeded max number of hyperparameter searches!!")
-
-  def update_score(self, parameters, loss, model, info=""):
-    """Updates the results from last optimisation run.
-
-    Args:
-      parameters: Hyperparameters used in optimisation.
-      loss: Validation loss obtained.
-      model: Model to serialised if required.
-      info: Any ancillary information to tag on to results.
-
-    Returns:
-      Boolean flag indicating if the model is the best seen so far.
-    """
-
-    if np.isnan(loss):
-      loss = np.Inf
-
-    if not os.path.isdir(self.hyperparam_folder):
-      os.makedirs(self.hyperparam_folder)
-
-    name = self._get_name(parameters)
-
-    is_optimal = self.results.empty or loss < self.best_score
-
-    # save the first model
-    if is_optimal:
-      # Try saving first, before updating info
-      if model is not None:
-        print("Optimal model found, updating")
-        model.save(self.hyperparam_folder)
-      self.best_score = loss
-      self.optimal_name = name
-
-    self.results[name] = pd.Series({"loss": loss, "info": info})
-    self.saved_params[name] = pd.Series(parameters)
-
-    self.results.to_csv(os.path.join(self.hyperparam_folder, "results.csv"))
-    self.saved_params.to_csv(os.path.join(self.hyperparam_folder, "params.csv"))
-
-    return is_optimal
-
-
-class DistributedHyperparamOptManager(HyperparamOptManager):
-  """Manages distributed hyperparameter optimisation across many gpus."""
-
-  def __init__(self,
-               param_ranges,
-               fixed_params,
-               root_model_folder,
-               worker_number,
-               search_iterations=1000,
-               num_iterations_per_worker=5,
-               clear_serialised_params=False):
-    """Instantiates optimisation manager.
-
-    This hyperparameter optimisation pre-generates #search_iterations
-    hyperparameter combinations and serialises them
-    at the start. At runtime, each worker goes through their own set of
-    parameter ranges. The pregeneration
-    allows for multiple workers to run in parallel on different machines without
-    resulting in parameter overlaps.
-
-    Args:
-      param_ranges: Discrete hyperparameter range for random search.
-      fixed_params: Fixed model parameters per experiment.
-      root_model_folder: Folder to store optimisation artifacts.
-      worker_number: Worker index definining which set of hyperparameters to
-        test.
-      search_iterations: Maximum numer of random search iterations.
-      num_iterations_per_worker: How many iterations are handled per worker.
-      clear_serialised_params: Whether to regenerate hyperparameter
-        combinations.
-    """
-
-    max_workers = int(np.ceil(search_iterations / num_iterations_per_worker))
-
-    # Sanity checks
-    if worker_number > max_workers:
-      raise ValueError(
-          "Worker number ({}) cannot be larger than the total number of workers!"
-          .format(max_workers))
-    if worker_number > search_iterations:
-      raise ValueError(
-          "Worker number ({}) cannot be larger than the max search iterations ({})!"
-          .format(worker_number, search_iterations))
-
-    print("*** Creating hyperparameter manager for worker {} ***".format(
-        worker_number))
-
-    hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
-    super().__init__(
-        param_ranges,
-        fixed_params,
-        hyperparam_folder,
-        override_w_fixed_params=True)
-
-    serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
-    if clear_serialised_params:
-      print("Regenerating hyperparameter list")
-      if os.path.exists(serialised_ranges_folder):
-        shutil.rmtree(serialised_ranges_folder)
-
-    utils.create_folder_if_not_exist(serialised_ranges_folder)
-
-    self.serialised_ranges_path = os.path.join(
-        serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
-    self.hyperparam_folder = hyperparam_folder  # override
-    self.worker_num = worker_number
-    self.total_search_iterations = search_iterations
-    self.num_iterations_per_worker = num_iterations_per_worker
-    self.global_hyperparam_df = self.load_serialised_hyperparam_df()
-    self.worker_search_queue = self._get_worker_search_queue()
-
-  @property
-  def optimisation_completed(self):
-    return False if self.worker_search_queue else True
-
-  def get_next_parameters(self):
-    """Returns next dictionary of hyperparameters to optimise."""
-    param_name = self.worker_search_queue.pop()
-
-    params = self.global_hyperparam_df.loc[param_name, :].to_dict()
-
-    # Always override!
-    for k in self.fixed_params:
-      print("Overriding saved {}: {}".format(k, self.fixed_params[k]))
-
-      params[k] = self.fixed_params[k]
-
-    return params
-
-  def load_serialised_hyperparam_df(self):
-    """Loads serialsed hyperparameter ranges from file.
-
-    Returns:
-      DataFrame containing hyperparameter combinations.
-    """
-    print("Loading params for {} search iterations form {}".format(
-        self.total_search_iterations, self.serialised_ranges_path))
-
-    if os.path.exists(self.serialised_ranges_folder):
-      df = pd.read_csv(self.serialised_ranges_path, index_col=0)
-    else:
-      print("Unable to load - regenerating serach ranges instead")
-      df = self.update_serialised_hyperparam_df()
-
-    return df
-
-  def update_serialised_hyperparam_df(self):
-    """Regenerates hyperparameter combinations and saves to file.
-
-    Returns:
-      DataFrame containing hyperparameter combinations.
-    """
-    search_df = self._generate_full_hyperparam_df()
-
-    print("Serialising params for {} search iterations to {}".format(
-        self.total_search_iterations, self.serialised_ranges_path))
-
-    search_df.to_csv(self.serialised_ranges_path)
-
-    return search_df
-
-  def _generate_full_hyperparam_df(self):
-    """Generates actual hyperparameter combinations.
-
-    Returns:
-      DataFrame containing hyperparameter combinations.
-    """
-
-    np.random.seed(131)  # for reproducibility of hyperparam list
-
-    name_list = []
-    param_list = []
-    for _ in range(self.total_search_iterations):
-      params = super().get_next_parameters(name_list)
-
-      name = self._get_name(params)
-
-      name_list.append(name)
-      param_list.append(params)
-
-    full_search_df = pd.DataFrame(param_list, index=name_list)
-
-    return full_search_df
-
-  def clear(self):  # reset when cleared
-    """Clears results for hyperparameter manager and resets."""
-    super().clear()
-    self.worker_search_queue = self._get_worker_search_queue()
-
-  def load_results(self):
-    """Load results from file and queue parameter combinations to try.
-
-    Returns:
-      Boolean indicating if results were successfully loaded.
-    """
-    success = super().load_results()
-
-    if success:
-      self.worker_search_queue = self._get_worker_search_queue()
-
-    return success
-
-  def _get_worker_search_queue(self):
-    """Generates the queue of param combinations for current worker.
-
-    Returns:
-      Queue of hyperparameter combinations outstanding.
-    """
-    global_df = self.assign_worker_numbers(self.global_hyperparam_df)
-    worker_df = global_df[global_df["worker"] == self.worker_num]
-
-    left_overs = [s for s in worker_df.index if s not in self.results.columns]
-
-    return Deque(left_overs)
-
-  def assign_worker_numbers(self, df):
-    """Updates parameter combinations with the index of the worker used.
-
-    Args:
-      df: DataFrame of parameter combinations.
-
-    Returns:
-      Updated DataFrame with worker number.
-    """
-    output = df.copy()
-
-    n = self.total_search_iterations
-    batch_size = self.num_iterations_per_worker
-
-    max_worker_num = int(np.ceil(n / batch_size))
-
-    worker_idx = np.concatenate([
-        np.tile(i + 1, self.num_iterations_per_worker)
-        for i in range(max_worker_num)
-    ])
-
-    output["worker"] = worker_idx[:len(output)]
-
-    return output
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Classes used for hyperparameter optimisation.
+
+Two main classes exist:
+1) HyperparamOptManager used for optimisation on a single machine/GPU.
+2) DistributedHyperparamOptManager for multiple GPUs on different machines.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import shutil
+import libs.utils as utils
+import numpy as np
+import pandas as pd
+
+Deque = collections.deque
+
+
+class HyperparamOptManager:
+    """Manages hyperparameter optimisation using random search for a single GPU.
+
+    Attributes:
+      param_ranges: Discrete hyperparameter range for random search.
+      results: Dataframe of validation results.
+      fixed_params: Fixed model parameters per experiment.
+      saved_params: Dataframe of parameters trained.
+      best_score: Minimum validation loss observed thus far.
+      optimal_name: Key to best configuration.
+      hyperparam_folder: Where to save optimisation outputs.
+    """
+
+    def __init__(self, param_ranges, fixed_params, model_folder, override_w_fixed_params=True):
+        """Instantiates model.
+
+        Args:
+          param_ranges: Discrete hyperparameter range for random search.
+          fixed_params: Fixed model parameters per experiment.
+          model_folder: Folder to store optimisation artifacts.
+          override_w_fixed_params: Whether to override serialsed fixed model
+            parameters with new supplied values.
+        """
+
+        self.param_ranges = param_ranges
+
+        self._max_tries = 1000
+        self.results = pd.DataFrame()
+        self.fixed_params = fixed_params
+        self.saved_params = pd.DataFrame()
+
+        self.best_score = np.Inf
+        self.optimal_name = ""
+
+        # Setup
+        # Create folder for saving if its not there
+        self.hyperparam_folder = model_folder
+        utils.create_folder_if_not_exist(self.hyperparam_folder)
+
+        self._override_w_fixed_params = override_w_fixed_params
+
+    def load_results(self):
+        """Loads results from previous hyperparameter optimisation.
+
+        Returns:
+          A boolean indicating if previous results can be loaded.
+        """
+        print("Loading results from", self.hyperparam_folder)
+
+        results_file = os.path.join(self.hyperparam_folder, "results.csv")
+        params_file = os.path.join(self.hyperparam_folder, "params.csv")
+
+        if os.path.exists(results_file) and os.path.exists(params_file):
+
+            self.results = pd.read_csv(results_file, index_col=0)
+            self.saved_params = pd.read_csv(params_file, index_col=0)
+
+            if not self.results.empty:
+                self.results.at["loss"] = self.results.loc["loss"].apply(float)
+                self.best_score = self.results.loc["loss"].min()
+
+                is_optimal = self.results.loc["loss"] == self.best_score
+                self.optimal_name = self.results.T[is_optimal].index[0]
+
+                return True
+
+        return False
+
+    def _get_params_from_name(self, name):
+        """Returns previously saved parameters given a key."""
+        params = self.saved_params
+
+        selected_params = dict(params[name])
+
+        if self._override_w_fixed_params:
+            for k in self.fixed_params:
+                selected_params[k] = self.fixed_params[k]
+
+        return selected_params
+
+    def get_best_params(self):
+        """Returns the optimal hyperparameters thus far."""
+
+        optimal_name = self.optimal_name
+
+        return self._get_params_from_name(optimal_name)
+
+    def clear(self):
+        """Clears all previous results and saved parameters."""
+        shutil.rmtree(self.hyperparam_folder)
+        os.makedirs(self.hyperparam_folder)
+        self.results = pd.DataFrame()
+        self.saved_params = pd.DataFrame()
+
+    def _check_params(self, params):
+        """Checks that parameter map is properly defined."""
+
+        valid_fields = list(self.param_ranges.keys()) + list(self.fixed_params.keys())
+        invalid_fields = [k for k in params if k not in valid_fields]
+        missing_fields = [k for k in valid_fields if k not in params]
+
+        if invalid_fields:
+            raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(invalid_fields, valid_fields))
+        if missing_fields:
+            raise ValueError("Missing Fields Found {} - Valid ones are {}".format(missing_fields, valid_fields))
+
+    def _get_name(self, params):
+        """Returns a unique key for the supplied set of params."""
+
+        self._check_params(params)
+
+        fields = list(params.keys())
+        fields.sort()
+
+        return "_".join([str(params[k]) for k in fields])
+
+    def get_next_parameters(self, ranges_to_skip=None):
+        """Returns the next set of parameters to optimise.
+
+        Args:
+          ranges_to_skip: Explicitly defines a set of keys to skip.
+        """
+        if ranges_to_skip is None:
+            ranges_to_skip = set(self.results.index)
+
+        if not isinstance(self.param_ranges, dict):
+            raise ValueError("Only works for random search!")
+
+        param_range_keys = list(self.param_ranges.keys())
+        param_range_keys.sort()
+
+        def _get_next():
+            """Returns next hyperparameter set per try."""
+
+            parameters = {k: np.random.choice(self.param_ranges[k]) for k in param_range_keys}
+
+            # Adds fixed params
+            for k in self.fixed_params:
+                parameters[k] = self.fixed_params[k]
+
+            return parameters
+
+        for _ in range(self._max_tries):
+
+            parameters = _get_next()
+            name = self._get_name(parameters)
+
+            if name not in ranges_to_skip:
+                return parameters
+
+        raise ValueError("Exceeded max number of hyperparameter searches!!")
+
+    def update_score(self, parameters, loss, model, info=""):
+        """Updates the results from last optimisation run.
+
+        Args:
+          parameters: Hyperparameters used in optimisation.
+          loss: Validation loss obtained.
+          model: Model to serialised if required.
+          info: Any ancillary information to tag on to results.
+
+        Returns:
+          Boolean flag indicating if the model is the best seen so far.
+        """
+
+        if np.isnan(loss):
+            loss = np.Inf
+
+        if not os.path.isdir(self.hyperparam_folder):
+            os.makedirs(self.hyperparam_folder)
+
+        name = self._get_name(parameters)
+
+        is_optimal = self.results.empty or loss < self.best_score
+
+        # save the first model
+        if is_optimal:
+            # Try saving first, before updating info
+            if model is not None:
+                print("Optimal model found, updating")
+                model.save(self.hyperparam_folder)
+            self.best_score = loss
+            self.optimal_name = name
+
+        self.results[name] = pd.Series({"loss": loss, "info": info})
+        self.saved_params[name] = pd.Series(parameters)
+
+        self.results.to_csv(os.path.join(self.hyperparam_folder, "results.csv"))
+        self.saved_params.to_csv(os.path.join(self.hyperparam_folder, "params.csv"))
+
+        return is_optimal
+
+
+class DistributedHyperparamOptManager(HyperparamOptManager):
+    """Manages distributed hyperparameter optimisation across many gpus."""
+
+    def __init__(
+        self,
+        param_ranges,
+        fixed_params,
+        root_model_folder,
+        worker_number,
+        search_iterations=1000,
+        num_iterations_per_worker=5,
+        clear_serialised_params=False,
+    ):
+        """Instantiates optimisation manager.
+
+        This hyperparameter optimisation pre-generates #search_iterations
+        hyperparameter combinations and serialises them
+        at the start. At runtime, each worker goes through their own set of
+        parameter ranges. The pregeneration
+        allows for multiple workers to run in parallel on different machines without
+        resulting in parameter overlaps.
+
+        Args:
+          param_ranges: Discrete hyperparameter range for random search.
+          fixed_params: Fixed model parameters per experiment.
+          root_model_folder: Folder to store optimisation artifacts.
+          worker_number: Worker index definining which set of hyperparameters to
+            test.
+          search_iterations: Maximum numer of random search iterations.
+          num_iterations_per_worker: How many iterations are handled per worker.
+          clear_serialised_params: Whether to regenerate hyperparameter
+            combinations.
+        """
+
+        max_workers = int(np.ceil(search_iterations / num_iterations_per_worker))
+
+        # Sanity checks
+        if worker_number > max_workers:
+            raise ValueError(
+                "Worker number ({}) cannot be larger than the total number of workers!".format(max_workers)
+            )
+        if worker_number > search_iterations:
+            raise ValueError(
+                "Worker number ({}) cannot be larger than the max search iterations ({})!".format(
+                    worker_number, search_iterations
+                )
+            )
+
+        print("*** Creating hyperparameter manager for worker {} ***".format(worker_number))
+
+        hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
+        super().__init__(param_ranges, fixed_params, hyperparam_folder, override_w_fixed_params=True)
+
+        serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
+        if clear_serialised_params:
+            print("Regenerating hyperparameter list")
+            if os.path.exists(serialised_ranges_folder):
+                shutil.rmtree(serialised_ranges_folder)
+
+        utils.create_folder_if_not_exist(serialised_ranges_folder)
+
+        self.serialised_ranges_path = os.path.join(serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
+        self.hyperparam_folder = hyperparam_folder  # override
+        self.worker_num = worker_number
+        self.total_search_iterations = search_iterations
+        self.num_iterations_per_worker = num_iterations_per_worker
+        self.global_hyperparam_df = self.load_serialised_hyperparam_df()
+        self.worker_search_queue = self._get_worker_search_queue()
+
+    @property
+    def optimisation_completed(self):
+        return False if self.worker_search_queue else True
+
+    def get_next_parameters(self):
+        """Returns next dictionary of hyperparameters to optimise."""
+        param_name = self.worker_search_queue.pop()
+
+        params = self.global_hyperparam_df.loc[param_name, :].to_dict()
+
+        # Always override!
+        for k in self.fixed_params:
+            print("Overriding saved {}: {}".format(k, self.fixed_params[k]))
+
+            params[k] = self.fixed_params[k]
+
+        return params
+
+    def load_serialised_hyperparam_df(self):
+        """Loads serialsed hyperparameter ranges from file.
+
+        Returns:
+          DataFrame containing hyperparameter combinations.
+        """
+        print(
+            "Loading params for {} search iterations form {}".format(
+                self.total_search_iterations, self.serialised_ranges_path
+            )
+        )
+
+        if os.path.exists(self.serialised_ranges_folder):
+            df = pd.read_csv(self.serialised_ranges_path, index_col=0)
+        else:
+            print("Unable to load - regenerating serach ranges instead")
+            df = self.update_serialised_hyperparam_df()
+
+        return df
+
+    def update_serialised_hyperparam_df(self):
+        """Regenerates hyperparameter combinations and saves to file.
+
+        Returns:
+          DataFrame containing hyperparameter combinations.
+        """
+        search_df = self._generate_full_hyperparam_df()
+
+        print(
+            "Serialising params for {} search iterations to {}".format(
+                self.total_search_iterations, self.serialised_ranges_path
+            )
+        )
+
+        search_df.to_csv(self.serialised_ranges_path)
+
+        return search_df
+
+    def _generate_full_hyperparam_df(self):
+        """Generates actual hyperparameter combinations.
+
+        Returns:
+          DataFrame containing hyperparameter combinations.
+        """
+
+        np.random.seed(131)  # for reproducibility of hyperparam list
+
+        name_list = []
+        param_list = []
+        for _ in range(self.total_search_iterations):
+            params = super().get_next_parameters(name_list)
+
+            name = self._get_name(params)
+
+            name_list.append(name)
+            param_list.append(params)
+
+        full_search_df = pd.DataFrame(param_list, index=name_list)
+
+        return full_search_df
+
+    def clear(self):  # reset when cleared
+        """Clears results for hyperparameter manager and resets."""
+        super().clear()
+        self.worker_search_queue = self._get_worker_search_queue()
+
+    def load_results(self):
+        """Load results from file and queue parameter combinations to try.
+
+        Returns:
+          Boolean indicating if results were successfully loaded.
+        """
+        success = super().load_results()
+
+        if success:
+            self.worker_search_queue = self._get_worker_search_queue()
+
+        return success
+
+    def _get_worker_search_queue(self):
+        """Generates the queue of param combinations for current worker.
+
+        Returns:
+          Queue of hyperparameter combinations outstanding.
+        """
+        global_df = self.assign_worker_numbers(self.global_hyperparam_df)
+        worker_df = global_df[global_df["worker"] == self.worker_num]
+
+        left_overs = [s for s in worker_df.index if s not in self.results.columns]
+
+        return Deque(left_overs)
+
+    def assign_worker_numbers(self, df):
+        """Updates parameter combinations with the index of the worker used.
+
+        Args:
+          df: DataFrame of parameter combinations.
+
+        Returns:
+          Updated DataFrame with worker number.
+        """
+        output = df.copy()
+
+        n = self.total_search_iterations
+        batch_size = self.num_iterations_per_worker
+
+        max_worker_num = int(np.ceil(n / batch_size))
+
+        worker_idx = np.concatenate([np.tile(i + 1, self.num_iterations_per_worker) for i in range(max_worker_num)])
+
+        output["worker"] = worker_idx[: len(output)]
+
+        return output
diff --git a/examples/benchmarks/TFT/libs/tft_model.py b/examples/benchmarks/TFT/libs/tft_model.py
index 2a41f4566..658bae60f 100644
--- a/examples/benchmarks/TFT/libs/tft_model.py
+++ b/examples/benchmarks/TFT/libs/tft_model.py
@@ -1,1391 +1,1280 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Temporal Fusion Transformer Model.
-
-Contains the full TFT architecture and associated components. Defines functions
-for training, evaluation and prediction using simple Pandas Dataframe inputs.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gc
-import json
-import os
-import shutil
-
-import data_formatters.base
-import libs.utils as utils
-import numpy as np
-import pandas as pd
-import tensorflow as tf
-
-# Layer definitions.
-concat = tf.keras.backend.concatenate
-stack = tf.keras.backend.stack
-K = tf.keras.backend
-Add = tf.keras.layers.Add
-LayerNorm = tf.keras.layers.LayerNormalization
-Dense = tf.keras.layers.Dense
-Multiply = tf.keras.layers.Multiply
-Dropout = tf.keras.layers.Dropout
-Activation = tf.keras.layers.Activation
-Lambda = tf.keras.layers.Lambda
-
-# Default input types.
-InputTypes = data_formatters.base.InputTypes
-
-
-# Layer utility functions.
-def linear_layer(size,
-                 activation=None,
-                 use_time_distributed=False,
-                 use_bias=True):
-  """Returns simple Keras linear layer.
-
-  Args:
-    size: Output size
-    activation: Activation function to apply if required
-    use_time_distributed: Whether to apply layer across time
-    use_bias: Whether bias should be included in layer
-  """
-  linear = tf.keras.layers.Dense(size, activation=activation, use_bias=use_bias)
-  if use_time_distributed:
-    linear = tf.keras.layers.TimeDistributed(linear)
-  return linear
-
-
-def apply_mlp(inputs,
-              hidden_size,
-              output_size,
-              output_activation=None,
-              hidden_activation='tanh',
-              use_time_distributed=False):
-  """Applies simple feed-forward network to an input.
-
-  Args:
-    inputs: MLP inputs
-    hidden_size: Hidden state size
-    output_size: Output size of MLP
-    output_activation: Activation function to apply on output
-    hidden_activation: Activation function to apply on input
-    use_time_distributed: Whether to apply across time
-
-  Returns:
-    Tensor for MLP outputs.
-  """
-  if use_time_distributed:
-    hidden = tf.keras.layers.TimeDistributed(
-        tf.keras.layers.Dense(hidden_size, activation=hidden_activation))(
-            inputs)
-    return tf.keras.layers.TimeDistributed(
-        tf.keras.layers.Dense(output_size, activation=output_activation))(
-            hidden)
-  else:
-    hidden = tf.keras.layers.Dense(
-        hidden_size, activation=hidden_activation)(
-            inputs)
-    return tf.keras.layers.Dense(
-        output_size, activation=output_activation)(
-            hidden)
-
-
-def apply_gating_layer(x,
-                       hidden_layer_size,
-                       dropout_rate=None,
-                       use_time_distributed=True,
-                       activation=None):
-  """Applies a Gated Linear Unit (GLU) to an input.
-
-  Args:
-    x: Input to gating layer
-    hidden_layer_size: Dimension of GLU
-    dropout_rate: Dropout rate to apply if any
-    use_time_distributed: Whether to apply across time
-    activation: Activation function to apply to the linear feature transform if
-      necessary
-
-  Returns:
-    Tuple of tensors for: (GLU output, gate)
-  """
-
-  if dropout_rate is not None:
-    x = tf.keras.layers.Dropout(dropout_rate)(x)
-
-  if use_time_distributed:
-    activation_layer = tf.keras.layers.TimeDistributed(
-        tf.keras.layers.Dense(hidden_layer_size, activation=activation))(
-            x)
-    gated_layer = tf.keras.layers.TimeDistributed(
-        tf.keras.layers.Dense(hidden_layer_size, activation='sigmoid'))(
-            x)
-  else:
-    activation_layer = tf.keras.layers.Dense(
-        hidden_layer_size, activation=activation)(
-            x)
-    gated_layer = tf.keras.layers.Dense(
-        hidden_layer_size, activation='sigmoid')(
-            x)
-
-  return tf.keras.layers.Multiply()([activation_layer,
-                                     gated_layer]), gated_layer
-
-
-def add_and_norm(x_list):
-  """Applies skip connection followed by layer normalisation.
-
-  Args:
-    x_list: List of inputs to sum for skip connection
-
-  Returns:
-    Tensor output from layer.
-  """
-  tmp = Add()(x_list)
-  tmp = LayerNorm()(tmp)
-  return tmp
-
-
-def gated_residual_network(x,
-                           hidden_layer_size,
-                           output_size=None,
-                           dropout_rate=None,
-                           use_time_distributed=True,
-                           additional_context=None,
-                           return_gate=False):
-  """Applies the gated residual network (GRN) as defined in paper.
-
-  Args:
-    x: Network inputs
-    hidden_layer_size: Internal state size
-    output_size: Size of output layer
-    dropout_rate: Dropout rate if dropout is applied
-    use_time_distributed: Whether to apply network across time dimension
-    additional_context: Additional context vector to use if relevant
-    return_gate: Whether to return GLU gate for diagnostic purposes
-
-  Returns:
-    Tuple of tensors for: (GRN output, GLU gate)
-  """
-
-  # Setup skip connection
-  if output_size is None:
-    output_size = hidden_layer_size
-    skip = x
-  else:
-    linear = Dense(output_size)
-    if use_time_distributed:
-      linear = tf.keras.layers.TimeDistributed(linear)
-    skip = linear(x)
-
-  # Apply feedforward network
-  hidden = linear_layer(
-      hidden_layer_size,
-      activation=None,
-      use_time_distributed=use_time_distributed)(
-          x)
-  if additional_context is not None:
-    hidden = hidden + linear_layer(
-        hidden_layer_size,
-        activation=None,
-        use_time_distributed=use_time_distributed,
-        use_bias=False)(
-            additional_context)
-  hidden = tf.keras.layers.Activation('elu')(hidden)
-  hidden = linear_layer(
-      hidden_layer_size,
-      activation=None,
-      use_time_distributed=use_time_distributed)(
-          hidden)
-
-  gating_layer, gate = apply_gating_layer(
-      hidden,
-      output_size,
-      dropout_rate=dropout_rate,
-      use_time_distributed=use_time_distributed,
-      activation=None)
-
-  if return_gate:
-    return add_and_norm([skip, gating_layer]), gate
-  else:
-    return add_and_norm([skip, gating_layer])
-
-
-# Attention Components.
-def get_decoder_mask(self_attn_inputs):
-  """Returns causal mask to apply for self-attention layer.
-
-  Args:
-    self_attn_inputs: Inputs to self attention layer to determine mask shape
-  """
-  len_s = tf.shape(self_attn_inputs)[1]
-  bs = tf.shape(self_attn_inputs)[:1]
-  mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
-  return mask
-
-
-class ScaledDotProductAttention():
-  """Defines scaled dot product attention layer.
-
-  Attributes:
-    dropout: Dropout rate to use
-    activation: Normalisation function for scaled dot product attention (e.g.
-      softmax by default)
-  """
-
-  def __init__(self, attn_dropout=0.0):
-    self.dropout = Dropout(attn_dropout)
-    self.activation = Activation('softmax')
-
-  def __call__(self, q, k, v, mask):
-    """Applies scaled dot product attention.
-
-    Args:
-      q: Queries
-      k: Keys
-      v: Values
-      mask: Masking if required -- sets softmax to very large value
-
-    Returns:
-      Tuple of (layer outputs, attention weights)
-    """
-    temper = tf.sqrt(tf.cast(tf.shape(k)[-1], dtype='float32'))
-    attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / temper)(
-        [q, k])  # shape=(batch, q, k)
-    if mask is not None:
-      mmask = Lambda(lambda x: (-1e+9) * (1. - K.cast(x, 'float32')))(
-          mask)  # setting to infinity
-      attn = Add()([attn, mmask])
-    attn = self.activation(attn)
-    attn = self.dropout(attn)
-    output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
-    return output, attn
-
-
-class InterpretableMultiHeadAttention():
-  """Defines interpretable multi-head attention layer.
-
-  Attributes:
-    n_head: Number of heads
-    d_k: Key/query dimensionality per head
-    d_v: Value dimensionality
-    dropout: Dropout rate to apply
-    qs_layers: List of queries across heads
-    ks_layers: List of keys across heads
-    vs_layers: List of values across heads
-    attention: Scaled dot product attention layer
-    w_o: Output weight matrix to project internal state to the original TFT
-      state size
-  """
-
-  def __init__(self, n_head, d_model, dropout):
-    """Initialises layer.
-
-    Args:
-      n_head: Number of heads
-      d_model: TFT state dimensionality
-      dropout: Dropout discard rate
-    """
-
-    self.n_head = n_head
-    self.d_k = self.d_v = d_k = d_v = d_model // n_head
-    self.dropout = dropout
-
-    self.qs_layers = []
-    self.ks_layers = []
-    self.vs_layers = []
-
-    # Use same value layer to facilitate interp
-    vs_layer = Dense(d_v, use_bias=False)
-
-    for _ in range(n_head):
-      self.qs_layers.append(Dense(d_k, use_bias=False))
-      self.ks_layers.append(Dense(d_k, use_bias=False))
-      self.vs_layers.append(vs_layer)  # use same vs_layer
-
-    self.attention = ScaledDotProductAttention()
-    self.w_o = Dense(d_model, use_bias=False)
-
-  def __call__(self, q, k, v, mask=None):
-    """Applies interpretable multihead attention.
-
-    Using T to denote the number of time steps fed into the transformer.
-
-    Args:
-      q: Query tensor of shape=(?, T, d_model)
-      k: Key of shape=(?, T, d_model)
-      v: Values of shape=(?, T, d_model)
-      mask: Masking if required with shape=(?, T, T)
-
-    Returns:
-      Tuple of (layer outputs, attention weights)
-    """
-    n_head = self.n_head
-
-    heads = []
-    attns = []
-    for i in range(n_head):
-      qs = self.qs_layers[i](q)
-      ks = self.ks_layers[i](k)
-      vs = self.vs_layers[i](v)
-      head, attn = self.attention(qs, ks, vs, mask)
-
-      head_dropout = Dropout(self.dropout)(head)
-      heads.append(head_dropout)
-      attns.append(attn)
-    head = K.stack(heads) if n_head > 1 else heads[0]
-    attn = K.stack(attns)
-
-    outputs = K.mean(head, axis=0) if n_head > 1 else head
-    outputs = self.w_o(outputs)
-    outputs = Dropout(self.dropout)(outputs)  # output dropout
-
-    return outputs, attn
-
-
-class TFTDataCache(object):
-  """Caches data for the TFT."""
-
-  _data_cache = {}
-
-  @classmethod
-  def update(cls, data, key):
-    """Updates cached data.
-
-    Args:
-      data: Source to update
-      key: Key to dictionary location
-    """
-    cls._data_cache[key] = data
-
-  @classmethod
-  def get(cls, key):
-    """Returns data stored at key location."""
-    return cls._data_cache[key].copy()
-
-  @classmethod
-  def contains(cls, key):
-    """Retuns boolean indicating whether key is present in cache."""
-
-    return key in cls._data_cache
-
-
-# TFT model definitions.
-class TemporalFusionTransformer(object):
-  """Defines Temporal Fusion Transformer.
-
-  Attributes:
-    name: Name of model
-    time_steps: Total number of input time steps per forecast date (i.e. Width
-      of Temporal fusion decoder N)
-    input_size: Total number of inputs
-    output_size: Total number of outputs
-    category_counts: Number of categories per categorical variable
-    n_multiprocessing_workers: Number of workers to use for parallel
-      computations
-    column_definition: List of tuples of (string, DataType, InputType) that
-      define each column
-    quantiles: Quantiles to forecast for TFT
-    use_cudnn: Whether to use Keras CuDNNLSTM or standard LSTM layers
-    hidden_layer_size: Internal state size of TFT
-    dropout_rate: Dropout discard rate
-    max_gradient_norm: Maximum norm for gradient clipping
-    learning_rate: Initial learning rate of ADAM optimizer
-    minibatch_size: Size of minibatches for training
-    num_epochs: Maximum number of epochs for training
-    early_stopping_patience: Maximum number of iterations of non-improvement
-      before early stopping kicks in
-    num_encoder_steps: Size of LSTM encoder -- i.e. number of past time steps
-      before forecast date to use
-    num_stacks: Number of self-attention layers to apply (default is 1 for basic
-      TFT)
-    num_heads: Number of heads for interpretable mulit-head attention
-    model: Keras model for TFT
-  """
-
-  def __init__(self, raw_params, use_cudnn=False):
-    """Builds TFT from parameters.
-
-    Args:
-      raw_params: Parameters to define TFT
-      use_cudnn: Whether to use CUDNN GPU optimised LSTM
-    """
-
-    self.name = self.__class__.__name__
-
-    params = dict(raw_params)  # copy locally
-
-    # Data parameters
-    self.time_steps = int(params['total_time_steps'])
-    self.input_size = int(params['input_size'])
-    self.output_size = int(params['output_size'])
-    self.category_counts = json.loads(str(params['category_counts']))
-    self.n_multiprocessing_workers = int(params['multiprocessing_workers'])
-
-    # Relevant indices for TFT
-    self._input_obs_loc = json.loads(str(params['input_obs_loc']))
-    self._static_input_loc = json.loads(str(params['static_input_loc']))
-    self._known_regular_input_idx = json.loads(
-        str(params['known_regular_inputs']))
-    self._known_categorical_input_idx = json.loads(
-        str(params['known_categorical_inputs']))
-
-    self.column_definition = params['column_definition']
-
-    # Network params
-    self.quantiles = [0.1, 0.5, 0.9]
-    self.use_cudnn = use_cudnn  # Whether to use GPU optimised LSTM
-    self.hidden_layer_size = int(params['hidden_layer_size'])
-    self.dropout_rate = float(params['dropout_rate'])
-    self.max_gradient_norm = float(params['max_gradient_norm'])
-    self.learning_rate = float(params['learning_rate'])
-    self.minibatch_size = int(params['minibatch_size'])
-    self.num_epochs = int(params['num_epochs'])
-    self.early_stopping_patience = int(params['early_stopping_patience'])
-
-    self.num_encoder_steps = int(params['num_encoder_steps'])
-    self.num_stacks = int(params['stack_size'])
-    self.num_heads = int(params['num_heads'])
-
-    # Serialisation options
-    self._temp_folder = os.path.join(params['model_folder'], 'tmp')
-    self.reset_temp_folder()
-
-    # Extra components to store Tensorflow nodes for attention computations
-    self._input_placeholder = None
-    self._attention_components = None
-    self._prediction_parts = None
-
-    print('*** {} params ***'.format(self.name))
-    for k in params:
-      print('# {} = {}'.format(k, params[k]))
-
-    # Build model
-    self.model = self.build_model()
-
-  def get_tft_embeddings(self, all_inputs):
-    """Transforms raw inputs to embeddings.
-
-    Applies linear transformation onto continuous variables and uses embeddings
-    for categorical variables.
-
-    Args:
-      all_inputs: Inputs to transform
-
-    Returns:
-      Tensors for transformed inputs.
-    """
-
-    time_steps = self.time_steps
-
-    # Sanity checks
-    for i in self._known_regular_input_idx:
-      if i in self._input_obs_loc:
-        raise ValueError('Observation cannot be known a priori!')
-    for i in self._input_obs_loc:
-      if i in self._static_input_loc:
-        raise ValueError('Observation cannot be static!')
-
-    if all_inputs.get_shape().as_list()[-1] != self.input_size:
-      raise ValueError(
-          'Illegal number of inputs! Inputs observed={}, expected={}'.format(
-              all_inputs.get_shape().as_list()[-1], self.input_size))
-
-    num_categorical_variables = len(self.category_counts)
-    num_regular_variables = self.input_size - num_categorical_variables
-
-    embedding_sizes = [
-        self.hidden_layer_size for i, size in enumerate(self.category_counts)
-    ]
-
-    embeddings = []
-    for i in range(num_categorical_variables):
-
-      embedding = tf.keras.Sequential([
-          tf.keras.layers.InputLayer([time_steps]),
-          tf.keras.layers.Embedding(
-              self.category_counts[i],
-              embedding_sizes[i],
-              input_length=time_steps,
-              dtype=tf.float32)
-      ])
-      embeddings.append(embedding)
-
-    regular_inputs, categorical_inputs \
-        = all_inputs[:, :, :num_regular_variables], \
-          all_inputs[:, :, num_regular_variables:]
-
-    embedded_inputs = [
-        embeddings[i](categorical_inputs[Ellipsis, i])
-        for i in range(num_categorical_variables)
-    ]
-
-    # Static inputs
-    if self._static_input_loc:
-      static_inputs = [tf.keras.layers.Dense(self.hidden_layer_size)(
-          regular_inputs[:, 0, i:i + 1]) for i in range(num_regular_variables)
-                       if i in self._static_input_loc] \
-          + [embedded_inputs[i][:, 0, :]
-             for i in range(num_categorical_variables)
-             if i + num_regular_variables in self._static_input_loc]
-      static_inputs = tf.keras.backend.stack(static_inputs, axis=1)
-
-    else:
-      static_inputs = None
-
-    def convert_real_to_embedding(x):
-      """Applies linear transformation for time-varying inputs."""
-      return tf.keras.layers.TimeDistributed(
-          tf.keras.layers.Dense(self.hidden_layer_size))(
-              x)
-
-    # Targets
-    obs_inputs = tf.keras.backend.stack([
-        convert_real_to_embedding(regular_inputs[Ellipsis, i:i + 1])
-        for i in self._input_obs_loc
-    ],
-                                        axis=-1)
-
-    # Observed (a prioir unknown) inputs
-    wired_embeddings = []
-    for i in range(num_categorical_variables):
-      if i not in self._known_categorical_input_idx \
-        and  i + num_regular_variables  not in self._input_obs_loc:
-        e = embeddings[i](categorical_inputs[:, :, i])
-        wired_embeddings.append(e)
-
-    unknown_inputs = []
-    for i in range(regular_inputs.shape[-1]):
-      if i not in self._known_regular_input_idx \
-          and i not in self._input_obs_loc:
-        e = convert_real_to_embedding(regular_inputs[Ellipsis, i:i + 1])
-        unknown_inputs.append(e)
-
-    if unknown_inputs + wired_embeddings:
-      unknown_inputs = tf.keras.backend.stack(
-          unknown_inputs + wired_embeddings, axis=-1)
-    else:
-      unknown_inputs = None
-
-    # A priori known inputs
-    known_regular_inputs = [
-        convert_real_to_embedding(regular_inputs[Ellipsis, i:i + 1])
-        for i in self._known_regular_input_idx
-        if i not in self._static_input_loc
-    ]
-    known_categorical_inputs = [
-        embedded_inputs[i]
-        for i in self._known_categorical_input_idx
-        if i + num_regular_variables not in self._static_input_loc
-    ]
-
-    known_combined_layer = tf.keras.backend.stack(
-        known_regular_inputs + known_categorical_inputs, axis=-1)
-
-    return unknown_inputs, known_combined_layer, obs_inputs, static_inputs
-
-  def _get_single_col_by_type(self, input_type):
-    """Returns name of single column for input type."""
-
-    return utils.get_single_col_by_input_type(input_type,
-                                              self.column_definition)
-
-  def training_data_cached(self):
-    """Returns boolean indicating if training data has been cached."""
-
-    return TFTDataCache.contains('train') and TFTDataCache.contains('valid')
-
-  def cache_batched_data(self, data, cache_key, num_samples=-1):
-    """Batches and caches data once for using during training.
-
-    Args:
-      data: Data to batch and cache
-      cache_key: Key used for cache
-      num_samples: Maximum number of samples to extract (-1 to use all data)
-    """
-
-    if num_samples > 0:
-      TFTDataCache.update(
-          self._batch_sampled_data(data, max_samples=num_samples), cache_key)
-    else:
-      TFTDataCache.update(self._batch_data(data), cache_key)
-
-    print('Cached data "{}" updated'.format(cache_key))
-
-  def _batch_sampled_data(self, data, max_samples):
-    """Samples segments into a compatible format.
-
-    Args:
-      data: Sources data to sample and batch
-      max_samples: Maximum number of samples in batch
-
-    Returns:
-      Dictionary of batched data with the maximum samples specified.
-    """
-
-    if max_samples < 1:
-      raise ValueError(
-          'Illegal number of samples specified! samples={}'.format(max_samples))
-
-    id_col = self._get_single_col_by_type(InputTypes.ID)
-    time_col = self._get_single_col_by_type(InputTypes.TIME)
-
-    data.sort_values(by=[id_col, time_col], inplace=True)
-
-    print('Getting valid sampling locations.')
-    valid_sampling_locations = []
-    split_data_map = {}
-    for identifier, df in data.groupby(id_col):
-      print('Getting locations for {}'.format(identifier))
-      num_entries = len(df)
-      if num_entries >= self.time_steps:
-        valid_sampling_locations += [
-            (identifier, self.time_steps + i)
-            for i in range(num_entries - self.time_steps + 1)
-        ]
-      split_data_map[identifier] = df
-
-    inputs = np.zeros((max_samples, self.time_steps, self.input_size))
-    outputs = np.zeros((max_samples, self.time_steps, self.output_size))
-    time = np.empty((max_samples, self.time_steps, 1), dtype=object)
-    identifiers = np.empty((max_samples, self.time_steps, 1), dtype=object)
-
-    if max_samples > 0 and len(valid_sampling_locations) > max_samples:
-      print('Extracting {} samples...'.format(max_samples))
-      ranges = [
-          valid_sampling_locations[i] for i in np.random.choice(
-              len(valid_sampling_locations), max_samples, replace=False)
-      ]
-    else:
-      print('Max samples={} exceeds # available segments={}'.format(
-          max_samples, len(valid_sampling_locations)))
-      ranges = valid_sampling_locations
-
-    id_col = self._get_single_col_by_type(InputTypes.ID)
-    time_col = self._get_single_col_by_type(InputTypes.TIME)
-    target_col = self._get_single_col_by_type(InputTypes.TARGET)
-    input_cols = [
-        tup[0]
-        for tup in self.column_definition
-        if tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-    for i, tup in enumerate(ranges):
-      if (i + 1 % 1000) == 0:
-        print(i + 1, 'of', max_samples, 'samples done...')
-      identifier, start_idx = tup
-      sliced = split_data_map[identifier].iloc[start_idx -
-                                               self.time_steps:start_idx]
-      inputs[i, :, :] = sliced[input_cols]
-      outputs[i, :, :] = sliced[[target_col]]
-      time[i, :, 0] = sliced[time_col]
-      identifiers[i, :, 0] = sliced[id_col]
-
-    sampled_data = {
-        'inputs': inputs,
-        'outputs': outputs[:, self.num_encoder_steps:, :],
-        'active_entries': np.ones_like(outputs[:, self.num_encoder_steps:, :]),
-        'time': time,
-        'identifier': identifiers
-    }
-
-    return sampled_data
-
-  def _batch_data(self, data):
-    """Batches data for training.
-
-    Converts raw dataframe from a 2-D tabular format to a batched 3-D array
-    to feed into Keras model.
-
-    Args:
-      data: DataFrame to batch
-
-    Returns:
-      Batched Numpy array with shape=(?, self.time_steps, self.input_size)
-    """
-
-    # Functions.
-    def _batch_single_entity(input_data):
-      time_steps = len(input_data)
-      lags = self.time_steps
-      x = input_data.values
-      if time_steps >= lags:
-        return np.stack(
-            [x[i:time_steps - (lags - 1) + i, :] for i in range(lags)], axis=1)
-
-      else:
-        return None
-
-    id_col = self._get_single_col_by_type(InputTypes.ID)
-    time_col = self._get_single_col_by_type(InputTypes.TIME)
-    target_col = self._get_single_col_by_type(InputTypes.TARGET)
-    input_cols = [
-        tup[0]
-        for tup in self.column_definition
-        if tup[2] not in {InputTypes.ID, InputTypes.TIME}
-    ]
-
-    data_map = {}
-    for _, sliced in data.groupby(id_col):
-
-      col_mappings = {
-          'identifier': [id_col],
-          'time': [time_col],
-          'outputs': [target_col],
-          'inputs': input_cols
-      }
-
-      for k in col_mappings:
-        cols = col_mappings[k]
-        arr = _batch_single_entity(sliced[cols].copy())
-
-        if k not in data_map:
-          data_map[k] = [arr]
-        else:
-          data_map[k].append(arr)
-
-    # Combine all data
-    for k in data_map:
-      # Wendi: Avoid returning None when the length is not enough
-      data_map[k] = np.concatenate([i for i in data_map[k] if i is not None], axis=0)
-
-    # Shorten target so we only get decoder steps
-    data_map['outputs'] = data_map['outputs'][:, self.num_encoder_steps:, :]
-
-    active_entries = np.ones_like(data_map['outputs'])
-    if 'active_entries' not in data_map:
-      data_map['active_entries'] = active_entries
-    else:
-      data_map['active_entries'].append(active_entries)
-
-    return data_map
-
-  def _get_active_locations(self, x):
-    """Formats sample weights for Keras training."""
-    return (np.sum(x, axis=-1) > 0.0) * 1.0
-
-  def _build_base_graph(self):
-    """Returns graph defining layers of the TFT."""
-
-    # Size definitions.
-    time_steps = self.time_steps
-    combined_input_size = self.input_size
-    encoder_steps = self.num_encoder_steps
-
-    # Inputs.
-    all_inputs = tf.keras.layers.Input(
-        shape=(
-            time_steps,
-            combined_input_size,
-        ))
-
-    unknown_inputs, known_combined_layer, obs_inputs, static_inputs \
-        = self.get_tft_embeddings(all_inputs)
-
-    # Isolate known and observed historical inputs.
-    if unknown_inputs is not None:
-      historical_inputs = concat([
-          unknown_inputs[:, :encoder_steps, :],
-          known_combined_layer[:, :encoder_steps, :],
-          obs_inputs[:, :encoder_steps, :]
-      ],
-                                 axis=-1)
-    else:
-      historical_inputs = concat([
-          known_combined_layer[:, :encoder_steps, :],
-          obs_inputs[:, :encoder_steps, :]
-      ],
-                                 axis=-1)
-
-    # Isolate only known future inputs.
-    future_inputs = known_combined_layer[:, encoder_steps:, :]
-
-    def static_combine_and_mask(embedding):
-      """Applies variable selection network to static inputs.
-
-      Args:
-        embedding: Transformed static inputs
-
-      Returns:
-        Tensor output for variable selection network
-      """
-
-      # Add temporal features
-      _, num_static, _ = embedding.get_shape().as_list()
-
-      flatten = tf.keras.layers.Flatten()(embedding)
-
-      # Nonlinear transformation with gated residual network.
-      mlp_outputs = gated_residual_network(
-          flatten,
-          self.hidden_layer_size,
-          output_size=num_static,
-          dropout_rate=self.dropout_rate,
-          use_time_distributed=False,
-          additional_context=None)
-
-      sparse_weights = tf.keras.layers.Activation('softmax')(mlp_outputs)
-      sparse_weights = K.expand_dims(sparse_weights, axis=-1)
-
-      trans_emb_list = []
-      for i in range(num_static):
-        e = gated_residual_network(
-            embedding[:, i:i + 1, :],
-            self.hidden_layer_size,
-            dropout_rate=self.dropout_rate,
-            use_time_distributed=False)
-        trans_emb_list.append(e)
-
-      transformed_embedding = concat(trans_emb_list, axis=1)
-
-      combined = tf.keras.layers.Multiply()(
-          [sparse_weights, transformed_embedding])
-
-      static_vec = K.sum(combined, axis=1)
-
-      return static_vec, sparse_weights
-
-    static_encoder, static_weights = static_combine_and_mask(static_inputs)
-
-    static_context_variable_selection = gated_residual_network(
-        static_encoder,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=False)
-    static_context_enrichment = gated_residual_network(
-        static_encoder,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=False)
-    static_context_state_h = gated_residual_network(
-        static_encoder,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=False)
-    static_context_state_c = gated_residual_network(
-        static_encoder,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=False)
-
-    def lstm_combine_and_mask(embedding):
-      """Apply temporal variable selection networks.
-
-      Args:
-        embedding: Transformed inputs.
-
-      Returns:
-        Processed tensor outputs.
-      """
-
-      # Add temporal features
-      _, time_steps, embedding_dim, num_inputs = embedding.get_shape().as_list()
-
-      flatten = K.reshape(embedding,
-                          [-1, time_steps, embedding_dim * num_inputs])
-
-      expanded_static_context = K.expand_dims(
-          static_context_variable_selection, axis=1)
-
-      # Variable selection weights
-      mlp_outputs, static_gate = gated_residual_network(
-          flatten,
-          self.hidden_layer_size,
-          output_size=num_inputs,
-          dropout_rate=self.dropout_rate,
-          use_time_distributed=True,
-          additional_context=expanded_static_context,
-          return_gate=True)
-
-      sparse_weights = tf.keras.layers.Activation('softmax')(mlp_outputs)
-      sparse_weights = tf.expand_dims(sparse_weights, axis=2)
-
-      # Non-linear Processing & weight application
-      trans_emb_list = []
-      for i in range(num_inputs):
-        grn_output = gated_residual_network(
-            embedding[Ellipsis, i],
-            self.hidden_layer_size,
-            dropout_rate=self.dropout_rate,
-            use_time_distributed=True)
-        trans_emb_list.append(grn_output)
-
-      transformed_embedding = stack(trans_emb_list, axis=-1)
-
-      combined = tf.keras.layers.Multiply()(
-          [sparse_weights, transformed_embedding])
-      temporal_ctx = K.sum(combined, axis=-1)
-
-      return temporal_ctx, sparse_weights, static_gate
-
-    historical_features, historical_flags, _ = lstm_combine_and_mask(
-        historical_inputs)
-    future_features, future_flags, _ = lstm_combine_and_mask(future_inputs)
-
-    # LSTM layer
-    def get_lstm(return_state):
-      """Returns LSTM cell initialized with default parameters."""
-      if self.use_cudnn:
-        lstm = tf.keras.layers.CuDNNLSTM(
-            self.hidden_layer_size,
-            return_sequences=True,
-            return_state=return_state,
-            stateful=False,
-        )
-      else:
-        lstm = tf.keras.layers.LSTM(
-            self.hidden_layer_size,
-            return_sequences=True,
-            return_state=return_state,
-            stateful=False,
-            # Additional params to ensure LSTM matches CuDNN, See TF 2.0 :
-            # (https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM)
-            activation='tanh',
-            recurrent_activation='sigmoid',
-            recurrent_dropout=0,
-            unroll=False,
-            use_bias=True)
-      return lstm
-
-    history_lstm, state_h, state_c \
-        = get_lstm(return_state=True)(historical_features,
-                                      initial_state=[static_context_state_h,
-                                                     static_context_state_c])
-
-    future_lstm = get_lstm(return_state=False)(
-        future_features, initial_state=[state_h, state_c])
-
-    lstm_layer = concat([history_lstm, future_lstm], axis=1)
-
-    # Apply gated skip connection
-    input_embeddings = concat([historical_features, future_features], axis=1)
-
-    lstm_layer, _ = apply_gating_layer(
-        lstm_layer, self.hidden_layer_size, self.dropout_rate, activation=None)
-    temporal_feature_layer = add_and_norm([lstm_layer, input_embeddings])
-
-    # Static enrichment layers
-    expanded_static_context = K.expand_dims(static_context_enrichment, axis=1)
-    enriched, _ = gated_residual_network(
-        temporal_feature_layer,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=True,
-        additional_context=expanded_static_context,
-        return_gate=True)
-
-    # Decoder self attention
-    self_attn_layer = InterpretableMultiHeadAttention(
-        self.num_heads, self.hidden_layer_size, dropout=self.dropout_rate)
-
-    mask = get_decoder_mask(enriched)
-    x, self_att \
-        = self_attn_layer(enriched, enriched, enriched,
-                          mask=mask)
-
-    x, _ = apply_gating_layer(
-        x,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        activation=None)
-    x = add_and_norm([x, enriched])
-
-    # Nonlinear processing on outputs
-    decoder = gated_residual_network(
-        x,
-        self.hidden_layer_size,
-        dropout_rate=self.dropout_rate,
-        use_time_distributed=True)
-
-    # Final skip connection
-    decoder, _ = apply_gating_layer(
-        decoder, self.hidden_layer_size, activation=None)
-    transformer_layer = add_and_norm([decoder, temporal_feature_layer])
-
-    # Attention components for explainability
-    attention_components = {
-        # Temporal attention weights
-        'decoder_self_attn': self_att,
-        # Static variable selection weights
-        'static_flags': static_weights[Ellipsis, 0],
-        # Variable selection weights of past inputs
-        'historical_flags': historical_flags[Ellipsis, 0, :],
-        # Variable selection weights of future inputs
-        'future_flags': future_flags[Ellipsis, 0, :]
-    }
-
-    return transformer_layer, all_inputs, attention_components
-
-  def build_model(self):
-    """Build model and defines training losses.
-
-    Returns:
-      Fully defined Keras model.
-    """
-
-    with tf.variable_scope(self.name):
-
-      transformer_layer, all_inputs, attention_components \
-          = self._build_base_graph()
-
-      outputs = tf.keras.layers.TimeDistributed(
-          tf.keras.layers.Dense(self.output_size * len(self.quantiles))) \
-          (transformer_layer[Ellipsis, self.num_encoder_steps:, :])
-
-      self._attention_components = attention_components
-
-      adam = tf.keras.optimizers.Adam(
-          lr=self.learning_rate, clipnorm=self.max_gradient_norm)
-
-      model = tf.keras.Model(inputs=all_inputs, outputs=outputs)
-
-      print(model.summary())
-
-      valid_quantiles = self.quantiles
-      output_size = self.output_size
-
-      class QuantileLossCalculator(object):
-        """Computes the combined quantile loss for prespecified quantiles.
-
-        Attributes:
-          quantiles: Quantiles to compute losses
-        """
-
-        def __init__(self, quantiles):
-          """Initializes computer with quantiles for loss calculations.
-
-          Args:
-            quantiles: Quantiles to use for computations.
-          """
-          self.quantiles = quantiles
-
-        def quantile_loss(self, a, b):
-          """Returns quantile loss for specified quantiles.
-
-          Args:
-            a: Targets
-            b: Predictions
-          """
-          quantiles_used = set(self.quantiles)
-
-          loss = 0.
-          for i, quantile in enumerate(valid_quantiles):
-            if quantile in quantiles_used:
-              loss += utils.tensorflow_quantile_loss(
-                  a[Ellipsis, output_size * i:output_size * (i + 1)],
-                  b[Ellipsis, output_size * i:output_size * (i + 1)], quantile)
-          return loss
-
-      quantile_loss = QuantileLossCalculator(valid_quantiles).quantile_loss
-
-      model.compile(
-          loss=quantile_loss, optimizer=adam, sample_weight_mode='temporal')
-
-      self._input_placeholder = all_inputs
-
-    return model
-
-  def fit(self, train_df=None, valid_df=None):
-    """Fits deep neural network for given training and validation data.
-
-    Args:
-      train_df: DataFrame for training data
-      valid_df: DataFrame for validation data
-    """
-
-    print('*** Fitting {} ***'.format(self.name))
-
-    # Add relevant callbacks
-    callbacks = [
-        tf.keras.callbacks.EarlyStopping(
-            monitor='val_loss',
-            patience=self.early_stopping_patience,
-            min_delta=1e-4),
-        tf.keras.callbacks.ModelCheckpoint(
-            filepath=self.get_keras_saved_path(self._temp_folder),
-            monitor='val_loss',
-            save_best_only=True,
-            save_weights_only=True),
-        tf.keras.callbacks.TerminateOnNaN()
-    ]
-
-    print('Getting batched_data')
-    if train_df is None:
-      print('Using cached training data')
-      train_data = TFTDataCache.get('train')
-    else:
-      train_data = self._batch_data(train_df)
-
-    if valid_df is None:
-      print('Using cached validation data')
-      valid_data = TFTDataCache.get('valid')
-    else:
-      valid_data = self._batch_data(valid_df)
-
-    print('Using keras standard fit')
-
-    def _unpack(data):
-      return data['inputs'], data['outputs'], \
-          self._get_active_locations(data['active_entries'])
-
-    # Unpack without sample weights
-    data, labels, active_flags = _unpack(train_data)
-    val_data, val_labels, val_flags = _unpack(valid_data)
-
-    all_callbacks = callbacks
-
-    self.model.fit(
-        x=data,
-        y=np.concatenate([labels, labels, labels], axis=-1),
-        sample_weight=active_flags,
-        epochs=self.num_epochs,
-        batch_size=self.minibatch_size,
-        validation_data=(val_data,
-                         np.concatenate([val_labels, val_labels, val_labels],
-                                        axis=-1), val_flags),
-        callbacks=all_callbacks,
-        shuffle=True,
-        use_multiprocessing=True,
-        workers=self.n_multiprocessing_workers)
-
-    # Load best checkpoint again
-    tmp_checkpont = self.get_keras_saved_path(self._temp_folder)
-    if os.path.exists(tmp_checkpont):
-      self.load(
-          self._temp_folder,
-          use_keras_loadings=True)
-
-    else:
-      print('Cannot load from {}, skipping ...'.format(self._temp_folder))
-
-  def evaluate(self, data=None, eval_metric='loss'):
-    """Applies evaluation metric to the training data.
-
-    Args:
-      data: Dataframe for evaluation
-      eval_metric: Evaluation metic to return, based on model definition.
-
-    Returns:
-      Computed evaluation loss.
-    """
-
-    if data is None:
-      print('Using cached validation data')
-      raw_data = TFTDataCache.get('valid')
-    else:
-      raw_data = self._batch_data(data)
-
-    inputs = raw_data['inputs']
-    outputs = raw_data['outputs']
-    active_entries = self._get_active_locations(raw_data['active_entries'])
-
-    metric_values = self.model.evaluate(
-        x=inputs,
-        y=np.concatenate([outputs, outputs, outputs], axis=-1),
-        sample_weight=active_entries,
-        workers=16,
-        use_multiprocessing=True)
-
-    metrics = pd.Series(metric_values, self.model.metrics_names)
-
-    return metrics[eval_metric]
-
-  def predict(self, df, return_targets=False):
-    """Computes predictions for a given input dataset.
-
-    Args:
-      df: Input dataframe
-      return_targets: Whether to also return outputs aligned with predictions to
-        faciliate evaluation
-
-    Returns:
-      Input dataframe or tuple of (input dataframe, algined output dataframe).
-    """
-
-    data = self._batch_data(df)
-
-    inputs = data['inputs']
-    time = data['time']
-    identifier = data['identifier']
-    outputs = data['outputs']
-
-    combined = self.model.predict(
-        inputs,
-        workers=16,
-        use_multiprocessing=True,
-        batch_size=self.minibatch_size)
-
-    # Format output_csv
-    if self.output_size != 1:
-      raise NotImplementedError('Current version only supports 1D targets!')
-
-    def format_outputs(prediction):
-      """Returns formatted dataframes for prediction."""
-
-      flat_prediction = pd.DataFrame(
-          prediction[:, :, 0],
-          columns=[
-              't+{}'.format(i)
-              for i in range(self.time_steps - self.num_encoder_steps)
-          ])
-      cols = list(flat_prediction.columns)
-      flat_prediction['forecast_time'] = time[:, self.num_encoder_steps - 1, 0]
-      flat_prediction['identifier'] = identifier[:, 0, 0]
-
-      # Arrange in order
-      return flat_prediction[['forecast_time', 'identifier'] + cols]
-
-    # Extract predictions for each quantile into different entries
-    process_map = {
-        'p{}'.format(int(q * 100)):
-        combined[Ellipsis, i * self.output_size:(i + 1) * self.output_size]
-        for i, q in enumerate(self.quantiles)
-    }
-
-    if return_targets:
-      # Add targets if relevant
-      process_map['targets'] = outputs
-
-    return {k: format_outputs(process_map[k]) for k in process_map}
-
-  def get_attention(self, df):
-    """Computes TFT attention weights for a given dataset.
-
-    Args:
-      df: Input dataframe
-
-    Returns:
-        Dictionary of numpy arrays for temporal attention weights and variable
-          selection weights, along with their identifiers and time indices
-    """
-
-    data = self._batch_data(df)
-    inputs = data['inputs']
-    identifiers = data['identifier']
-    time = data['time']
-
-    def get_batch_attention_weights(input_batch):
-      """Returns weights for a given minibatch of data."""
-      input_placeholder = self._input_placeholder
-      attention_weights = {}
-      for k in self._attention_components:
-        attention_weight = tf.keras.backend.get_session().run(
-            self._attention_components[k],
-            {input_placeholder: input_batch.astype(np.float32)})
-        attention_weights[k] = attention_weight
-      return attention_weights
-
-    # Compute number of batches
-    batch_size = self.minibatch_size
-    n = inputs.shape[0]
-    num_batches = n // batch_size
-    if n - (num_batches * batch_size) > 0:
-      num_batches += 1
-
-    # Split up inputs into batches
-    batched_inputs = [
-        inputs[i * batch_size:(i + 1) * batch_size, Ellipsis]
-        for i in range(num_batches)
-    ]
-
-    # Get attention weights, while avoiding large memory increases
-    attention_by_batch = [
-        get_batch_attention_weights(batch) for batch in batched_inputs
-    ]
-    attention_weights = {}
-    for k in self._attention_components:
-      attention_weights[k] = []
-      for batch_weights in attention_by_batch:
-        attention_weights[k].append(batch_weights[k])
-
-      if len(attention_weights[k][0].shape) == 4:
-        tmp = np.concatenate(attention_weights[k], axis=1)
-      else:
-        tmp = np.concatenate(attention_weights[k], axis=0)
-
-      del attention_weights[k]
-      gc.collect()
-      attention_weights[k] = tmp
-
-    attention_weights['identifiers'] = identifiers[:, 0, 0]
-    attention_weights['time'] = time[:, :, 0]
-
-    return attention_weights
-
-  # Serialisation.
-  def reset_temp_folder(self):
-    """Deletes and recreates folder with temporary Keras training outputs."""
-    print('Resetting temp folder...')
-    utils.create_folder_if_not_exist(self._temp_folder)
-    shutil.rmtree(self._temp_folder)
-    os.makedirs(self._temp_folder)
-
-  def get_keras_saved_path(self, model_folder):
-    """Returns path to keras checkpoint."""
-    return os.path.join(model_folder, '{}.check'.format(self.name))
-
-  def save(self, model_folder):
-    """Saves optimal TFT weights.
-
-    Args:
-      model_folder: Location to serialze model.
-    """
-    # Allows for direct serialisation of tensorflow variables to avoid spurious
-    # issue with Keras that leads to different performance evaluation results
-    # when model is reloaded (https://github.com/keras-team/keras/issues/4875).
-
-    utils.save(
-        tf.keras.backend.get_session(),
-        model_folder,
-        cp_name=self.name,
-        scope=self.name)
-
-  def load(self, model_folder, use_keras_loadings=False):
-    """Loads TFT weights.
-
-    Args:
-      model_folder: Folder containing serialized models.
-      use_keras_loadings: Whether to load from Keras checkpoint.
-
-    Returns:
-
-    """
-    if use_keras_loadings:
-      # Loads temporary Keras model saved during training.
-      serialisation_path = self.get_keras_saved_path(model_folder)
-      print('Loading model from {}'.format(serialisation_path))
-      self.model.load_weights(serialisation_path)
-    else:
-      # Loads tensorflow graph for optimal models.
-      utils.load(
-          tf.keras.backend.get_session(),
-          model_folder,
-          cp_name=self.name,
-          scope=self.name)
-
-  @classmethod
-  def get_hyperparm_choices(cls):
-    """Returns hyperparameter ranges for random search."""
-    return {
-        'dropout_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9],
-        'hidden_layer_size': [10, 20, 40, 80, 160, 240, 320],
-        'minibatch_size': [64, 128, 256],
-        'learning_rate': [1e-4, 1e-3, 1e-2],
-        'max_gradient_norm': [0.01, 1.0, 100.0],
-        'num_heads': [1, 4],
-        'stack_size': [1],
-    }
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Temporal Fusion Transformer Model.
+
+Contains the full TFT architecture and associated components. Defines functions
+for training, evaluation and prediction using simple Pandas Dataframe inputs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gc
+import json
+import os
+import shutil
+
+import data_formatters.base
+import libs.utils as utils
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+# Layer definitions.
+concat = tf.keras.backend.concatenate
+stack = tf.keras.backend.stack
+K = tf.keras.backend
+Add = tf.keras.layers.Add
+LayerNorm = tf.keras.layers.LayerNormalization
+Dense = tf.keras.layers.Dense
+Multiply = tf.keras.layers.Multiply
+Dropout = tf.keras.layers.Dropout
+Activation = tf.keras.layers.Activation
+Lambda = tf.keras.layers.Lambda
+
+# Default input types.
+InputTypes = data_formatters.base.InputTypes
+
+
+# Layer utility functions.
+def linear_layer(size, activation=None, use_time_distributed=False, use_bias=True):
+    """Returns simple Keras linear layer.
+
+    Args:
+      size: Output size
+      activation: Activation function to apply if required
+      use_time_distributed: Whether to apply layer across time
+      use_bias: Whether bias should be included in layer
+    """
+    linear = tf.keras.layers.Dense(size, activation=activation, use_bias=use_bias)
+    if use_time_distributed:
+        linear = tf.keras.layers.TimeDistributed(linear)
+    return linear
+
+
+def apply_mlp(
+    inputs, hidden_size, output_size, output_activation=None, hidden_activation="tanh", use_time_distributed=False
+):
+    """Applies simple feed-forward network to an input.
+
+    Args:
+      inputs: MLP inputs
+      hidden_size: Hidden state size
+      output_size: Output size of MLP
+      output_activation: Activation function to apply on output
+      hidden_activation: Activation function to apply on input
+      use_time_distributed: Whether to apply across time
+
+    Returns:
+      Tensor for MLP outputs.
+    """
+    if use_time_distributed:
+        hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(hidden_size, activation=hidden_activation))(
+            inputs
+        )
+        return tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(output_size, activation=output_activation))(hidden)
+    else:
+        hidden = tf.keras.layers.Dense(hidden_size, activation=hidden_activation)(inputs)
+        return tf.keras.layers.Dense(output_size, activation=output_activation)(hidden)
+
+
+def apply_gating_layer(x, hidden_layer_size, dropout_rate=None, use_time_distributed=True, activation=None):
+    """Applies a Gated Linear Unit (GLU) to an input.
+
+    Args:
+      x: Input to gating layer
+      hidden_layer_size: Dimension of GLU
+      dropout_rate: Dropout rate to apply if any
+      use_time_distributed: Whether to apply across time
+      activation: Activation function to apply to the linear feature transform if
+        necessary
+
+    Returns:
+      Tuple of tensors for: (GLU output, gate)
+    """
+
+    if dropout_rate is not None:
+        x = tf.keras.layers.Dropout(dropout_rate)(x)
+
+    if use_time_distributed:
+        activation_layer = tf.keras.layers.TimeDistributed(
+            tf.keras.layers.Dense(hidden_layer_size, activation=activation)
+        )(x)
+        gated_layer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(hidden_layer_size, activation="sigmoid"))(x)
+    else:
+        activation_layer = tf.keras.layers.Dense(hidden_layer_size, activation=activation)(x)
+        gated_layer = tf.keras.layers.Dense(hidden_layer_size, activation="sigmoid")(x)
+
+    return tf.keras.layers.Multiply()([activation_layer, gated_layer]), gated_layer
+
+
+def add_and_norm(x_list):
+    """Applies skip connection followed by layer normalisation.
+
+    Args:
+      x_list: List of inputs to sum for skip connection
+
+    Returns:
+      Tensor output from layer.
+    """
+    tmp = Add()(x_list)
+    tmp = LayerNorm()(tmp)
+    return tmp
+
+
+def gated_residual_network(
+    x,
+    hidden_layer_size,
+    output_size=None,
+    dropout_rate=None,
+    use_time_distributed=True,
+    additional_context=None,
+    return_gate=False,
+):
+    """Applies the gated residual network (GRN) as defined in paper.
+
+    Args:
+      x: Network inputs
+      hidden_layer_size: Internal state size
+      output_size: Size of output layer
+      dropout_rate: Dropout rate if dropout is applied
+      use_time_distributed: Whether to apply network across time dimension
+      additional_context: Additional context vector to use if relevant
+      return_gate: Whether to return GLU gate for diagnostic purposes
+
+    Returns:
+      Tuple of tensors for: (GRN output, GLU gate)
+    """
+
+    # Setup skip connection
+    if output_size is None:
+        output_size = hidden_layer_size
+        skip = x
+    else:
+        linear = Dense(output_size)
+        if use_time_distributed:
+            linear = tf.keras.layers.TimeDistributed(linear)
+        skip = linear(x)
+
+    # Apply feedforward network
+    hidden = linear_layer(hidden_layer_size, activation=None, use_time_distributed=use_time_distributed)(x)
+    if additional_context is not None:
+        hidden = hidden + linear_layer(
+            hidden_layer_size, activation=None, use_time_distributed=use_time_distributed, use_bias=False
+        )(additional_context)
+    hidden = tf.keras.layers.Activation("elu")(hidden)
+    hidden = linear_layer(hidden_layer_size, activation=None, use_time_distributed=use_time_distributed)(hidden)
+
+    gating_layer, gate = apply_gating_layer(
+        hidden, output_size, dropout_rate=dropout_rate, use_time_distributed=use_time_distributed, activation=None
+    )
+
+    if return_gate:
+        return add_and_norm([skip, gating_layer]), gate
+    else:
+        return add_and_norm([skip, gating_layer])
+
+
+# Attention Components.
+def get_decoder_mask(self_attn_inputs):
+    """Returns causal mask to apply for self-attention layer.
+
+    Args:
+      self_attn_inputs: Inputs to self attention layer to determine mask shape
+    """
+    len_s = tf.shape(self_attn_inputs)[1]
+    bs = tf.shape(self_attn_inputs)[:1]
+    mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
+    return mask
+
+
+class ScaledDotProductAttention:
+    """Defines scaled dot product attention layer.
+
+    Attributes:
+      dropout: Dropout rate to use
+      activation: Normalisation function for scaled dot product attention (e.g.
+        softmax by default)
+    """
+
+    def __init__(self, attn_dropout=0.0):
+        self.dropout = Dropout(attn_dropout)
+        self.activation = Activation("softmax")
+
+    def __call__(self, q, k, v, mask):
+        """Applies scaled dot product attention.
+
+        Args:
+          q: Queries
+          k: Keys
+          v: Values
+          mask: Masking if required -- sets softmax to very large value
+
+        Returns:
+          Tuple of (layer outputs, attention weights)
+        """
+        temper = tf.sqrt(tf.cast(tf.shape(k)[-1], dtype="float32"))
+        attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / temper)([q, k])  # shape=(batch, q, k)
+        if mask is not None:
+            mmask = Lambda(lambda x: (-1e9) * (1.0 - K.cast(x, "float32")))(mask)  # setting to infinity
+            attn = Add()([attn, mmask])
+        attn = self.activation(attn)
+        attn = self.dropout(attn)
+        output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
+        return output, attn
+
+
+class InterpretableMultiHeadAttention:
+    """Defines interpretable multi-head attention layer.
+
+    Attributes:
+      n_head: Number of heads
+      d_k: Key/query dimensionality per head
+      d_v: Value dimensionality
+      dropout: Dropout rate to apply
+      qs_layers: List of queries across heads
+      ks_layers: List of keys across heads
+      vs_layers: List of values across heads
+      attention: Scaled dot product attention layer
+      w_o: Output weight matrix to project internal state to the original TFT
+        state size
+    """
+
+    def __init__(self, n_head, d_model, dropout):
+        """Initialises layer.
+
+        Args:
+          n_head: Number of heads
+          d_model: TFT state dimensionality
+          dropout: Dropout discard rate
+        """
+
+        self.n_head = n_head
+        self.d_k = self.d_v = d_k = d_v = d_model // n_head
+        self.dropout = dropout
+
+        self.qs_layers = []
+        self.ks_layers = []
+        self.vs_layers = []
+
+        # Use same value layer to facilitate interp
+        vs_layer = Dense(d_v, use_bias=False)
+
+        for _ in range(n_head):
+            self.qs_layers.append(Dense(d_k, use_bias=False))
+            self.ks_layers.append(Dense(d_k, use_bias=False))
+            self.vs_layers.append(vs_layer)  # use same vs_layer
+
+        self.attention = ScaledDotProductAttention()
+        self.w_o = Dense(d_model, use_bias=False)
+
+    def __call__(self, q, k, v, mask=None):
+        """Applies interpretable multihead attention.
+
+        Using T to denote the number of time steps fed into the transformer.
+
+        Args:
+          q: Query tensor of shape=(?, T, d_model)
+          k: Key of shape=(?, T, d_model)
+          v: Values of shape=(?, T, d_model)
+          mask: Masking if required with shape=(?, T, T)
+
+        Returns:
+          Tuple of (layer outputs, attention weights)
+        """
+        n_head = self.n_head
+
+        heads = []
+        attns = []
+        for i in range(n_head):
+            qs = self.qs_layers[i](q)
+            ks = self.ks_layers[i](k)
+            vs = self.vs_layers[i](v)
+            head, attn = self.attention(qs, ks, vs, mask)
+
+            head_dropout = Dropout(self.dropout)(head)
+            heads.append(head_dropout)
+            attns.append(attn)
+        head = K.stack(heads) if n_head > 1 else heads[0]
+        attn = K.stack(attns)
+
+        outputs = K.mean(head, axis=0) if n_head > 1 else head
+        outputs = self.w_o(outputs)
+        outputs = Dropout(self.dropout)(outputs)  # output dropout
+
+        return outputs, attn
+
+
+class TFTDataCache(object):
+    """Caches data for the TFT."""
+
+    _data_cache = {}
+
+    @classmethod
+    def update(cls, data, key):
+        """Updates cached data.
+
+        Args:
+          data: Source to update
+          key: Key to dictionary location
+        """
+        cls._data_cache[key] = data
+
+    @classmethod
+    def get(cls, key):
+        """Returns data stored at key location."""
+        return cls._data_cache[key].copy()
+
+    @classmethod
+    def contains(cls, key):
+        """Retuns boolean indicating whether key is present in cache."""
+
+        return key in cls._data_cache
+
+
+# TFT model definitions.
+class TemporalFusionTransformer(object):
+    """Defines Temporal Fusion Transformer.
+
+    Attributes:
+      name: Name of model
+      time_steps: Total number of input time steps per forecast date (i.e. Width
+        of Temporal fusion decoder N)
+      input_size: Total number of inputs
+      output_size: Total number of outputs
+      category_counts: Number of categories per categorical variable
+      n_multiprocessing_workers: Number of workers to use for parallel
+        computations
+      column_definition: List of tuples of (string, DataType, InputType) that
+        define each column
+      quantiles: Quantiles to forecast for TFT
+      use_cudnn: Whether to use Keras CuDNNLSTM or standard LSTM layers
+      hidden_layer_size: Internal state size of TFT
+      dropout_rate: Dropout discard rate
+      max_gradient_norm: Maximum norm for gradient clipping
+      learning_rate: Initial learning rate of ADAM optimizer
+      minibatch_size: Size of minibatches for training
+      num_epochs: Maximum number of epochs for training
+      early_stopping_patience: Maximum number of iterations of non-improvement
+        before early stopping kicks in
+      num_encoder_steps: Size of LSTM encoder -- i.e. number of past time steps
+        before forecast date to use
+      num_stacks: Number of self-attention layers to apply (default is 1 for basic
+        TFT)
+      num_heads: Number of heads for interpretable mulit-head attention
+      model: Keras model for TFT
+    """
+
+    def __init__(self, raw_params, use_cudnn=False):
+        """Builds TFT from parameters.
+
+        Args:
+          raw_params: Parameters to define TFT
+          use_cudnn: Whether to use CUDNN GPU optimised LSTM
+        """
+
+        self.name = self.__class__.__name__
+
+        params = dict(raw_params)  # copy locally
+
+        # Data parameters
+        self.time_steps = int(params["total_time_steps"])
+        self.input_size = int(params["input_size"])
+        self.output_size = int(params["output_size"])
+        self.category_counts = json.loads(str(params["category_counts"]))
+        self.n_multiprocessing_workers = int(params["multiprocessing_workers"])
+
+        # Relevant indices for TFT
+        self._input_obs_loc = json.loads(str(params["input_obs_loc"]))
+        self._static_input_loc = json.loads(str(params["static_input_loc"]))
+        self._known_regular_input_idx = json.loads(str(params["known_regular_inputs"]))
+        self._known_categorical_input_idx = json.loads(str(params["known_categorical_inputs"]))
+
+        self.column_definition = params["column_definition"]
+
+        # Network params
+        self.quantiles = [0.1, 0.5, 0.9]
+        self.use_cudnn = use_cudnn  # Whether to use GPU optimised LSTM
+        self.hidden_layer_size = int(params["hidden_layer_size"])
+        self.dropout_rate = float(params["dropout_rate"])
+        self.max_gradient_norm = float(params["max_gradient_norm"])
+        self.learning_rate = float(params["learning_rate"])
+        self.minibatch_size = int(params["minibatch_size"])
+        self.num_epochs = int(params["num_epochs"])
+        self.early_stopping_patience = int(params["early_stopping_patience"])
+
+        self.num_encoder_steps = int(params["num_encoder_steps"])
+        self.num_stacks = int(params["stack_size"])
+        self.num_heads = int(params["num_heads"])
+
+        # Serialisation options
+        self._temp_folder = os.path.join(params["model_folder"], "tmp")
+        self.reset_temp_folder()
+
+        # Extra components to store Tensorflow nodes for attention computations
+        self._input_placeholder = None
+        self._attention_components = None
+        self._prediction_parts = None
+
+        print("*** {} params ***".format(self.name))
+        for k in params:
+            print("# {} = {}".format(k, params[k]))
+
+        # Build model
+        self.model = self.build_model()
+
+    def get_tft_embeddings(self, all_inputs):
+        """Transforms raw inputs to embeddings.
+
+        Applies linear transformation onto continuous variables and uses embeddings
+        for categorical variables.
+
+        Args:
+          all_inputs: Inputs to transform
+
+        Returns:
+          Tensors for transformed inputs.
+        """
+
+        time_steps = self.time_steps
+
+        # Sanity checks
+        for i in self._known_regular_input_idx:
+            if i in self._input_obs_loc:
+                raise ValueError("Observation cannot be known a priori!")
+        for i in self._input_obs_loc:
+            if i in self._static_input_loc:
+                raise ValueError("Observation cannot be static!")
+
+        if all_inputs.get_shape().as_list()[-1] != self.input_size:
+            raise ValueError(
+                "Illegal number of inputs! Inputs observed={}, expected={}".format(
+                    all_inputs.get_shape().as_list()[-1], self.input_size
+                )
+            )
+
+        num_categorical_variables = len(self.category_counts)
+        num_regular_variables = self.input_size - num_categorical_variables
+
+        embedding_sizes = [self.hidden_layer_size for i, size in enumerate(self.category_counts)]
+
+        embeddings = []
+        for i in range(num_categorical_variables):
+
+            embedding = tf.keras.Sequential(
+                [
+                    tf.keras.layers.InputLayer([time_steps]),
+                    tf.keras.layers.Embedding(
+                        self.category_counts[i], embedding_sizes[i], input_length=time_steps, dtype=tf.float32
+                    ),
+                ]
+            )
+            embeddings.append(embedding)
+
+        regular_inputs, categorical_inputs = (
+            all_inputs[:, :, :num_regular_variables],
+            all_inputs[:, :, num_regular_variables:],
+        )
+
+        embedded_inputs = [embeddings[i](categorical_inputs[Ellipsis, i]) for i in range(num_categorical_variables)]
+
+        # Static inputs
+        if self._static_input_loc:
+            static_inputs = [
+                tf.keras.layers.Dense(self.hidden_layer_size)(regular_inputs[:, 0, i : i + 1])
+                for i in range(num_regular_variables)
+                if i in self._static_input_loc
+            ] + [
+                embedded_inputs[i][:, 0, :]
+                for i in range(num_categorical_variables)
+                if i + num_regular_variables in self._static_input_loc
+            ]
+            static_inputs = tf.keras.backend.stack(static_inputs, axis=1)
+
+        else:
+            static_inputs = None
+
+        def convert_real_to_embedding(x):
+            """Applies linear transformation for time-varying inputs."""
+            return tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.hidden_layer_size))(x)
+
+        # Targets
+        obs_inputs = tf.keras.backend.stack(
+            [convert_real_to_embedding(regular_inputs[Ellipsis, i : i + 1]) for i in self._input_obs_loc], axis=-1
+        )
+
+        # Observed (a prioir unknown) inputs
+        wired_embeddings = []
+        for i in range(num_categorical_variables):
+            if i not in self._known_categorical_input_idx and i + num_regular_variables not in self._input_obs_loc:
+                e = embeddings[i](categorical_inputs[:, :, i])
+                wired_embeddings.append(e)
+
+        unknown_inputs = []
+        for i in range(regular_inputs.shape[-1]):
+            if i not in self._known_regular_input_idx and i not in self._input_obs_loc:
+                e = convert_real_to_embedding(regular_inputs[Ellipsis, i : i + 1])
+                unknown_inputs.append(e)
+
+        if unknown_inputs + wired_embeddings:
+            unknown_inputs = tf.keras.backend.stack(unknown_inputs + wired_embeddings, axis=-1)
+        else:
+            unknown_inputs = None
+
+        # A priori known inputs
+        known_regular_inputs = [
+            convert_real_to_embedding(regular_inputs[Ellipsis, i : i + 1])
+            for i in self._known_regular_input_idx
+            if i not in self._static_input_loc
+        ]
+        known_categorical_inputs = [
+            embedded_inputs[i]
+            for i in self._known_categorical_input_idx
+            if i + num_regular_variables not in self._static_input_loc
+        ]
+
+        known_combined_layer = tf.keras.backend.stack(known_regular_inputs + known_categorical_inputs, axis=-1)
+
+        return unknown_inputs, known_combined_layer, obs_inputs, static_inputs
+
+    def _get_single_col_by_type(self, input_type):
+        """Returns name of single column for input type."""
+
+        return utils.get_single_col_by_input_type(input_type, self.column_definition)
+
+    def training_data_cached(self):
+        """Returns boolean indicating if training data has been cached."""
+
+        return TFTDataCache.contains("train") and TFTDataCache.contains("valid")
+
+    def cache_batched_data(self, data, cache_key, num_samples=-1):
+        """Batches and caches data once for using during training.
+
+        Args:
+          data: Data to batch and cache
+          cache_key: Key used for cache
+          num_samples: Maximum number of samples to extract (-1 to use all data)
+        """
+
+        if num_samples > 0:
+            TFTDataCache.update(self._batch_sampled_data(data, max_samples=num_samples), cache_key)
+        else:
+            TFTDataCache.update(self._batch_data(data), cache_key)
+
+        print('Cached data "{}" updated'.format(cache_key))
+
+    def _batch_sampled_data(self, data, max_samples):
+        """Samples segments into a compatible format.
+
+        Args:
+          data: Sources data to sample and batch
+          max_samples: Maximum number of samples in batch
+
+        Returns:
+          Dictionary of batched data with the maximum samples specified.
+        """
+
+        if max_samples < 1:
+            raise ValueError("Illegal number of samples specified! samples={}".format(max_samples))
+
+        id_col = self._get_single_col_by_type(InputTypes.ID)
+        time_col = self._get_single_col_by_type(InputTypes.TIME)
+
+        data.sort_values(by=[id_col, time_col], inplace=True)
+
+        print("Getting valid sampling locations.")
+        valid_sampling_locations = []
+        split_data_map = {}
+        for identifier, df in data.groupby(id_col):
+            print("Getting locations for {}".format(identifier))
+            num_entries = len(df)
+            if num_entries >= self.time_steps:
+                valid_sampling_locations += [
+                    (identifier, self.time_steps + i) for i in range(num_entries - self.time_steps + 1)
+                ]
+            split_data_map[identifier] = df
+
+        inputs = np.zeros((max_samples, self.time_steps, self.input_size))
+        outputs = np.zeros((max_samples, self.time_steps, self.output_size))
+        time = np.empty((max_samples, self.time_steps, 1), dtype=object)
+        identifiers = np.empty((max_samples, self.time_steps, 1), dtype=object)
+
+        if max_samples > 0 and len(valid_sampling_locations) > max_samples:
+            print("Extracting {} samples...".format(max_samples))
+            ranges = [
+                valid_sampling_locations[i]
+                for i in np.random.choice(len(valid_sampling_locations), max_samples, replace=False)
+            ]
+        else:
+            print("Max samples={} exceeds # available segments={}".format(max_samples, len(valid_sampling_locations)))
+            ranges = valid_sampling_locations
+
+        id_col = self._get_single_col_by_type(InputTypes.ID)
+        time_col = self._get_single_col_by_type(InputTypes.TIME)
+        target_col = self._get_single_col_by_type(InputTypes.TARGET)
+        input_cols = [tup[0] for tup in self.column_definition if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+
+        for i, tup in enumerate(ranges):
+            if (i + 1 % 1000) == 0:
+                print(i + 1, "of", max_samples, "samples done...")
+            identifier, start_idx = tup
+            sliced = split_data_map[identifier].iloc[start_idx - self.time_steps : start_idx]
+            inputs[i, :, :] = sliced[input_cols]
+            outputs[i, :, :] = sliced[[target_col]]
+            time[i, :, 0] = sliced[time_col]
+            identifiers[i, :, 0] = sliced[id_col]
+
+        sampled_data = {
+            "inputs": inputs,
+            "outputs": outputs[:, self.num_encoder_steps :, :],
+            "active_entries": np.ones_like(outputs[:, self.num_encoder_steps :, :]),
+            "time": time,
+            "identifier": identifiers,
+        }
+
+        return sampled_data
+
+    def _batch_data(self, data):
+        """Batches data for training.
+
+        Converts raw dataframe from a 2-D tabular format to a batched 3-D array
+        to feed into Keras model.
+
+        Args:
+          data: DataFrame to batch
+
+        Returns:
+          Batched Numpy array with shape=(?, self.time_steps, self.input_size)
+        """
+
+        # Functions.
+        def _batch_single_entity(input_data):
+            time_steps = len(input_data)
+            lags = self.time_steps
+            x = input_data.values
+            if time_steps >= lags:
+                return np.stack([x[i : time_steps - (lags - 1) + i, :] for i in range(lags)], axis=1)
+
+            else:
+                return None
+
+        id_col = self._get_single_col_by_type(InputTypes.ID)
+        time_col = self._get_single_col_by_type(InputTypes.TIME)
+        target_col = self._get_single_col_by_type(InputTypes.TARGET)
+        input_cols = [tup[0] for tup in self.column_definition if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
+
+        data_map = {}
+        for _, sliced in data.groupby(id_col):
+
+            col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols}
+
+            for k in col_mappings:
+                cols = col_mappings[k]
+                arr = _batch_single_entity(sliced[cols].copy())
+
+                if k not in data_map:
+                    data_map[k] = [arr]
+                else:
+                    data_map[k].append(arr)
+
+        # Combine all data
+        for k in data_map:
+            # Wendi: Avoid returning None when the length is not enough
+            data_map[k] = np.concatenate([i for i in data_map[k] if i is not None], axis=0)
+
+        # Shorten target so we only get decoder steps
+        data_map["outputs"] = data_map["outputs"][:, self.num_encoder_steps :, :]
+
+        active_entries = np.ones_like(data_map["outputs"])
+        if "active_entries" not in data_map:
+            data_map["active_entries"] = active_entries
+        else:
+            data_map["active_entries"].append(active_entries)
+
+        return data_map
+
+    def _get_active_locations(self, x):
+        """Formats sample weights for Keras training."""
+        return (np.sum(x, axis=-1) > 0.0) * 1.0
+
+    def _build_base_graph(self):
+        """Returns graph defining layers of the TFT."""
+
+        # Size definitions.
+        time_steps = self.time_steps
+        combined_input_size = self.input_size
+        encoder_steps = self.num_encoder_steps
+
+        # Inputs.
+        all_inputs = tf.keras.layers.Input(
+            shape=(
+                time_steps,
+                combined_input_size,
+            )
+        )
+
+        unknown_inputs, known_combined_layer, obs_inputs, static_inputs = self.get_tft_embeddings(all_inputs)
+
+        # Isolate known and observed historical inputs.
+        if unknown_inputs is not None:
+            historical_inputs = concat(
+                [
+                    unknown_inputs[:, :encoder_steps, :],
+                    known_combined_layer[:, :encoder_steps, :],
+                    obs_inputs[:, :encoder_steps, :],
+                ],
+                axis=-1,
+            )
+        else:
+            historical_inputs = concat(
+                [known_combined_layer[:, :encoder_steps, :], obs_inputs[:, :encoder_steps, :]], axis=-1
+            )
+
+        # Isolate only known future inputs.
+        future_inputs = known_combined_layer[:, encoder_steps:, :]
+
+        def static_combine_and_mask(embedding):
+            """Applies variable selection network to static inputs.
+
+            Args:
+              embedding: Transformed static inputs
+
+            Returns:
+              Tensor output for variable selection network
+            """
+
+            # Add temporal features
+            _, num_static, _ = embedding.get_shape().as_list()
+
+            flatten = tf.keras.layers.Flatten()(embedding)
+
+            # Nonlinear transformation with gated residual network.
+            mlp_outputs = gated_residual_network(
+                flatten,
+                self.hidden_layer_size,
+                output_size=num_static,
+                dropout_rate=self.dropout_rate,
+                use_time_distributed=False,
+                additional_context=None,
+            )
+
+            sparse_weights = tf.keras.layers.Activation("softmax")(mlp_outputs)
+            sparse_weights = K.expand_dims(sparse_weights, axis=-1)
+
+            trans_emb_list = []
+            for i in range(num_static):
+                e = gated_residual_network(
+                    embedding[:, i : i + 1, :],
+                    self.hidden_layer_size,
+                    dropout_rate=self.dropout_rate,
+                    use_time_distributed=False,
+                )
+                trans_emb_list.append(e)
+
+            transformed_embedding = concat(trans_emb_list, axis=1)
+
+            combined = tf.keras.layers.Multiply()([sparse_weights, transformed_embedding])
+
+            static_vec = K.sum(combined, axis=1)
+
+            return static_vec, sparse_weights
+
+        static_encoder, static_weights = static_combine_and_mask(static_inputs)
+
+        static_context_variable_selection = gated_residual_network(
+            static_encoder, self.hidden_layer_size, dropout_rate=self.dropout_rate, use_time_distributed=False
+        )
+        static_context_enrichment = gated_residual_network(
+            static_encoder, self.hidden_layer_size, dropout_rate=self.dropout_rate, use_time_distributed=False
+        )
+        static_context_state_h = gated_residual_network(
+            static_encoder, self.hidden_layer_size, dropout_rate=self.dropout_rate, use_time_distributed=False
+        )
+        static_context_state_c = gated_residual_network(
+            static_encoder, self.hidden_layer_size, dropout_rate=self.dropout_rate, use_time_distributed=False
+        )
+
+        def lstm_combine_and_mask(embedding):
+            """Apply temporal variable selection networks.
+
+            Args:
+              embedding: Transformed inputs.
+
+            Returns:
+              Processed tensor outputs.
+            """
+
+            # Add temporal features
+            _, time_steps, embedding_dim, num_inputs = embedding.get_shape().as_list()
+
+            flatten = K.reshape(embedding, [-1, time_steps, embedding_dim * num_inputs])
+
+            expanded_static_context = K.expand_dims(static_context_variable_selection, axis=1)
+
+            # Variable selection weights
+            mlp_outputs, static_gate = gated_residual_network(
+                flatten,
+                self.hidden_layer_size,
+                output_size=num_inputs,
+                dropout_rate=self.dropout_rate,
+                use_time_distributed=True,
+                additional_context=expanded_static_context,
+                return_gate=True,
+            )
+
+            sparse_weights = tf.keras.layers.Activation("softmax")(mlp_outputs)
+            sparse_weights = tf.expand_dims(sparse_weights, axis=2)
+
+            # Non-linear Processing & weight application
+            trans_emb_list = []
+            for i in range(num_inputs):
+                grn_output = gated_residual_network(
+                    embedding[Ellipsis, i],
+                    self.hidden_layer_size,
+                    dropout_rate=self.dropout_rate,
+                    use_time_distributed=True,
+                )
+                trans_emb_list.append(grn_output)
+
+            transformed_embedding = stack(trans_emb_list, axis=-1)
+
+            combined = tf.keras.layers.Multiply()([sparse_weights, transformed_embedding])
+            temporal_ctx = K.sum(combined, axis=-1)
+
+            return temporal_ctx, sparse_weights, static_gate
+
+        historical_features, historical_flags, _ = lstm_combine_and_mask(historical_inputs)
+        future_features, future_flags, _ = lstm_combine_and_mask(future_inputs)
+
+        # LSTM layer
+        def get_lstm(return_state):
+            """Returns LSTM cell initialized with default parameters."""
+            if self.use_cudnn:
+                lstm = tf.keras.layers.CuDNNLSTM(
+                    self.hidden_layer_size,
+                    return_sequences=True,
+                    return_state=return_state,
+                    stateful=False,
+                )
+            else:
+                lstm = tf.keras.layers.LSTM(
+                    self.hidden_layer_size,
+                    return_sequences=True,
+                    return_state=return_state,
+                    stateful=False,
+                    # Additional params to ensure LSTM matches CuDNN, See TF 2.0 :
+                    # (https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM)
+                    activation="tanh",
+                    recurrent_activation="sigmoid",
+                    recurrent_dropout=0,
+                    unroll=False,
+                    use_bias=True,
+                )
+            return lstm
+
+        history_lstm, state_h, state_c = get_lstm(return_state=True)(
+            historical_features, initial_state=[static_context_state_h, static_context_state_c]
+        )
+
+        future_lstm = get_lstm(return_state=False)(future_features, initial_state=[state_h, state_c])
+
+        lstm_layer = concat([history_lstm, future_lstm], axis=1)
+
+        # Apply gated skip connection
+        input_embeddings = concat([historical_features, future_features], axis=1)
+
+        lstm_layer, _ = apply_gating_layer(lstm_layer, self.hidden_layer_size, self.dropout_rate, activation=None)
+        temporal_feature_layer = add_and_norm([lstm_layer, input_embeddings])
+
+        # Static enrichment layers
+        expanded_static_context = K.expand_dims(static_context_enrichment, axis=1)
+        enriched, _ = gated_residual_network(
+            temporal_feature_layer,
+            self.hidden_layer_size,
+            dropout_rate=self.dropout_rate,
+            use_time_distributed=True,
+            additional_context=expanded_static_context,
+            return_gate=True,
+        )
+
+        # Decoder self attention
+        self_attn_layer = InterpretableMultiHeadAttention(
+            self.num_heads, self.hidden_layer_size, dropout=self.dropout_rate
+        )
+
+        mask = get_decoder_mask(enriched)
+        x, self_att = self_attn_layer(enriched, enriched, enriched, mask=mask)
+
+        x, _ = apply_gating_layer(x, self.hidden_layer_size, dropout_rate=self.dropout_rate, activation=None)
+        x = add_and_norm([x, enriched])
+
+        # Nonlinear processing on outputs
+        decoder = gated_residual_network(
+            x, self.hidden_layer_size, dropout_rate=self.dropout_rate, use_time_distributed=True
+        )
+
+        # Final skip connection
+        decoder, _ = apply_gating_layer(decoder, self.hidden_layer_size, activation=None)
+        transformer_layer = add_and_norm([decoder, temporal_feature_layer])
+
+        # Attention components for explainability
+        attention_components = {
+            # Temporal attention weights
+            "decoder_self_attn": self_att,
+            # Static variable selection weights
+            "static_flags": static_weights[Ellipsis, 0],
+            # Variable selection weights of past inputs
+            "historical_flags": historical_flags[Ellipsis, 0, :],
+            # Variable selection weights of future inputs
+            "future_flags": future_flags[Ellipsis, 0, :],
+        }
+
+        return transformer_layer, all_inputs, attention_components
+
+    def build_model(self):
+        """Build model and defines training losses.
+
+        Returns:
+          Fully defined Keras model.
+        """
+
+        with tf.variable_scope(self.name):
+
+            transformer_layer, all_inputs, attention_components = self._build_base_graph()
+
+            outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.output_size * len(self.quantiles)))(
+                transformer_layer[Ellipsis, self.num_encoder_steps :, :]
+            )
+
+            self._attention_components = attention_components
+
+            adam = tf.keras.optimizers.Adam(lr=self.learning_rate, clipnorm=self.max_gradient_norm)
+
+            model = tf.keras.Model(inputs=all_inputs, outputs=outputs)
+
+            print(model.summary())
+
+            valid_quantiles = self.quantiles
+            output_size = self.output_size
+
+            class QuantileLossCalculator(object):
+                """Computes the combined quantile loss for prespecified quantiles.
+
+                Attributes:
+                  quantiles: Quantiles to compute losses
+                """
+
+                def __init__(self, quantiles):
+                    """Initializes computer with quantiles for loss calculations.
+
+                    Args:
+                      quantiles: Quantiles to use for computations.
+                    """
+                    self.quantiles = quantiles
+
+                def quantile_loss(self, a, b):
+                    """Returns quantile loss for specified quantiles.
+
+                    Args:
+                      a: Targets
+                      b: Predictions
+                    """
+                    quantiles_used = set(self.quantiles)
+
+                    loss = 0.0
+                    for i, quantile in enumerate(valid_quantiles):
+                        if quantile in quantiles_used:
+                            loss += utils.tensorflow_quantile_loss(
+                                a[Ellipsis, output_size * i : output_size * (i + 1)],
+                                b[Ellipsis, output_size * i : output_size * (i + 1)],
+                                quantile,
+                            )
+                    return loss
+
+            quantile_loss = QuantileLossCalculator(valid_quantiles).quantile_loss
+
+            model.compile(loss=quantile_loss, optimizer=adam, sample_weight_mode="temporal")
+
+            self._input_placeholder = all_inputs
+
+        return model
+
+    def fit(self, train_df=None, valid_df=None):
+        """Fits deep neural network for given training and validation data.
+
+        Args:
+          train_df: DataFrame for training data
+          valid_df: DataFrame for validation data
+        """
+
+        print("*** Fitting {} ***".format(self.name))
+
+        # Add relevant callbacks
+        callbacks = [
+            tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=self.early_stopping_patience, min_delta=1e-4),
+            tf.keras.callbacks.ModelCheckpoint(
+                filepath=self.get_keras_saved_path(self._temp_folder),
+                monitor="val_loss",
+                save_best_only=True,
+                save_weights_only=True,
+            ),
+            tf.keras.callbacks.TerminateOnNaN(),
+        ]
+
+        print("Getting batched_data")
+        if train_df is None:
+            print("Using cached training data")
+            train_data = TFTDataCache.get("train")
+        else:
+            train_data = self._batch_data(train_df)
+
+        if valid_df is None:
+            print("Using cached validation data")
+            valid_data = TFTDataCache.get("valid")
+        else:
+            valid_data = self._batch_data(valid_df)
+
+        print("Using keras standard fit")
+
+        def _unpack(data):
+            return data["inputs"], data["outputs"], self._get_active_locations(data["active_entries"])
+
+        # Unpack without sample weights
+        data, labels, active_flags = _unpack(train_data)
+        val_data, val_labels, val_flags = _unpack(valid_data)
+
+        all_callbacks = callbacks
+
+        self.model.fit(
+            x=data,
+            y=np.concatenate([labels, labels, labels], axis=-1),
+            sample_weight=active_flags,
+            epochs=self.num_epochs,
+            batch_size=self.minibatch_size,
+            validation_data=(val_data, np.concatenate([val_labels, val_labels, val_labels], axis=-1), val_flags),
+            callbacks=all_callbacks,
+            shuffle=True,
+            use_multiprocessing=True,
+            workers=self.n_multiprocessing_workers,
+        )
+
+        # Load best checkpoint again
+        tmp_checkpont = self.get_keras_saved_path(self._temp_folder)
+        if os.path.exists(tmp_checkpont):
+            self.load(self._temp_folder, use_keras_loadings=True)
+
+        else:
+            print("Cannot load from {}, skipping ...".format(self._temp_folder))
+
+    def evaluate(self, data=None, eval_metric="loss"):
+        """Applies evaluation metric to the training data.
+
+        Args:
+          data: Dataframe for evaluation
+          eval_metric: Evaluation metic to return, based on model definition.
+
+        Returns:
+          Computed evaluation loss.
+        """
+
+        if data is None:
+            print("Using cached validation data")
+            raw_data = TFTDataCache.get("valid")
+        else:
+            raw_data = self._batch_data(data)
+
+        inputs = raw_data["inputs"]
+        outputs = raw_data["outputs"]
+        active_entries = self._get_active_locations(raw_data["active_entries"])
+
+        metric_values = self.model.evaluate(
+            x=inputs,
+            y=np.concatenate([outputs, outputs, outputs], axis=-1),
+            sample_weight=active_entries,
+            workers=16,
+            use_multiprocessing=True,
+        )
+
+        metrics = pd.Series(metric_values, self.model.metrics_names)
+
+        return metrics[eval_metric]
+
+    def predict(self, df, return_targets=False):
+        """Computes predictions for a given input dataset.
+
+        Args:
+          df: Input dataframe
+          return_targets: Whether to also return outputs aligned with predictions to
+            faciliate evaluation
+
+        Returns:
+          Input dataframe or tuple of (input dataframe, algined output dataframe).
+        """
+
+        data = self._batch_data(df)
+
+        inputs = data["inputs"]
+        time = data["time"]
+        identifier = data["identifier"]
+        outputs = data["outputs"]
+
+        combined = self.model.predict(inputs, workers=16, use_multiprocessing=True, batch_size=self.minibatch_size)
+
+        # Format output_csv
+        if self.output_size != 1:
+            raise NotImplementedError("Current version only supports 1D targets!")
+
+        def format_outputs(prediction):
+            """Returns formatted dataframes for prediction."""
+
+            flat_prediction = pd.DataFrame(
+                prediction[:, :, 0], columns=["t+{}".format(i) for i in range(self.time_steps - self.num_encoder_steps)]
+            )
+            cols = list(flat_prediction.columns)
+            flat_prediction["forecast_time"] = time[:, self.num_encoder_steps - 1, 0]
+            flat_prediction["identifier"] = identifier[:, 0, 0]
+
+            # Arrange in order
+            return flat_prediction[["forecast_time", "identifier"] + cols]
+
+        # Extract predictions for each quantile into different entries
+        process_map = {
+            "p{}".format(int(q * 100)): combined[Ellipsis, i * self.output_size : (i + 1) * self.output_size]
+            for i, q in enumerate(self.quantiles)
+        }
+
+        if return_targets:
+            # Add targets if relevant
+            process_map["targets"] = outputs
+
+        return {k: format_outputs(process_map[k]) for k in process_map}
+
+    def get_attention(self, df):
+        """Computes TFT attention weights for a given dataset.
+
+        Args:
+          df: Input dataframe
+
+        Returns:
+            Dictionary of numpy arrays for temporal attention weights and variable
+              selection weights, along with their identifiers and time indices
+        """
+
+        data = self._batch_data(df)
+        inputs = data["inputs"]
+        identifiers = data["identifier"]
+        time = data["time"]
+
+        def get_batch_attention_weights(input_batch):
+            """Returns weights for a given minibatch of data."""
+            input_placeholder = self._input_placeholder
+            attention_weights = {}
+            for k in self._attention_components:
+                attention_weight = tf.keras.backend.get_session().run(
+                    self._attention_components[k], {input_placeholder: input_batch.astype(np.float32)}
+                )
+                attention_weights[k] = attention_weight
+            return attention_weights
+
+        # Compute number of batches
+        batch_size = self.minibatch_size
+        n = inputs.shape[0]
+        num_batches = n // batch_size
+        if n - (num_batches * batch_size) > 0:
+            num_batches += 1
+
+        # Split up inputs into batches
+        batched_inputs = [inputs[i * batch_size : (i + 1) * batch_size, Ellipsis] for i in range(num_batches)]
+
+        # Get attention weights, while avoiding large memory increases
+        attention_by_batch = [get_batch_attention_weights(batch) for batch in batched_inputs]
+        attention_weights = {}
+        for k in self._attention_components:
+            attention_weights[k] = []
+            for batch_weights in attention_by_batch:
+                attention_weights[k].append(batch_weights[k])
+
+            if len(attention_weights[k][0].shape) == 4:
+                tmp = np.concatenate(attention_weights[k], axis=1)
+            else:
+                tmp = np.concatenate(attention_weights[k], axis=0)
+
+            del attention_weights[k]
+            gc.collect()
+            attention_weights[k] = tmp
+
+        attention_weights["identifiers"] = identifiers[:, 0, 0]
+        attention_weights["time"] = time[:, :, 0]
+
+        return attention_weights
+
+    # Serialisation.
+    def reset_temp_folder(self):
+        """Deletes and recreates folder with temporary Keras training outputs."""
+        print("Resetting temp folder...")
+        utils.create_folder_if_not_exist(self._temp_folder)
+        shutil.rmtree(self._temp_folder)
+        os.makedirs(self._temp_folder)
+
+    def get_keras_saved_path(self, model_folder):
+        """Returns path to keras checkpoint."""
+        return os.path.join(model_folder, "{}.check".format(self.name))
+
+    def save(self, model_folder):
+        """Saves optimal TFT weights.
+
+        Args:
+          model_folder: Location to serialze model.
+        """
+        # Allows for direct serialisation of tensorflow variables to avoid spurious
+        # issue with Keras that leads to different performance evaluation results
+        # when model is reloaded (https://github.com/keras-team/keras/issues/4875).
+
+        utils.save(tf.keras.backend.get_session(), model_folder, cp_name=self.name, scope=self.name)
+
+    def load(self, model_folder, use_keras_loadings=False):
+        """Loads TFT weights.
+
+        Args:
+          model_folder: Folder containing serialized models.
+          use_keras_loadings: Whether to load from Keras checkpoint.
+
+        Returns:
+
+        """
+        if use_keras_loadings:
+            # Loads temporary Keras model saved during training.
+            serialisation_path = self.get_keras_saved_path(model_folder)
+            print("Loading model from {}".format(serialisation_path))
+            self.model.load_weights(serialisation_path)
+        else:
+            # Loads tensorflow graph for optimal models.
+            utils.load(tf.keras.backend.get_session(), model_folder, cp_name=self.name, scope=self.name)
+
+    @classmethod
+    def get_hyperparm_choices(cls):
+        """Returns hyperparameter ranges for random search."""
+        return {
+            "dropout_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9],
+            "hidden_layer_size": [10, 20, 40, 80, 160, 240, 320],
+            "minibatch_size": [64, 128, 256],
+            "learning_rate": [1e-4, 1e-3, 1e-2],
+            "max_gradient_norm": [0.01, 1.0, 100.0],
+            "num_heads": [1, 4],
+            "stack_size": [1],
+        }
diff --git a/examples/benchmarks/TFT/libs/utils.py b/examples/benchmarks/TFT/libs/utils.py
index 813d4b176..4682434d6 100644
--- a/examples/benchmarks/TFT/libs/utils.py
+++ b/examples/benchmarks/TFT/libs/utils.py
@@ -1,236 +1,224 @@
-# coding=utf-8
-# Copyright 2020 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""Generic helper functions used across codebase."""
-
-import os
-import pathlib
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
-
-
-# Generic.
-def get_single_col_by_input_type(input_type, column_definition):
-  """Returns name of single column.
-
-  Args:
-    input_type: Input type of column to extract
-    column_definition: Column definition list for experiment
-  """
-
-  l = [tup[0] for tup in column_definition if tup[2] == input_type]
-
-  if len(l) != 1:
-    raise ValueError('Invalid number of columns for {}'.format(input_type))
-
-  return l[0]
-
-
-def extract_cols_from_data_type(data_type, column_definition,
-                                excluded_input_types):
-  """Extracts the names of columns that correspond to a define data_type.
-
-  Args:
-    data_type: DataType of columns to extract.
-    column_definition: Column definition to use.
-    excluded_input_types: Set of input types to exclude
-
-  Returns:
-    List of names for columns with data type specified.
-  """
-  return [
-      tup[0]
-      for tup in column_definition
-      if tup[1] == data_type and tup[2] not in excluded_input_types
-  ]
-
-
-# Loss functions.
-def tensorflow_quantile_loss(y, y_pred, quantile):
-  """Computes quantile loss for tensorflow.
-
-  Standard quantile loss as defined in the "Training Procedure" section of
-  the main TFT paper
-
-  Args:
-    y: Targets
-    y_pred: Predictions
-    quantile: Quantile to use for loss calculations (between 0 & 1)
-
-  Returns:
-    Tensor for quantile loss.
-  """
-
-  # Checks quantile
-  if quantile < 0 or quantile > 1:
-    raise ValueError(
-        'Illegal quantile value={}! Values should be between 0 and 1.'.format(
-            quantile))
-
-  prediction_underflow = y - y_pred
-  q_loss = quantile * tf.maximum(prediction_underflow, 0.) + (
-      1. - quantile) * tf.maximum(-prediction_underflow, 0.)
-
-  return tf.reduce_sum(q_loss, axis=-1)
-
-
-def numpy_normalised_quantile_loss(y, y_pred, quantile):
-  """Computes normalised quantile loss for numpy arrays.
-
-  Uses the q-Risk metric as defined in the "Training Procedure" section of the
-  main TFT paper.
-
-  Args:
-    y: Targets
-    y_pred: Predictions
-    quantile: Quantile to use for loss calculations (between 0 & 1)
-
-  Returns:
-    Float for normalised quantile loss.
-  """
-  prediction_underflow = y - y_pred
-  weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \
-      + (1. - quantile) * np.maximum(-prediction_underflow, 0.)
-
-  quantile_loss = weighted_errors.mean()
-  normaliser = y.abs().mean()
-
-  return 2 * quantile_loss / normaliser
-
-
-# OS related functions.
-def create_folder_if_not_exist(directory):
-  """Creates folder if it doesn't exist.
-
-  Args:
-    directory: Folder path to create.
-  """
-  # Also creates directories recursively
-  pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
-
-
-# Tensorflow related functions.
-def get_default_tensorflow_config(tf_device='gpu', gpu_id=0):
-  """Creates tensorflow config for graphs to run on CPU or GPU.
-
-  Specifies whether to run graph on gpu or cpu and which GPU ID to use for multi
-  GPU machines.
-
-  Args:
-    tf_device: 'cpu' or 'gpu'
-    gpu_id: GPU ID to use if relevant
-
-  Returns:
-    Tensorflow config.
-  """
-
-  if tf_device == 'cpu':
-    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # for training on cpu
-    tf_config = tf.ConfigProto(
-        log_device_placement=False, device_count={'GPU': 0})
-
-  else:
-    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
-    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-
-    print('Selecting GPU ID={}'.format(gpu_id))
-
-    tf_config = tf.ConfigProto(log_device_placement=False)
-    tf_config.gpu_options.allow_growth = True
-
-  return tf_config
-
-
-def save(tf_session, model_folder, cp_name, scope=None):
-  """Saves Tensorflow graph to checkpoint.
-
-  Saves all trainiable variables under a given variable scope to checkpoint.
-
-  Args:
-    tf_session: Session containing graph
-    model_folder: Folder to save models
-    cp_name: Name of Tensorflow checkpoint
-    scope: Variable scope containing variables to save
-  """
-  # Save model
-  if scope is None:
-    saver = tf.train.Saver()
-  else:
-    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
-    saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
-
-  save_path = saver.save(tf_session,
-                         os.path.join(model_folder, '{0}.ckpt'.format(cp_name)))
-  print('Model saved to: {0}'.format(save_path))
-
-
-def load(tf_session, model_folder, cp_name, scope=None, verbose=False):
-  """Loads Tensorflow graph from checkpoint.
-
-  Args:
-    tf_session: Session to load graph into
-    model_folder: Folder containing serialised model
-    cp_name: Name of Tensorflow checkpoint
-    scope: Variable scope to use.
-    verbose: Whether to print additional debugging information.
-  """
-  # Load model proper
-  load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
-
-  print('Loading model from {0}'.format(load_path))
-
-  print_weights_in_checkpoint(model_folder, cp_name)
-
-  initial_vars = set(
-      [v.name for v in tf.get_default_graph().as_graph_def().node])
-
-  # Saver
-  if scope is None:
-    saver = tf.train.Saver()
-  else:
-    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
-    saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
-  # Load
-  saver.restore(tf_session, load_path)
-  all_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
-
-  if verbose:
-    print('Restored {0}'.format(','.join(initial_vars.difference(all_vars))))
-    print('Existing {0}'.format(','.join(all_vars.difference(initial_vars))))
-    print('All {0}'.format(','.join(all_vars)))
-
-  print('Done.')
-
-
-def print_weights_in_checkpoint(model_folder, cp_name):
-  """Prints all weights in Tensorflow checkpoint.
-
-  Args:
-    model_folder: Folder containing checkpoint
-    cp_name: Name of checkpoint
-
-  Returns:
-
-  """
-  load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
-
-  print_tensors_in_checkpoint_file(
-      file_name=load_path,
-      tensor_name='',
-      all_tensors=True,
-      all_tensor_names=True)
+# coding=utf-8
+# Copyright 2020 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""Generic helper functions used across codebase."""
+
+import os
+import pathlib
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
+
+
+# Generic.
+def get_single_col_by_input_type(input_type, column_definition):
+    """Returns name of single column.
+
+    Args:
+      input_type: Input type of column to extract
+      column_definition: Column definition list for experiment
+    """
+
+    l = [tup[0] for tup in column_definition if tup[2] == input_type]
+
+    if len(l) != 1:
+        raise ValueError("Invalid number of columns for {}".format(input_type))
+
+    return l[0]
+
+
+def extract_cols_from_data_type(data_type, column_definition, excluded_input_types):
+    """Extracts the names of columns that correspond to a define data_type.
+
+    Args:
+      data_type: DataType of columns to extract.
+      column_definition: Column definition to use.
+      excluded_input_types: Set of input types to exclude
+
+    Returns:
+      List of names for columns with data type specified.
+    """
+    return [tup[0] for tup in column_definition if tup[1] == data_type and tup[2] not in excluded_input_types]
+
+
+# Loss functions.
+def tensorflow_quantile_loss(y, y_pred, quantile):
+    """Computes quantile loss for tensorflow.
+
+    Standard quantile loss as defined in the "Training Procedure" section of
+    the main TFT paper
+
+    Args:
+      y: Targets
+      y_pred: Predictions
+      quantile: Quantile to use for loss calculations (between 0 & 1)
+
+    Returns:
+      Tensor for quantile loss.
+    """
+
+    # Checks quantile
+    if quantile < 0 or quantile > 1:
+        raise ValueError("Illegal quantile value={}! Values should be between 0 and 1.".format(quantile))
+
+    prediction_underflow = y - y_pred
+    q_loss = quantile * tf.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * tf.maximum(
+        -prediction_underflow, 0.0
+    )
+
+    return tf.reduce_sum(q_loss, axis=-1)
+
+
+def numpy_normalised_quantile_loss(y, y_pred, quantile):
+    """Computes normalised quantile loss for numpy arrays.
+
+    Uses the q-Risk metric as defined in the "Training Procedure" section of the
+    main TFT paper.
+
+    Args:
+      y: Targets
+      y_pred: Predictions
+      quantile: Quantile to use for loss calculations (between 0 & 1)
+
+    Returns:
+      Float for normalised quantile loss.
+    """
+    prediction_underflow = y - y_pred
+    weighted_errors = quantile * np.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * np.maximum(
+        -prediction_underflow, 0.0
+    )
+
+    quantile_loss = weighted_errors.mean()
+    normaliser = y.abs().mean()
+
+    return 2 * quantile_loss / normaliser
+
+
+# OS related functions.
+def create_folder_if_not_exist(directory):
+    """Creates folder if it doesn't exist.
+
+    Args:
+      directory: Folder path to create.
+    """
+    # Also creates directories recursively
+    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
+
+
+# Tensorflow related functions.
+def get_default_tensorflow_config(tf_device="gpu", gpu_id=0):
+    """Creates tensorflow config for graphs to run on CPU or GPU.
+
+    Specifies whether to run graph on gpu or cpu and which GPU ID to use for multi
+    GPU machines.
+
+    Args:
+      tf_device: 'cpu' or 'gpu'
+      gpu_id: GPU ID to use if relevant
+
+    Returns:
+      Tensorflow config.
+    """
+
+    if tf_device == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # for training on cpu
+        tf_config = tf.ConfigProto(log_device_placement=False, device_count={"GPU": 0})
+
+    else:
+        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+
+        print("Selecting GPU ID={}".format(gpu_id))
+
+        tf_config = tf.ConfigProto(log_device_placement=False)
+        tf_config.gpu_options.allow_growth = True
+
+    return tf_config
+
+
+def save(tf_session, model_folder, cp_name, scope=None):
+    """Saves Tensorflow graph to checkpoint.
+
+    Saves all trainiable variables under a given variable scope to checkpoint.
+
+    Args:
+      tf_session: Session containing graph
+      model_folder: Folder to save models
+      cp_name: Name of Tensorflow checkpoint
+      scope: Variable scope containing variables to save
+    """
+    # Save model
+    if scope is None:
+        saver = tf.train.Saver()
+    else:
+        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
+        saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
+
+    save_path = saver.save(tf_session, os.path.join(model_folder, "{0}.ckpt".format(cp_name)))
+    print("Model saved to: {0}".format(save_path))
+
+
+def load(tf_session, model_folder, cp_name, scope=None, verbose=False):
+    """Loads Tensorflow graph from checkpoint.
+
+    Args:
+      tf_session: Session to load graph into
+      model_folder: Folder containing serialised model
+      cp_name: Name of Tensorflow checkpoint
+      scope: Variable scope to use.
+      verbose: Whether to print additional debugging information.
+    """
+    # Load model proper
+    load_path = os.path.join(model_folder, "{0}.ckpt".format(cp_name))
+
+    print("Loading model from {0}".format(load_path))
+
+    print_weights_in_checkpoint(model_folder, cp_name)
+
+    initial_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
+
+    # Saver
+    if scope is None:
+        saver = tf.train.Saver()
+    else:
+        var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
+        saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
+    # Load
+    saver.restore(tf_session, load_path)
+    all_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
+
+    if verbose:
+        print("Restored {0}".format(",".join(initial_vars.difference(all_vars))))
+        print("Existing {0}".format(",".join(all_vars.difference(initial_vars))))
+        print("All {0}".format(",".join(all_vars)))
+
+    print("Done.")
+
+
+def print_weights_in_checkpoint(model_folder, cp_name):
+    """Prints all weights in Tensorflow checkpoint.
+
+    Args:
+      model_folder: Folder containing checkpoint
+      cp_name: Name of checkpoint
+
+    Returns:
+
+    """
+    load_path = os.path.join(model_folder, "{0}.ckpt".format(cp_name))
+
+    print_tensors_in_checkpoint_file(file_name=load_path, tensor_name="", all_tensors=True, all_tensor_names=True)
diff --git a/examples/benchmarks/TFT/tft.py b/examples/benchmarks/TFT/tft.py
index ee49a1eb7..631204a3d 100644
--- a/examples/benchmarks/TFT/tft.py
+++ b/examples/benchmarks/TFT/tft.py
@@ -1,246 +1,248 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import numpy as np
-import pandas as pd
-import tensorflow.compat.v1 as tf
-import data_formatters.base
-import expt_settings.configs
-import libs.hyperparam_opt
-import libs.tft_model
-import libs.utils as utils
-import os
-import datetime as dte
-
-
-from qlib.model.base import ModelFT
-from qlib.data.dataset import DatasetH
-from qlib.data.dataset.handler import DataHandlerLP
-
-
-
-# To register new datasets, please add them here.
-ALLOW_DATASET = ['Alpha158'] 
-DATASET_SETTING = {
-    'Alpha158': {
-        'feature_col': ['RESI5', 'WVMA5', 'RSQR5', 'KLEN', 'RSQR10', 'CORR5', 'CORD5', 'CORR10', 'ROC60', 'RESI10'],
-        'label_col': ['LABEL0'],
-    },
-}
-# To register new datasets, please add their configurations here.
-
-def get_shifted_label(data_df, shifts=5, col_shift='LABEL0'):
-    return data_df[[col_shift]].groupby('instrument').apply(lambda df: df.shift(shifts))
-
-def fill_test_na(test_df):
-    test_df_res = test_df.copy()
-    feature_cols = ~test_df_res.columns.str.contains('label', case=False)
-    test_feature_fna = test_df_res.loc[:, feature_cols].groupby('datetime').apply(lambda df: df.fillna(df.mean()))
-    test_df_res.loc[:, feature_cols] = test_feature_fna
-    return test_df_res
-
-def process_qlib_data(df, dataset, fillna=False):
-    """Prepare data to fit the TFT model.
-
-        Args:
-          df: Original DataFrame.
-          fillna: Whether to fill the data with the mean values.
-
-        Returns:
-          Transformed DataFrame.
-
-        """
-    # Several features selected manually
-    feature_col = DATASET_SETTING[dataset]['feature_col']
-    label_col = DATASET_SETTING[dataset]['label_col']
-    temp_df = df.loc[:, feature_col+label_col]
-    if fillna:
-        temp_df = fill_test_na(temp_df)
-    temp_df = temp_df.swaplevel()
-    temp_df = temp_df.sort_index()
-    temp_df = temp_df.reset_index(level=0)
-    dates = pd.to_datetime(temp_df.index)
-    temp_df['date'] = dates
-    temp_df['day_of_week'] = dates.dayofweek
-    temp_df['month'] = dates.month
-    temp_df['year'] = dates.year
-    temp_df['const'] = 1.0
-    return temp_df
-
-def process_predicted(df, col_name):
-    """Transform the TFT predicted data into Qlib format.
-
-    Args:
-      df: Original DataFrame.
-      fillna: New column name.
-
-    Returns:
-      Transformed DataFrame.
-
-    """
-    df_res = df.copy()
-    df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+0": col_name})
-    df_res = df_res.set_index(['datetime','instrument']).sort_index()
-    df_res = df_res[[col_name]]
-    return df_res
-
-def format_score(forecast_df, col_name='pred', label_shift=5):
-    pred = process_predicted(forecast_df, col_name=col_name)
-    pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
-    pred = pred.dropna()[col_name]
-    return pred
-
-def transform_df(df, col_name='LABEL0'):
-    df_res = df['feature']
-    df_res[col_name] = df['label']
-    return df_res
-
-class TFTModel(ModelFT):
-    """TFT Model"""
-
-    def __init__(self, **kwargs):
-        self.model = None
-
-    def _prepare_data(self, dataset: DatasetH):
-        df_train, df_valid = dataset.prepare(
-            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
-        )
-        return transform_df(df_train), transform_df(df_valid)
-
-    def fit(
-        self,
-        dataset: DatasetH,
-        DATASET = 'Alpha158',
-        MODEL_FOLDER = 'qlib_alpha158_model',
-        LABEL_COL = 'LABEL0',
-        LABEL_SHIFT = 5,
-        USE_GPU_ID = 0,
-        **kwargs
-    ):
-        
-        if DATASET not in ALLOW_DATASET:
-            raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
-        
-        dtrain, dvalid = self._prepare_data(dataset)
-        dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
-        dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
-        
-        
-        train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
-        valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
-        
-        ExperimentConfig = expt_settings.configs.ExperimentConfig
-        config = ExperimentConfig(DATASET)
-        self.data_formatter = config.make_data_formatter()
-        self.model_folder = MODEL_FOLDER
-        self.gpu_id = USE_GPU_ID
-        self.label_shift = LABEL_SHIFT
-        self.expt_name = DATASET
-        self.label_col = LABEL_COL
-        
-        use_gpu = (True, self.gpu_id)
-        #===========================Training Process===========================
-        ModelClass = libs.tft_model.TemporalFusionTransformer
-        if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
-             raise ValueError(
-            "Data formatters should inherit from" +
-            "AbstractDataFormatter! Type={}".format(type(self.data_formatter)))
-
-        default_keras_session = tf.keras.backend.get_session()
-
-        if use_gpu[0]:
-            self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
-        else:
-            self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
-
-        self.data_formatter.set_scalers(train)
-
-        # Sets up default params
-        fixed_params = self.data_formatter.get_experiment_params()
-        params = self.data_formatter.get_default_model_params()
-
-        # Wendi: 合并调优的参数和非调优的参数
-        params = {**params, **fixed_params}
-
-        if not os.path.exists(self.model_folder):
-            os.makedirs(self.model_folder)
-        params['model_folder'] = self.model_folder
-
-        print("*** Begin training ***")
-        best_loss = np.Inf
-
-        tf.reset_default_graph()
-        
-        self.tf_graph = tf.Graph()
-        with self.tf_graph.as_default():
-            self.sess = tf.Session(config=self.tf_config)
-            tf.keras.backend.set_session(self.sess)
-            self.model = ModelClass(params, use_cudnn=use_gpu[0])
-            self.sess.run(tf.global_variables_initializer())
-            self.model.fit(train_df=train, valid_df=valid)
-            print("*** Finished training ***")
-            saved_model_dir = self.model_folder+'/'+'saved_model'
-            if not os.path.exists(saved_model_dir):
-                os.makedirs(saved_model_dir)
-            self.model.save(saved_model_dir)
-            
-            def extract_numerical_data(data):
-                """Strips out forecast time and identifier columns."""
-                return data[[
-                    col for col in data.columns
-                    if col not in {"forecast_time", "identifier"}
-                    ]]
-            
-            #p50_loss = utils.numpy_normalised_quantile_loss(
-            #    extract_numerical_data(targets), extract_numerical_data(p50_forecast),
-            #    0.5)
-            #p90_loss = utils.numpy_normalised_quantile_loss(
-            #    extract_numerical_data(targets), extract_numerical_data(p90_forecast),
-            #    0.9)
-            tf.keras.backend.set_session(default_keras_session)
-        print("Training completed.".format(dte.datetime.now()))
-        #===========================Training Process===========================
-
-    def predict(self, dataset):
-        if self.model is None:
-            raise ValueError("model is not fitted yet!")
-        d_test = dataset.prepare("test", col_set=["feature", "label"])
-        d_test = transform_df(d_test)
-        d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
-        test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
-        
-        use_gpu = (True, self.gpu_id)
-        #===========================Predicting Process===========================
-        default_keras_session = tf.keras.backend.get_session()
-        
-        # Sets up default params
-        fixed_params = self.data_formatter.get_experiment_params()
-        params = self.data_formatter.get_default_model_params()
-        params = {**params, **fixed_params}
-        
-        
-        print("*** Begin predicting ***")
-        tf.reset_default_graph()
-        
-        with self.tf_graph.as_default():
-            tf.keras.backend.set_session(self.sess)
-            output_map = self.model.predict(test, return_targets=True)
-            targets = self.data_formatter.format_predictions(output_map["targets"])
-            p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
-            p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
-            tf.keras.backend.set_session(default_keras_session)
-            
-        predict = format_score(p90_forecast, 'pred', self.label_shift)
-        label = format_score(targets, 'label', self.label_shift)
-        #===========================Predicting Process===========================
-        return predict, label
-
-    def finetune(self, dataset: DatasetH):
-        """
-        finetune model
-        Parameters
-        ----------
-        dataset : DatasetH
-            dataset for finetuning
-        """
-        pass
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import tensorflow.compat.v1 as tf
+import data_formatters.base
+import expt_settings.configs
+import libs.hyperparam_opt
+import libs.tft_model
+import libs.utils as utils
+import os
+import datetime as dte
+
+
+from qlib.model.base import ModelFT
+from qlib.data.dataset import DatasetH
+from qlib.data.dataset.handler import DataHandlerLP
+
+
+# To register new datasets, please add them here.
+ALLOW_DATASET = ["Alpha158"]
+DATASET_SETTING = {
+    "Alpha158": {
+        "feature_col": ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10", "ROC60", "RESI10"],
+        "label_col": ["LABEL0"],
+    },
+}
+# To register new datasets, please add their configurations here.
+
+
+def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
+    return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
+
+
+def fill_test_na(test_df):
+    test_df_res = test_df.copy()
+    feature_cols = ~test_df_res.columns.str.contains("label", case=False)
+    test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
+    test_df_res.loc[:, feature_cols] = test_feature_fna
+    return test_df_res
+
+
+def process_qlib_data(df, dataset, fillna=False):
+    """Prepare data to fit the TFT model.
+
+    Args:
+      df: Original DataFrame.
+      fillna: Whether to fill the data with the mean values.
+
+    Returns:
+      Transformed DataFrame.
+
+    """
+    # Several features selected manually
+    feature_col = DATASET_SETTING[dataset]["feature_col"]
+    label_col = DATASET_SETTING[dataset]["label_col"]
+    temp_df = df.loc[:, feature_col + label_col]
+    if fillna:
+        temp_df = fill_test_na(temp_df)
+    temp_df = temp_df.swaplevel()
+    temp_df = temp_df.sort_index()
+    temp_df = temp_df.reset_index(level=0)
+    dates = pd.to_datetime(temp_df.index)
+    temp_df["date"] = dates
+    temp_df["day_of_week"] = dates.dayofweek
+    temp_df["month"] = dates.month
+    temp_df["year"] = dates.year
+    temp_df["const"] = 1.0
+    return temp_df
+
+
+def process_predicted(df, col_name):
+    """Transform the TFT predicted data into Qlib format.
+
+    Args:
+      df: Original DataFrame.
+      fillna: New column name.
+
+    Returns:
+      Transformed DataFrame.
+
+    """
+    df_res = df.copy()
+    df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+0": col_name})
+    df_res = df_res.set_index(["datetime", "instrument"]).sort_index()
+    df_res = df_res[[col_name]]
+    return df_res
+
+
+def format_score(forecast_df, col_name="pred", label_shift=5):
+    pred = process_predicted(forecast_df, col_name=col_name)
+    pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
+    pred = pred.dropna()[col_name]
+    return pred
+
+
+def transform_df(df, col_name="LABEL0"):
+    df_res = df["feature"]
+    df_res[col_name] = df["label"]
+    return df_res
+
+
+class TFTModel(ModelFT):
+    """TFT Model"""
+
+    def __init__(self, **kwargs):
+        self.model = None
+
+    def _prepare_data(self, dataset: DatasetH):
+        df_train, df_valid = dataset.prepare(
+            ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
+        )
+        return transform_df(df_train), transform_df(df_valid)
+
+    def fit(
+        self,
+        dataset: DatasetH,
+        DATASET="Alpha158",
+        MODEL_FOLDER="qlib_alpha158_model",
+        LABEL_COL="LABEL0",
+        LABEL_SHIFT=5,
+        USE_GPU_ID=0,
+        **kwargs
+    ):
+
+        if DATASET not in ALLOW_DATASET:
+            raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
+
+        dtrain, dvalid = self._prepare_data(dataset)
+        dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
+        dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
+
+        train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
+        valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
+
+        ExperimentConfig = expt_settings.configs.ExperimentConfig
+        config = ExperimentConfig(DATASET)
+        self.data_formatter = config.make_data_formatter()
+        self.model_folder = MODEL_FOLDER
+        self.gpu_id = USE_GPU_ID
+        self.label_shift = LABEL_SHIFT
+        self.expt_name = DATASET
+        self.label_col = LABEL_COL
+
+        use_gpu = (True, self.gpu_id)
+        # ===========================Training Process===========================
+        ModelClass = libs.tft_model.TemporalFusionTransformer
+        if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
+            raise ValueError(
+                "Data formatters should inherit from"
+                + "AbstractDataFormatter! Type={}".format(type(self.data_formatter))
+            )
+
+        default_keras_session = tf.keras.backend.get_session()
+
+        if use_gpu[0]:
+            self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
+        else:
+            self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
+
+        self.data_formatter.set_scalers(train)
+
+        # Sets up default params
+        fixed_params = self.data_formatter.get_experiment_params()
+        params = self.data_formatter.get_default_model_params()
+
+        # Wendi: 合并调优的参数和非调优的参数
+        params = {**params, **fixed_params}
+
+        if not os.path.exists(self.model_folder):
+            os.makedirs(self.model_folder)
+        params["model_folder"] = self.model_folder
+
+        print("*** Begin training ***")
+        best_loss = np.Inf
+
+        tf.reset_default_graph()
+
+        self.tf_graph = tf.Graph()
+        with self.tf_graph.as_default():
+            self.sess = tf.Session(config=self.tf_config)
+            tf.keras.backend.set_session(self.sess)
+            self.model = ModelClass(params, use_cudnn=use_gpu[0])
+            self.sess.run(tf.global_variables_initializer())
+            self.model.fit(train_df=train, valid_df=valid)
+            print("*** Finished training ***")
+            saved_model_dir = self.model_folder + "/" + "saved_model"
+            if not os.path.exists(saved_model_dir):
+                os.makedirs(saved_model_dir)
+            self.model.save(saved_model_dir)
+
+            def extract_numerical_data(data):
+                """Strips out forecast time and identifier columns."""
+                return data[[col for col in data.columns if col not in {"forecast_time", "identifier"}]]
+
+            # p50_loss = utils.numpy_normalised_quantile_loss(
+            #    extract_numerical_data(targets), extract_numerical_data(p50_forecast),
+            #    0.5)
+            # p90_loss = utils.numpy_normalised_quantile_loss(
+            #    extract_numerical_data(targets), extract_numerical_data(p90_forecast),
+            #    0.9)
+            tf.keras.backend.set_session(default_keras_session)
+        print("Training completed.".format(dte.datetime.now()))
+        # ===========================Training Process===========================
+
+    def predict(self, dataset):
+        if self.model is None:
+            raise ValueError("model is not fitted yet!")
+        d_test = dataset.prepare("test", col_set=["feature", "label"])
+        d_test = transform_df(d_test)
+        d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
+        test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
+
+        use_gpu = (True, self.gpu_id)
+        # ===========================Predicting Process===========================
+        default_keras_session = tf.keras.backend.get_session()
+
+        # Sets up default params
+        fixed_params = self.data_formatter.get_experiment_params()
+        params = self.data_formatter.get_default_model_params()
+        params = {**params, **fixed_params}
+
+        print("*** Begin predicting ***")
+        tf.reset_default_graph()
+
+        with self.tf_graph.as_default():
+            tf.keras.backend.set_session(self.sess)
+            output_map = self.model.predict(test, return_targets=True)
+            targets = self.data_formatter.format_predictions(output_map["targets"])
+            p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
+            p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
+            tf.keras.backend.set_session(default_keras_session)
+
+        predict = format_score(p90_forecast, "pred", self.label_shift)
+        label = format_score(targets, "label", self.label_shift)
+        # ===========================Predicting Process===========================
+        return predict, label
+
+    def finetune(self, dataset: DatasetH):
+        """
+        finetune model
+        Parameters
+        ----------
+        dataset : DatasetH
+            dataset for finetuning
+        """
+        pass
diff --git a/examples/benchmarks/TFT/workflow_by_code_tft.py b/examples/benchmarks/TFT/workflow_by_code_tft.py
index 593ac468f..64c7d3df5 100644
--- a/examples/benchmarks/TFT/workflow_by_code_tft.py
+++ b/examples/benchmarks/TFT/workflow_by_code_tft.py
@@ -1,130 +1,132 @@
- #Copyright (c) Microsoft Corporation.
- #Licensed under the MIT License.
-
-import sys
-from pathlib import Path
-
-import qlib
-import pandas as pd
-from qlib.config import REG_CN
-from qlib.contrib.model.pytorch_lstm import LSTM
-from qlib.contrib.data.handler import ALPHA360_Denoise
-from qlib.contrib.strategy.strategy import TopkDropoutStrategy
-from qlib.contrib.evaluate import (
-    backtest as normal_backtest,
-    risk_analysis,
-)
-from qlib.utils import exists_qlib_data
-
-# from qlib.model.learner import train_model
-from qlib.utils import init_instance_by_config
-
-import pickle
-from tft import TFTModel
-
-if __name__ == "__main__":
-
-    # use default data
-    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
-    if not exists_qlib_data(provider_uri):
-        print(f"Qlib data is not found in {provider_uri}")
-        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
-        from get_data import GetData
-
-        GetData().qlib_data_cn(target_dir=provider_uri)
-
-    qlib.init(provider_uri=provider_uri, region=REG_CN)
-
-    MARKET = "csi300"
-    BENCHMARK = "SH000300"
-
-    ###################################
-    # train model
-    ###################################
-    DATA_HANDLER_CONFIG = {
-        "start_time": "2008-01-01",
-        "end_time": "2020-08-01",
-        "fit_start_time": "2008-01-01",
-        "fit_end_time": "2014-12-31",
-        "instruments": MARKET,
-    }
-
-    TRAINER_CONFIG = {
-        "train_start_time": "2008-01-01",
-        "train_end_time": "2014-12-31",
-        "validate_start_time": "2015-01-01",
-        "validate_end_time": "2016-12-31",
-        "test_start_time": "2017-01-01",
-        "test_end_time": "2020-08-01",
-    }
-
-    task = {
-        "dataset": {
-            "class": "DatasetH",
-            "module_path": "qlib.data.dataset",
-            "kwargs": {
-                'handler': {
-                    "class": "Alpha158",
-                    "module_path": "qlib.contrib.data.handler",
-                    "kwargs": DATA_HANDLER_CONFIG
-                },
-                'segments': {
-                    'train': ("2008-01-01", "2014-12-31"),
-                    'valid': ("2015-01-01", "2016-12-31",),
-                    'test': ("2017-01-01", "2020-08-01",),
-                }
-            }
-        }
-        # You shoud record the data in specific sequence
-        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
-    }
-    
-
-    model = TFTModel()
-    dataset = init_instance_by_config(task["dataset"])
-    model.fit(dataset)
-
-    pred_score, label_score = model.predict(dataset)
-
-    # save pred_score to file
-    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
-    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
-    pred_score.to_pickle(pred_score_path)
-
-
-    ###################################
-    # backtest
-    ###################################
-    STRATEGY_CONFIG = {
-        "topk": 50,
-        "n_drop": 5,
-    }
-    BACKTEST_CONFIG = {
-        "verbose": False,
-        "limit_threshold": 0.095,
-        "account": 100000000,
-        "benchmark": BENCHMARK,
-        "deal_price": "close",
-        "open_cost": 0.0005,
-        "close_cost": 0.0015,
-        "min_cost": 5,
-    }
-
-    # use default strategy
-    # custom Strategy, refer to: TODO: Strategy API url
-    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
-    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
-
-    ###################################
-    # analyze
-    # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
-    ###################################
-    analysis = dict()
-    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
-    analysis["excess_return_with_cost"] = risk_analysis(
-        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
-    )
-    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
-    print(analysis_df)
-
-
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import sys
+from pathlib import Path
+
+import qlib
+import pandas as pd
+from qlib.config import REG_CN
+from qlib.contrib.model.pytorch_lstm import LSTM
+from qlib.contrib.data.handler import ALPHA360_Denoise
+from qlib.contrib.strategy.strategy import TopkDropoutStrategy
+from qlib.contrib.evaluate import (
+    backtest as normal_backtest,
+    risk_analysis,
+)
+from qlib.utils import exists_qlib_data
+
+# from qlib.model.learner import train_model
+from qlib.utils import init_instance_by_config
+
+import pickle
+from tft import TFTModel
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data_cn(target_dir=provider_uri)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    MARKET = "csi300"
+    BENCHMARK = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    DATA_HANDLER_CONFIG = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": MARKET,
+    }
+
+    TRAINER_CONFIG = {
+        "train_start_time": "2008-01-01",
+        "train_end_time": "2014-12-31",
+        "validate_start_time": "2015-01-01",
+        "validate_end_time": "2016-12-31",
+        "test_start_time": "2017-01-01",
+        "test_end_time": "2020-08-01",
+    }
+
+    task = {
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "Alpha158",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": DATA_HANDLER_CONFIG,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": (
+                        "2015-01-01",
+                        "2016-12-31",
+                    ),
+                    "test": (
+                        "2017-01-01",
+                        "2020-08-01",
+                    ),
+                },
+            },
+        }
+        # You shoud record the data in specific sequence
+        # "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
+    }
+
+    model = TFTModel()
+    dataset = init_instance_by_config(task["dataset"])
+    model.fit(dataset)
+
+    pred_score, label_score = model.predict(dataset)
+
+    # save pred_score to file
+    pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
+    pred_score_path.parent.mkdir(exist_ok=True, parents=True)
+    pred_score.to_pickle(pred_score_path)
+
+    ###################################
+    # backtest
+    ###################################
+    STRATEGY_CONFIG = {
+        "topk": 50,
+        "n_drop": 5,
+    }
+    BACKTEST_CONFIG = {
+        "verbose": False,
+        "limit_threshold": 0.095,
+        "account": 100000000,
+        "benchmark": BENCHMARK,
+        "deal_price": "close",
+        "open_cost": 0.0005,
+        "close_cost": 0.0015,
+        "min_cost": 5,
+    }
+
+    # use default strategy
+    # custom Strategy, refer to: TODO: Strategy API url
+    strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
+    report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
+
+    ###################################
+    # analyze
+    # If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
+    ###################################
+    analysis = dict()
+    analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
+    analysis["excess_return_with_cost"] = risk_analysis(
+        report_normal["return"] - report_normal["bench"] - report_normal["cost"]
+    )
+    analysis_df = pd.concat(analysis)  # type: pd.DataFrame
+    print(analysis_df)