mirror of
https://github.com/microsoft/qlib.git
synced 2026-06-06 05:51:17 +08:00
Format TFT
This commit is contained in:
@@ -1,15 +1,14 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@@ -1,235 +1,223 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Default data formatting functions for experiments.
|
||||
|
||||
For new datasets, inherit form GenericDataFormatter and implement
|
||||
all abstract functions.
|
||||
|
||||
These dataset-specific methods:
|
||||
1) Define the column and input types for tabular dataframes used by model
|
||||
2) Perform the necessary input feature engineering & normalisation steps
|
||||
3) Reverts the normalisation for predictions
|
||||
4) Are responsible for train, validation and test splits
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import abc
|
||||
import enum
|
||||
|
||||
|
||||
# Type defintions
|
||||
class DataTypes(enum.IntEnum):
|
||||
"""Defines numerical types of each column."""
|
||||
REAL_VALUED = 0
|
||||
CATEGORICAL = 1
|
||||
DATE = 2
|
||||
|
||||
|
||||
class InputTypes(enum.IntEnum):
|
||||
"""Defines input types of each column."""
|
||||
TARGET = 0
|
||||
OBSERVED_INPUT = 1
|
||||
KNOWN_INPUT = 2
|
||||
STATIC_INPUT = 3
|
||||
ID = 4 # Single column used as an entity identifier
|
||||
TIME = 5 # Single column exclusively used as a time index
|
||||
|
||||
|
||||
class GenericDataFormatter(abc.ABC):
|
||||
"""Abstract base class for all data formatters.
|
||||
|
||||
User can implement the abstract methods below to perform dataset-specific
|
||||
manipulations.
|
||||
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformation."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def format_predictions(self, df):
|
||||
"""Reverts any normalisation to give predictions in original scale."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def split_data(self, df):
|
||||
"""Performs the default train, validation and test splits."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def _column_definition(self):
|
||||
"""Defines order, input type and data type of each column."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_fixed_params(self):
|
||||
"""Defines the fixed parameters used by the model for training.
|
||||
|
||||
Requires the following keys:
|
||||
'total_time_steps': Defines the total number of time steps used by TFT
|
||||
'num_encoder_steps': Determines length of LSTM encoder (i.e. history)
|
||||
'num_epochs': Maximum number of epochs for training
|
||||
'early_stopping_patience': Early stopping param for keras
|
||||
'multiprocessing_workers': # of cpus for data processing
|
||||
|
||||
|
||||
Returns:
|
||||
A dictionary of fixed parameters, e.g.:
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 252 + 5,
|
||||
'num_encoder_steps': 252,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5,
|
||||
}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
# Shared functions across data-formatters
|
||||
@property
|
||||
def num_classes_per_cat_input(self):
|
||||
"""Returns number of categories per relevant input.
|
||||
|
||||
This is seqeuently required for keras embedding layers.
|
||||
"""
|
||||
return self._num_classes_per_cat_input
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return -1, -1
|
||||
|
||||
def get_column_definition(self):
|
||||
""""Returns formatted column definition in order expected by the TFT."""
|
||||
|
||||
column_definition = self._column_definition
|
||||
|
||||
# Sanity checks first.
|
||||
# Ensure only one ID and time column exist
|
||||
def _check_single_column(input_type):
|
||||
|
||||
length = len([tup for tup in column_definition if tup[2] == input_type])
|
||||
|
||||
if length != 1:
|
||||
raise ValueError('Illegal number of inputs ({}) of type {}'.format(
|
||||
length, input_type))
|
||||
|
||||
_check_single_column(InputTypes.ID)
|
||||
_check_single_column(InputTypes.TIME)
|
||||
|
||||
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
|
||||
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
|
||||
real_inputs = [
|
||||
tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
|
||||
tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
categorical_inputs = [
|
||||
tup for tup in column_definition if tup[1] == DataTypes.CATEGORICAL and
|
||||
tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
return identifier + time + real_inputs + categorical_inputs
|
||||
|
||||
def _get_input_columns(self):
|
||||
"""Returns names of all input columns."""
|
||||
return [
|
||||
tup[0]
|
||||
for tup in self.get_column_definition()
|
||||
if tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
def _get_tft_input_indices(self):
|
||||
"""Returns the relevant indexes and input sizes required by TFT."""
|
||||
|
||||
# Functions
|
||||
def _extract_tuples_from_data_type(data_type, defn):
|
||||
return [
|
||||
tup for tup in defn if tup[1] == data_type and
|
||||
tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
def _get_locations(input_types, defn):
|
||||
return [i for i, tup in enumerate(defn) if tup[2] in input_types]
|
||||
|
||||
# Start extraction
|
||||
column_definition = [
|
||||
tup for tup in self.get_column_definition()
|
||||
if tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL,
|
||||
column_definition)
|
||||
real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED,
|
||||
column_definition)
|
||||
|
||||
locations = {
|
||||
'input_size':
|
||||
len(self._get_input_columns()),
|
||||
'output_size':
|
||||
len(_get_locations({InputTypes.TARGET}, column_definition)),
|
||||
'category_counts':
|
||||
self.num_classes_per_cat_input,
|
||||
'input_obs_loc':
|
||||
_get_locations({InputTypes.TARGET}, column_definition),
|
||||
'static_input_loc':
|
||||
_get_locations({InputTypes.STATIC_INPUT}, column_definition),
|
||||
'known_regular_inputs':
|
||||
_get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
|
||||
real_inputs),
|
||||
'known_categorical_inputs':
|
||||
_get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
|
||||
categorical_inputs),
|
||||
}
|
||||
|
||||
return locations
|
||||
|
||||
def get_experiment_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
required_keys = [
|
||||
'total_time_steps', 'num_encoder_steps', 'num_epochs',
|
||||
'early_stopping_patience', 'multiprocessing_workers'
|
||||
]
|
||||
|
||||
fixed_params = self.get_fixed_params()
|
||||
|
||||
for k in required_keys:
|
||||
if k not in fixed_params:
|
||||
raise ValueError('Field {}'.format(k) +
|
||||
' missing from fixed parameter definitions!')
|
||||
|
||||
fixed_params['column_definition'] = self.get_column_definition()
|
||||
|
||||
fixed_params.update(self._get_tft_input_indices())
|
||||
|
||||
return fixed_params
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Default data formatting functions for experiments.
|
||||
|
||||
For new datasets, inherit form GenericDataFormatter and implement
|
||||
all abstract functions.
|
||||
|
||||
These dataset-specific methods:
|
||||
1) Define the column and input types for tabular dataframes used by model
|
||||
2) Perform the necessary input feature engineering & normalisation steps
|
||||
3) Reverts the normalisation for predictions
|
||||
4) Are responsible for train, validation and test splits
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import abc
|
||||
import enum
|
||||
|
||||
|
||||
# Type defintions
|
||||
class DataTypes(enum.IntEnum):
|
||||
"""Defines numerical types of each column."""
|
||||
|
||||
REAL_VALUED = 0
|
||||
CATEGORICAL = 1
|
||||
DATE = 2
|
||||
|
||||
|
||||
class InputTypes(enum.IntEnum):
|
||||
"""Defines input types of each column."""
|
||||
|
||||
TARGET = 0
|
||||
OBSERVED_INPUT = 1
|
||||
KNOWN_INPUT = 2
|
||||
STATIC_INPUT = 3
|
||||
ID = 4 # Single column used as an entity identifier
|
||||
TIME = 5 # Single column exclusively used as a time index
|
||||
|
||||
|
||||
class GenericDataFormatter(abc.ABC):
|
||||
"""Abstract base class for all data formatters.
|
||||
|
||||
User can implement the abstract methods below to perform dataset-specific
|
||||
manipulations.
|
||||
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformation."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def format_predictions(self, df):
|
||||
"""Reverts any normalisation to give predictions in original scale."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def split_data(self, df):
|
||||
"""Performs the default train, validation and test splits."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def _column_definition(self):
|
||||
"""Defines order, input type and data type of each column."""
|
||||
raise NotImplementedError()
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_fixed_params(self):
|
||||
"""Defines the fixed parameters used by the model for training.
|
||||
|
||||
Requires the following keys:
|
||||
'total_time_steps': Defines the total number of time steps used by TFT
|
||||
'num_encoder_steps': Determines length of LSTM encoder (i.e. history)
|
||||
'num_epochs': Maximum number of epochs for training
|
||||
'early_stopping_patience': Early stopping param for keras
|
||||
'multiprocessing_workers': # of cpus for data processing
|
||||
|
||||
|
||||
Returns:
|
||||
A dictionary of fixed parameters, e.g.:
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 252 + 5,
|
||||
'num_encoder_steps': 252,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5,
|
||||
}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
# Shared functions across data-formatters
|
||||
@property
|
||||
def num_classes_per_cat_input(self):
|
||||
"""Returns number of categories per relevant input.
|
||||
|
||||
This is seqeuently required for keras embedding layers.
|
||||
"""
|
||||
return self._num_classes_per_cat_input
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return -1, -1
|
||||
|
||||
def get_column_definition(self):
|
||||
""""Returns formatted column definition in order expected by the TFT."""
|
||||
|
||||
column_definition = self._column_definition
|
||||
|
||||
# Sanity checks first.
|
||||
# Ensure only one ID and time column exist
|
||||
def _check_single_column(input_type):
|
||||
|
||||
length = len([tup for tup in column_definition if tup[2] == input_type])
|
||||
|
||||
if length != 1:
|
||||
raise ValueError("Illegal number of inputs ({}) of type {}".format(length, input_type))
|
||||
|
||||
_check_single_column(InputTypes.ID)
|
||||
_check_single_column(InputTypes.TIME)
|
||||
|
||||
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
|
||||
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
|
||||
real_inputs = [
|
||||
tup
|
||||
for tup in column_definition
|
||||
if tup[1] == DataTypes.REAL_VALUED and tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
categorical_inputs = [
|
||||
tup
|
||||
for tup in column_definition
|
||||
if tup[1] == DataTypes.CATEGORICAL and tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
return identifier + time + real_inputs + categorical_inputs
|
||||
|
||||
def _get_input_columns(self):
|
||||
"""Returns names of all input columns."""
|
||||
return [tup[0] for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}]
|
||||
|
||||
def _get_tft_input_indices(self):
|
||||
"""Returns the relevant indexes and input sizes required by TFT."""
|
||||
|
||||
# Functions
|
||||
def _extract_tuples_from_data_type(data_type, defn):
|
||||
return [tup for tup in defn if tup[1] == data_type and tup[2] not in {InputTypes.ID, InputTypes.TIME}]
|
||||
|
||||
def _get_locations(input_types, defn):
|
||||
return [i for i, tup in enumerate(defn) if tup[2] in input_types]
|
||||
|
||||
# Start extraction
|
||||
column_definition = [
|
||||
tup for tup in self.get_column_definition() if tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL, column_definition)
|
||||
real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED, column_definition)
|
||||
|
||||
locations = {
|
||||
"input_size": len(self._get_input_columns()),
|
||||
"output_size": len(_get_locations({InputTypes.TARGET}, column_definition)),
|
||||
"category_counts": self.num_classes_per_cat_input,
|
||||
"input_obs_loc": _get_locations({InputTypes.TARGET}, column_definition),
|
||||
"static_input_loc": _get_locations({InputTypes.STATIC_INPUT}, column_definition),
|
||||
"known_regular_inputs": _get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, real_inputs),
|
||||
"known_categorical_inputs": _get_locations(
|
||||
{InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT}, categorical_inputs
|
||||
),
|
||||
}
|
||||
|
||||
return locations
|
||||
|
||||
def get_experiment_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
required_keys = [
|
||||
"total_time_steps",
|
||||
"num_encoder_steps",
|
||||
"num_epochs",
|
||||
"early_stopping_patience",
|
||||
"multiprocessing_workers",
|
||||
]
|
||||
|
||||
fixed_params = self.get_fixed_params()
|
||||
|
||||
for k in required_keys:
|
||||
if k not in fixed_params:
|
||||
raise ValueError("Field {}".format(k) + " missing from fixed parameter definitions!")
|
||||
|
||||
fixed_params["column_definition"] = self.get_column_definition()
|
||||
|
||||
fixed_params.update(self._get_tft_input_indices())
|
||||
|
||||
return fixed_params
|
||||
|
||||
@@ -1,261 +1,254 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Electricity dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations. Uses
|
||||
entity specific z-score normalization.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import pandas as pd
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class ElectricityFormatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the electricity dataset.
|
||||
|
||||
Note that per-entity z-score normalization is used here, and is implemented
|
||||
across functions.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
('id', DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
|
||||
('power_usage', DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
('hour', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
self._time_steps = self.get_fixed_params()['total_time_steps']
|
||||
|
||||
def split_data(self, df, valid_boundary=1315, test_boundary=1339):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print('Formatting train-valid-test splits.')
|
||||
|
||||
index = df['days_from_start']
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary - 7]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print('Setting scalers with training data...')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
|
||||
column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
|
||||
column_definitions)
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
# Initialise scaler caches
|
||||
self._real_scalers = {}
|
||||
self._target_scaler = {}
|
||||
identifiers = []
|
||||
for identifier, sliced in df.groupby(id_column):
|
||||
|
||||
if len(sliced) >= self._time_steps:
|
||||
|
||||
data = sliced[real_inputs].values
|
||||
targets = sliced[[target_column]].values
|
||||
self._real_scalers[identifier] \
|
||||
= sklearn.preprocessing.StandardScaler().fit(data)
|
||||
|
||||
self._target_scaler[identifier] \
|
||||
= sklearn.preprocessing.StandardScaler().fit(targets)
|
||||
identifiers.append(identifier)
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
|
||||
srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = identifiers
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError('Scalers have not been set!')
|
||||
|
||||
# Extract relevant columns
|
||||
column_definitions = self.get_column_definition()
|
||||
id_col = utils.get_single_col_by_input_type(InputTypes.ID,
|
||||
column_definitions)
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
# Transform real inputs per entity
|
||||
df_list = []
|
||||
for identifier, sliced in df.groupby(id_col):
|
||||
|
||||
# Filter out any trajectories that are too short
|
||||
if len(sliced) >= self._time_steps:
|
||||
sliced_copy = sliced.copy()
|
||||
sliced_copy[real_inputs] = self._real_scalers[identifier].transform(
|
||||
sliced_copy[real_inputs].values)
|
||||
df_list.append(sliced_copy)
|
||||
|
||||
output = pd.concat(df_list, axis=0)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
|
||||
if self._target_scaler is None:
|
||||
raise ValueError('Scalers have not been set!')
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
df_list = []
|
||||
for identifier, sliced in predictions.groupby('identifier'):
|
||||
sliced_copy = sliced.copy()
|
||||
target_scaler = self._target_scaler[identifier]
|
||||
|
||||
for col in column_names:
|
||||
if col not in {'forecast_time', 'identifier'}:
|
||||
sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
|
||||
df_list.append(sliced_copy)
|
||||
|
||||
output = pd.concat(df_list, axis=0)
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 8 * 24,
|
||||
'num_encoder_steps': 7 * 24,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
'dropout_rate': 0.1,
|
||||
'hidden_layer_size': 160,
|
||||
'learning_rate': 0.001,
|
||||
'minibatch_size': 64,
|
||||
'max_gradient_norm': 0.01,
|
||||
'num_heads': 4,
|
||||
'stack_size': 1
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Electricity dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations. Uses
|
||||
entity specific z-score normalization.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import pandas as pd
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class ElectricityFormatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the electricity dataset.
|
||||
|
||||
Note that per-entity z-score normalization is used here, and is implemented
|
||||
across functions.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
("id", DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
("hours_from_start", DataTypes.REAL_VALUED, InputTypes.TIME),
|
||||
("power_usage", DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
("hour", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("day_of_week", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("hours_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("categorical_id", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
self._time_steps = self.get_fixed_params()["total_time_steps"]
|
||||
|
||||
def split_data(self, df, valid_boundary=1315, test_boundary=1339):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print("Formatting train-valid-test splits.")
|
||||
|
||||
index = df["days_from_start"]
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary - 7]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print("Setting scalers with training data...")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
# Initialise scaler caches
|
||||
self._real_scalers = {}
|
||||
self._target_scaler = {}
|
||||
identifiers = []
|
||||
for identifier, sliced in df.groupby(id_column):
|
||||
|
||||
if len(sliced) >= self._time_steps:
|
||||
|
||||
data = sliced[real_inputs].values
|
||||
targets = sliced[[target_column]].values
|
||||
self._real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
|
||||
|
||||
self._target_scaler[identifier] = sklearn.preprocessing.StandardScaler().fit(targets)
|
||||
identifiers.append(identifier)
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = identifiers
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError("Scalers have not been set!")
|
||||
|
||||
# Extract relevant columns
|
||||
column_definitions = self.get_column_definition()
|
||||
id_col = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
# Transform real inputs per entity
|
||||
df_list = []
|
||||
for identifier, sliced in df.groupby(id_col):
|
||||
|
||||
# Filter out any trajectories that are too short
|
||||
if len(sliced) >= self._time_steps:
|
||||
sliced_copy = sliced.copy()
|
||||
sliced_copy[real_inputs] = self._real_scalers[identifier].transform(sliced_copy[real_inputs].values)
|
||||
df_list.append(sliced_copy)
|
||||
|
||||
output = pd.concat(df_list, axis=0)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
|
||||
if self._target_scaler is None:
|
||||
raise ValueError("Scalers have not been set!")
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
df_list = []
|
||||
for identifier, sliced in predictions.groupby("identifier"):
|
||||
sliced_copy = sliced.copy()
|
||||
target_scaler = self._target_scaler[identifier]
|
||||
|
||||
for col in column_names:
|
||||
if col not in {"forecast_time", "identifier"}:
|
||||
sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
|
||||
df_list.append(sliced_copy)
|
||||
|
||||
output = pd.concat(df_list, axis=0)
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
"total_time_steps": 8 * 24,
|
||||
"num_encoder_steps": 7 * 24,
|
||||
"num_epochs": 100,
|
||||
"early_stopping_patience": 5,
|
||||
"multiprocessing_workers": 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
"dropout_rate": 0.1,
|
||||
"hidden_layer_size": 160,
|
||||
"learning_rate": 0.001,
|
||||
"minibatch_size": 64,
|
||||
"max_gradient_norm": 0.01,
|
||||
"num_heads": 4,
|
||||
"stack_size": 1,
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
|
||||
@@ -1,327 +1,333 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Favorita dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import pandas as pd
|
||||
import sklearn.preprocessing
|
||||
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class FavoritaFormatter(data_formatters.base.GenericDataFormatter):
|
||||
"""Defines and formats data for the Favorita dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
('traj_id', DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
('date', DataTypes.DATE, InputTypes.TIME),
|
||||
('log_sales', DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
('onpromotion', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('transactions', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('oil', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('day_of_month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('national_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('regional_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('local_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('open', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('item_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('store_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('city', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('state', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('type', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('cluster', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('family', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('class', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
('perishable', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT)
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=None, test_boundary=None):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print('Formatting train-valid-test splits.')
|
||||
|
||||
if valid_boundary is None:
|
||||
valid_boundary = pd.datetime(2015, 12, 1)
|
||||
|
||||
fixed_params = self.get_fixed_params()
|
||||
time_steps = fixed_params['total_time_steps']
|
||||
lookback = fixed_params['num_encoder_steps']
|
||||
forecast_horizon = time_steps - lookback
|
||||
|
||||
df['date'] = pd.to_datetime(df['date'])
|
||||
df_lists = {'train': [], 'valid': [], 'test': []}
|
||||
for _, sliced in df.groupby('traj_id'):
|
||||
index = sliced['date']
|
||||
train = sliced.loc[index < valid_boundary]
|
||||
train_len = len(train)
|
||||
valid_len = train_len + forecast_horizon
|
||||
valid = sliced.iloc[train_len - lookback:valid_len, :]
|
||||
test = sliced.iloc[valid_len - lookback:valid_len + forecast_horizon, :]
|
||||
|
||||
sliced_map = {'train': train, 'valid': valid, 'test': test}
|
||||
|
||||
for k in sliced_map:
|
||||
item = sliced_map[k]
|
||||
|
||||
if len(item) >= time_steps:
|
||||
df_lists[k].append(item)
|
||||
|
||||
dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists}
|
||||
|
||||
train = dfs['train']
|
||||
self.set_scalers(train, set_real=True)
|
||||
|
||||
# Use all data for label encoding to handle labels not present in training.
|
||||
self.set_scalers(df, set_real=False)
|
||||
|
||||
# Filter out identifiers not present in training (i.e. cold-started items).
|
||||
def filter_ids(frame):
|
||||
identifiers = set(self.identifiers)
|
||||
index = frame['traj_id']
|
||||
return frame.loc[index.apply(lambda x: x in identifiers)]
|
||||
|
||||
valid = filter_ids(dfs['valid'])
|
||||
test = filter_ids(dfs['test'])
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df, set_real=True):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Label encoding is applied to the entire dataset (i.e. including test),
|
||||
so that unseen labels can be handled at run-time.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
set_real: Whether to fit set real-valued or categorical scalers
|
||||
"""
|
||||
print('Setting scalers with training data...')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
|
||||
column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
|
||||
column_definitions)
|
||||
|
||||
if set_real:
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
self._real_scalers = {}
|
||||
for col in ['oil', 'transactions', 'log_sales']:
|
||||
self._real_scalers[col] = (df[col].mean(), df[col].std())
|
||||
|
||||
self._target_scaler = (df[target_column].mean(), df[target_column].std())
|
||||
|
||||
else:
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
if self.identifiers is None:
|
||||
raise ValueError('Scale real-valued inputs first!')
|
||||
id_set = set(self.identifiers)
|
||||
valid_idx = df['traj_id'].apply(lambda x: x in id_set)
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str).loc[valid_idx]
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
|
||||
srs.values)
|
||||
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError('Scalers have not been set!')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
# Format real inputs
|
||||
for col in ['log_sales', 'oil', 'transactions']:
|
||||
mean, std = self._real_scalers[col]
|
||||
output[col] = (df[col] - mean) / std
|
||||
|
||||
if col == 'log_sales':
|
||||
output[col] = output[col].fillna(0.) # mean imputation
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
mean, std = self._target_scaler
|
||||
for col in column_names:
|
||||
if col not in {'forecast_time', 'identifier'}:
|
||||
output[col] = (predictions[col] * std) + mean
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 120,
|
||||
'num_encoder_steps': 90,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
'dropout_rate': 0.1,
|
||||
'hidden_layer_size': 240,
|
||||
'learning_rate': 0.001,
|
||||
'minibatch_size': 128,
|
||||
'max_gradient_norm': 100.,
|
||||
'num_heads': 4,
|
||||
'stack_size': 1
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
|
||||
def get_column_definition(self):
|
||||
""""Formats column definition in order expected by the TFT.
|
||||
|
||||
Modified for Favorita to match column order of original experiment.
|
||||
|
||||
Returns:
|
||||
Favorita-specific column definition
|
||||
"""
|
||||
|
||||
column_definition = self._column_definition
|
||||
|
||||
# Sanity checks first.
|
||||
# Ensure only one ID and time column exist
|
||||
def _check_single_column(input_type):
|
||||
|
||||
length = len([tup for tup in column_definition if tup[2] == input_type])
|
||||
|
||||
if length != 1:
|
||||
raise ValueError('Illegal number of inputs ({}) of type {}'.format(
|
||||
length, input_type))
|
||||
|
||||
_check_single_column(InputTypes.ID)
|
||||
_check_single_column(InputTypes.TIME)
|
||||
|
||||
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
|
||||
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
|
||||
real_inputs = [
|
||||
tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
|
||||
tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
col_definition_map = {tup[0]: tup for tup in column_definition}
|
||||
col_order = [
|
||||
'item_nbr', 'store_nbr', 'city', 'state', 'type', 'cluster', 'family',
|
||||
'class', 'perishable', 'onpromotion', 'day_of_week', 'national_hol',
|
||||
'regional_hol', 'local_hol'
|
||||
]
|
||||
categorical_inputs = [
|
||||
col_definition_map[k] for k in col_order if k in col_definition_map
|
||||
]
|
||||
|
||||
return identifier + time + real_inputs + categorical_inputs
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Favorita dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import pandas as pd
|
||||
import sklearn.preprocessing
|
||||
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class FavoritaFormatter(data_formatters.base.GenericDataFormatter):
|
||||
"""Defines and formats data for the Favorita dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
("traj_id", DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
("date", DataTypes.DATE, InputTypes.TIME),
|
||||
("log_sales", DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
("onpromotion", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("transactions", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("oil", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("day_of_month", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("month", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("national_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("regional_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("local_hol", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("open", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("item_nbr", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("store_nbr", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("city", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("state", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("type", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("cluster", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("family", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("class", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
("perishable", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=None, test_boundary=None):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print("Formatting train-valid-test splits.")
|
||||
|
||||
if valid_boundary is None:
|
||||
valid_boundary = pd.datetime(2015, 12, 1)
|
||||
|
||||
fixed_params = self.get_fixed_params()
|
||||
time_steps = fixed_params["total_time_steps"]
|
||||
lookback = fixed_params["num_encoder_steps"]
|
||||
forecast_horizon = time_steps - lookback
|
||||
|
||||
df["date"] = pd.to_datetime(df["date"])
|
||||
df_lists = {"train": [], "valid": [], "test": []}
|
||||
for _, sliced in df.groupby("traj_id"):
|
||||
index = sliced["date"]
|
||||
train = sliced.loc[index < valid_boundary]
|
||||
train_len = len(train)
|
||||
valid_len = train_len + forecast_horizon
|
||||
valid = sliced.iloc[train_len - lookback : valid_len, :]
|
||||
test = sliced.iloc[valid_len - lookback : valid_len + forecast_horizon, :]
|
||||
|
||||
sliced_map = {"train": train, "valid": valid, "test": test}
|
||||
|
||||
for k in sliced_map:
|
||||
item = sliced_map[k]
|
||||
|
||||
if len(item) >= time_steps:
|
||||
df_lists[k].append(item)
|
||||
|
||||
dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists}
|
||||
|
||||
train = dfs["train"]
|
||||
self.set_scalers(train, set_real=True)
|
||||
|
||||
# Use all data for label encoding to handle labels not present in training.
|
||||
self.set_scalers(df, set_real=False)
|
||||
|
||||
# Filter out identifiers not present in training (i.e. cold-started items).
|
||||
def filter_ids(frame):
|
||||
identifiers = set(self.identifiers)
|
||||
index = frame["traj_id"]
|
||||
return frame.loc[index.apply(lambda x: x in identifiers)]
|
||||
|
||||
valid = filter_ids(dfs["valid"])
|
||||
test = filter_ids(dfs["test"])
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df, set_real=True):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Label encoding is applied to the entire dataset (i.e. including test),
|
||||
so that unseen labels can be handled at run-time.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
set_real: Whether to fit set real-valued or categorical scalers
|
||||
"""
|
||||
print("Setting scalers with training data...")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
|
||||
|
||||
if set_real:
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
self._real_scalers = {}
|
||||
for col in ["oil", "transactions", "log_sales"]:
|
||||
self._real_scalers[col] = (df[col].mean(), df[col].std())
|
||||
|
||||
self._target_scaler = (df[target_column].mean(), df[target_column].std())
|
||||
|
||||
else:
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
if self.identifiers is None:
|
||||
raise ValueError("Scale real-valued inputs first!")
|
||||
id_set = set(self.identifiers)
|
||||
valid_idx = df["traj_id"].apply(lambda x: x in id_set)
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str).loc[valid_idx]
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
|
||||
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError("Scalers have not been set!")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
# Format real inputs
|
||||
for col in ["log_sales", "oil", "transactions"]:
|
||||
mean, std = self._real_scalers[col]
|
||||
output[col] = (df[col] - mean) / std
|
||||
|
||||
if col == "log_sales":
|
||||
output[col] = output[col].fillna(0.0) # mean imputation
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
mean, std = self._target_scaler
|
||||
for col in column_names:
|
||||
if col not in {"forecast_time", "identifier"}:
|
||||
output[col] = (predictions[col] * std) + mean
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
"total_time_steps": 120,
|
||||
"num_encoder_steps": 90,
|
||||
"num_epochs": 100,
|
||||
"early_stopping_patience": 5,
|
||||
"multiprocessing_workers": 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
"dropout_rate": 0.1,
|
||||
"hidden_layer_size": 240,
|
||||
"learning_rate": 0.001,
|
||||
"minibatch_size": 128,
|
||||
"max_gradient_norm": 100.0,
|
||||
"num_heads": 4,
|
||||
"stack_size": 1,
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
|
||||
def get_column_definition(self):
|
||||
""" "Formats column definition in order expected by the TFT.
|
||||
|
||||
Modified for Favorita to match column order of original experiment.
|
||||
|
||||
Returns:
|
||||
Favorita-specific column definition
|
||||
"""
|
||||
|
||||
column_definition = self._column_definition
|
||||
|
||||
# Sanity checks first.
|
||||
# Ensure only one ID and time column exist
|
||||
def _check_single_column(input_type):
|
||||
|
||||
length = len([tup for tup in column_definition if tup[2] == input_type])
|
||||
|
||||
if length != 1:
|
||||
raise ValueError("Illegal number of inputs ({}) of type {}".format(length, input_type))
|
||||
|
||||
_check_single_column(InputTypes.ID)
|
||||
_check_single_column(InputTypes.TIME)
|
||||
|
||||
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
|
||||
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
|
||||
real_inputs = [
|
||||
tup
|
||||
for tup in column_definition
|
||||
if tup[1] == DataTypes.REAL_VALUED and tup[2] not in {InputTypes.ID, InputTypes.TIME}
|
||||
]
|
||||
|
||||
col_definition_map = {tup[0]: tup for tup in column_definition}
|
||||
col_order = [
|
||||
"item_nbr",
|
||||
"store_nbr",
|
||||
"city",
|
||||
"state",
|
||||
"type",
|
||||
"cluster",
|
||||
"family",
|
||||
"class",
|
||||
"perishable",
|
||||
"onpromotion",
|
||||
"day_of_week",
|
||||
"national_hol",
|
||||
"regional_hol",
|
||||
"local_hol",
|
||||
]
|
||||
categorical_inputs = [col_definition_map[k] for k in col_order if k in col_definition_map]
|
||||
|
||||
return identifier + time + real_inputs + categorical_inputs
|
||||
|
||||
@@ -1,220 +1,219 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Alpha158 dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
class Alpha158Formatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the Alpha158 dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
('instrument', DataTypes.CATEGORICAL, InputTypes.ID),
|
||||
('LABEL0', DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
('date', DataTypes.DATE, InputTypes.TIME),
|
||||
('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
# Selected 10 features
|
||||
('RESI5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('WVMA5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('RSQR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('KLEN', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('RSQR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('CORR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('CORD5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('CORR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('ROC60', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('RESI10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('const', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print('Formatting train-valid-test splits.')
|
||||
|
||||
index = df['year']
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print('Setting scalers with training data...')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
|
||||
column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
|
||||
column_definitions)
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
data = df[real_inputs].values
|
||||
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
|
||||
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
|
||||
df[[target_column]].values) # used for predictions
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
|
||||
srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError('Scalers have not been set!')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
# Format real inputs
|
||||
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
for col in column_names:
|
||||
if col not in {'forecast_time', 'identifier'}:
|
||||
output[col] = self._target_scaler.inverse_transform(predictions[col])
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 16 + 6,
|
||||
'num_encoder_steps': 16,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
'dropout_rate': 0.3,
|
||||
'hidden_layer_size': 160,
|
||||
'learning_rate': 0.01,
|
||||
'minibatch_size': 64,
|
||||
'max_gradient_norm': 0.01,
|
||||
'num_heads': 1,
|
||||
'stack_size': 1
|
||||
}
|
||||
|
||||
return model_params
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Alpha158 dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class Alpha158Formatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the Alpha158 dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
("instrument", DataTypes.CATEGORICAL, InputTypes.ID),
|
||||
("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
("date", DataTypes.DATE, InputTypes.TIME),
|
||||
("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
# Selected 10 features
|
||||
("RESI5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("WVMA5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("RSQR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("KLEN", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("RSQR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("CORR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("CORD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("CORR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("ROC60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("RESI10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print("Formatting train-valid-test splits.")
|
||||
|
||||
index = df["year"]
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print("Setting scalers with training data...")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
data = df[real_inputs].values
|
||||
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
|
||||
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
|
||||
df[[target_column]].values
|
||||
) # used for predictions
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError("Scalers have not been set!")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
# Format real inputs
|
||||
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
for col in column_names:
|
||||
if col not in {"forecast_time", "identifier"}:
|
||||
output[col] = self._target_scaler.inverse_transform(predictions[col])
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
"total_time_steps": 16 + 6,
|
||||
"num_encoder_steps": 16,
|
||||
"num_epochs": 100,
|
||||
"early_stopping_patience": 5,
|
||||
"multiprocessing_workers": 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
"dropout_rate": 0.3,
|
||||
"hidden_layer_size": 160,
|
||||
"learning_rate": 0.01,
|
||||
"minibatch_size": 64,
|
||||
"max_gradient_norm": 0.01,
|
||||
"num_heads": 1,
|
||||
"stack_size": 1,
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
@@ -1,117 +1,117 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Traffic dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations. This also
|
||||
performs z-score normalization across the entire dataset, hence re-uses most of
|
||||
the same functions as volatility.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import data_formatters.volatility
|
||||
|
||||
VolatilityFormatter = data_formatters.volatility.VolatilityFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class TrafficFormatter(VolatilityFormatter):
|
||||
"""Defines and formats data for the traffic dataset.
|
||||
|
||||
This also performs z-score normalization across the entire dataset, hence
|
||||
re-uses most of the same functions as volatility.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
('id', DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
|
||||
('values', DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
('time_on_day', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def split_data(self, df, valid_boundary=151, test_boundary=166):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print('Formatting train-valid-test splits.')
|
||||
|
||||
index = df['sensor_day']
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary - 7]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 8 * 24,
|
||||
'num_encoder_steps': 7 * 24,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
'dropout_rate': 0.3,
|
||||
'hidden_layer_size': 320,
|
||||
'learning_rate': 0.001,
|
||||
'minibatch_size': 128,
|
||||
'max_gradient_norm': 100.,
|
||||
'num_heads': 4,
|
||||
'stack_size': 1
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Traffic dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations. This also
|
||||
performs z-score normalization across the entire dataset, hence re-uses most of
|
||||
the same functions as volatility.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import data_formatters.volatility
|
||||
|
||||
VolatilityFormatter = data_formatters.volatility.VolatilityFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class TrafficFormatter(VolatilityFormatter):
|
||||
"""Defines and formats data for the traffic dataset.
|
||||
|
||||
This also performs z-score normalization across the entire dataset, hence
|
||||
re-uses most of the same functions as volatility.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
("id", DataTypes.REAL_VALUED, InputTypes.ID),
|
||||
("hours_from_start", DataTypes.REAL_VALUED, InputTypes.TIME),
|
||||
("values", DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
("time_on_day", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("day_of_week", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("hours_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("categorical_id", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def split_data(self, df, valid_boundary=151, test_boundary=166):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print("Formatting train-valid-test splits.")
|
||||
|
||||
index = df["sensor_day"]
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary - 7]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
"total_time_steps": 8 * 24,
|
||||
"num_encoder_steps": 7 * 24,
|
||||
"num_epochs": 100,
|
||||
"early_stopping_patience": 5,
|
||||
"multiprocessing_workers": 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
"dropout_rate": 0.3,
|
||||
"hidden_layer_size": 320,
|
||||
"learning_rate": 0.001,
|
||||
"minibatch_size": 128,
|
||||
"max_gradient_norm": 100.0,
|
||||
"num_heads": 4,
|
||||
"stack_size": 1,
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
def get_num_samples_for_calibration(self):
|
||||
"""Gets the default number of training and validation samples.
|
||||
|
||||
Use to sub-sample the data for network calibration and a value of -1 uses
|
||||
all available samples.
|
||||
|
||||
Returns:
|
||||
Tuple of (training samples, validation samples)
|
||||
"""
|
||||
return 450000, 50000
|
||||
|
||||
@@ -1,214 +1,212 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Volatility dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class VolatilityFormatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the volatility dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
('Symbol', DataTypes.CATEGORICAL, InputTypes.ID),
|
||||
('date', DataTypes.DATE, InputTypes.TIME),
|
||||
('log_vol', DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
('open_to_close', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
('days_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('day_of_month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('week_of_year', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
('Region', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print('Formatting train-valid-test splits.')
|
||||
|
||||
index = df['year']
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print('Setting scalers with training data...')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
|
||||
column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
|
||||
column_definitions)
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
data = df[real_inputs].values
|
||||
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
|
||||
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
|
||||
df[[target_column]].values) # used for predictions
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
|
||||
srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError('Scalers have not been set!')
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions,
|
||||
{InputTypes.ID, InputTypes.TIME})
|
||||
|
||||
# Format real inputs
|
||||
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
for col in column_names:
|
||||
if col not in {'forecast_time', 'identifier'}:
|
||||
output[col] = self._target_scaler.inverse_transform(predictions[col])
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
'total_time_steps': 252 + 5,
|
||||
'num_encoder_steps': 252,
|
||||
'num_epochs': 100,
|
||||
'early_stopping_patience': 5,
|
||||
'multiprocessing_workers': 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
'dropout_rate': 0.3,
|
||||
'hidden_layer_size': 160,
|
||||
'learning_rate': 0.01,
|
||||
'minibatch_size': 64,
|
||||
'max_gradient_norm': 0.01,
|
||||
'num_heads': 1,
|
||||
'stack_size': 1
|
||||
}
|
||||
|
||||
return model_params
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Custom formatting functions for Volatility dataset.
|
||||
|
||||
Defines dataset specific column definitions and data transformations.
|
||||
"""
|
||||
|
||||
import data_formatters.base
|
||||
import libs.utils as utils
|
||||
import sklearn.preprocessing
|
||||
|
||||
GenericDataFormatter = data_formatters.base.GenericDataFormatter
|
||||
DataTypes = data_formatters.base.DataTypes
|
||||
InputTypes = data_formatters.base.InputTypes
|
||||
|
||||
|
||||
class VolatilityFormatter(GenericDataFormatter):
|
||||
"""Defines and formats data for the volatility dataset.
|
||||
|
||||
Attributes:
|
||||
column_definition: Defines input and data type of column used in the
|
||||
experiment.
|
||||
identifiers: Entity identifiers used in experiments.
|
||||
"""
|
||||
|
||||
_column_definition = [
|
||||
("Symbol", DataTypes.CATEGORICAL, InputTypes.ID),
|
||||
("date", DataTypes.DATE, InputTypes.TIME),
|
||||
("log_vol", DataTypes.REAL_VALUED, InputTypes.TARGET),
|
||||
("open_to_close", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
|
||||
("days_from_start", DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
|
||||
("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("day_of_month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("week_of_year", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
|
||||
("Region", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
"""Initialises formatter."""
|
||||
|
||||
self.identifiers = None
|
||||
self._real_scalers = None
|
||||
self._cat_scalers = None
|
||||
self._target_scaler = None
|
||||
self._num_classes_per_cat_input = None
|
||||
|
||||
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
|
||||
"""Splits data frame into training-validation-test data frames.
|
||||
|
||||
This also calibrates scaling object, and transforms data for each split.
|
||||
|
||||
Args:
|
||||
df: Source data frame to split.
|
||||
valid_boundary: Starting year for validation data
|
||||
test_boundary: Starting year for test data
|
||||
|
||||
Returns:
|
||||
Tuple of transformed (train, valid, test) data.
|
||||
"""
|
||||
|
||||
print("Formatting train-valid-test splits.")
|
||||
|
||||
index = df["year"]
|
||||
train = df.loc[index < valid_boundary]
|
||||
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
|
||||
test = df.loc[index >= test_boundary]
|
||||
|
||||
self.set_scalers(train)
|
||||
|
||||
return (self.transform_inputs(data) for data in [train, valid, test])
|
||||
|
||||
def set_scalers(self, df):
|
||||
"""Calibrates scalers using the data supplied.
|
||||
|
||||
Args:
|
||||
df: Data to use to calibrate scalers.
|
||||
"""
|
||||
print("Setting scalers with training data...")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions)
|
||||
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions)
|
||||
|
||||
# Extract identifiers in case required
|
||||
self.identifiers = list(df[id_column].unique())
|
||||
|
||||
# Format real scalers
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
data = df[real_inputs].values
|
||||
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
|
||||
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
|
||||
df[[target_column]].values
|
||||
) # used for predictions
|
||||
|
||||
# Format categorical scalers
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
categorical_scalers = {}
|
||||
num_classes = []
|
||||
for col in categorical_inputs:
|
||||
# Set all to str so that we don't have mixed integer/string columns
|
||||
srs = df[col].apply(str)
|
||||
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
|
||||
num_classes.append(srs.nunique())
|
||||
|
||||
# Set categorical scaler outputs
|
||||
self._cat_scalers = categorical_scalers
|
||||
self._num_classes_per_cat_input = num_classes
|
||||
|
||||
def transform_inputs(self, df):
|
||||
"""Performs feature transformations.
|
||||
|
||||
This includes both feature engineering, preprocessing and normalisation.
|
||||
|
||||
Args:
|
||||
df: Data frame to transform.
|
||||
|
||||
Returns:
|
||||
Transformed data frame.
|
||||
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
if self._real_scalers is None and self._cat_scalers is None:
|
||||
raise ValueError("Scalers have not been set!")
|
||||
|
||||
column_definitions = self.get_column_definition()
|
||||
|
||||
real_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
categorical_inputs = utils.extract_cols_from_data_type(
|
||||
DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME}
|
||||
)
|
||||
|
||||
# Format real inputs
|
||||
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
|
||||
|
||||
# Format categorical inputs
|
||||
for col in categorical_inputs:
|
||||
string_df = df[col].apply(str)
|
||||
output[col] = self._cat_scalers[col].transform(string_df)
|
||||
|
||||
return output
|
||||
|
||||
def format_predictions(self, predictions):
|
||||
"""Reverts any normalisation to give predictions in original scale.
|
||||
|
||||
Args:
|
||||
predictions: Dataframe of model predictions.
|
||||
|
||||
Returns:
|
||||
Data frame of unnormalised predictions.
|
||||
"""
|
||||
output = predictions.copy()
|
||||
|
||||
column_names = predictions.columns
|
||||
|
||||
for col in column_names:
|
||||
if col not in {"forecast_time", "identifier"}:
|
||||
output[col] = self._target_scaler.inverse_transform(predictions[col])
|
||||
|
||||
return output
|
||||
|
||||
# Default params
|
||||
def get_fixed_params(self):
|
||||
"""Returns fixed model parameters for experiments."""
|
||||
|
||||
fixed_params = {
|
||||
"total_time_steps": 252 + 5,
|
||||
"num_encoder_steps": 252,
|
||||
"num_epochs": 100,
|
||||
"early_stopping_patience": 5,
|
||||
"multiprocessing_workers": 5,
|
||||
}
|
||||
|
||||
return fixed_params
|
||||
|
||||
def get_default_model_params(self):
|
||||
"""Returns default optimised model parameters."""
|
||||
|
||||
model_params = {
|
||||
"dropout_rate": 0.3,
|
||||
"hidden_layer_size": 160,
|
||||
"learning_rate": 0.01,
|
||||
"minibatch_size": 64,
|
||||
"max_gradient_norm": 0.01,
|
||||
"num_heads": 1,
|
||||
"stack_size": 1,
|
||||
}
|
||||
|
||||
return model_params
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@@ -1,111 +1,107 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Default configs for TFT experiments.
|
||||
|
||||
Contains the default output paths for data, serialised models and predictions
|
||||
for the main experiments used in the publication.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import data_formatters.electricity
|
||||
import data_formatters.favorita
|
||||
import data_formatters.traffic
|
||||
import data_formatters.volatility
|
||||
import data_formatters.qlib_Alpha158
|
||||
|
||||
|
||||
class ExperimentConfig(object):
|
||||
"""Defines experiment configs and paths to outputs.
|
||||
|
||||
Attributes:
|
||||
root_folder: Root folder to contain all experimental outputs.
|
||||
experiment: Name of experiment to run.
|
||||
data_folder: Folder to store data for experiment.
|
||||
model_folder: Folder to store serialised models.
|
||||
results_folder: Folder to store results.
|
||||
data_csv_path: Path to primary data csv file used in experiment.
|
||||
hyperparam_iterations: Default number of random search iterations for
|
||||
experiment.
|
||||
"""
|
||||
|
||||
default_experiments = ['volatility', 'electricity', 'traffic', 'favorita', 'Alpha158']
|
||||
|
||||
def __init__(self, experiment='volatility', root_folder=None):
|
||||
"""Creates configs based on default experiment chosen.
|
||||
|
||||
Args:
|
||||
experiment: Name of experiment.
|
||||
root_folder: Root folder to save all outputs of training.
|
||||
"""
|
||||
|
||||
if experiment not in self.default_experiments:
|
||||
raise ValueError('Unrecognised experiment={}'.format(experiment))
|
||||
|
||||
# Defines all relevant paths
|
||||
if root_folder is None:
|
||||
root_folder = os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), '..', 'outputs')
|
||||
print('Using root folder {}'.format(root_folder))
|
||||
|
||||
self.root_folder = root_folder
|
||||
self.experiment = experiment
|
||||
self.data_folder = os.path.join(root_folder, 'data', experiment)
|
||||
self.model_folder = os.path.join(root_folder, 'saved_models', experiment)
|
||||
self.results_folder = os.path.join(root_folder, 'results', experiment)
|
||||
|
||||
# Creates folders if they don't exist
|
||||
for relevant_directory in [
|
||||
self.root_folder, self.data_folder, self.model_folder,
|
||||
self.results_folder
|
||||
]:
|
||||
if not os.path.exists(relevant_directory):
|
||||
os.makedirs(relevant_directory)
|
||||
|
||||
@property
|
||||
def data_csv_path(self):
|
||||
csv_map = {
|
||||
'volatility': 'formatted_omi_vol.csv',
|
||||
'electricity': 'hourly_electricity.csv',
|
||||
'traffic': 'hourly_data.csv',
|
||||
'favorita': 'favorita_consolidated.csv',
|
||||
'Alpha158': 'Alpha158.csv',
|
||||
}
|
||||
|
||||
return os.path.join(self.data_folder, csv_map[self.experiment])
|
||||
|
||||
@property
|
||||
def hyperparam_iterations(self):
|
||||
|
||||
return 240 if self.experiment == 'volatility' else 60
|
||||
|
||||
def make_data_formatter(self):
|
||||
"""Gets a data formatter object for experiment.
|
||||
|
||||
Returns:
|
||||
Default DataFormatter per experiment.
|
||||
"""
|
||||
|
||||
data_formatter_class = {
|
||||
'volatility': data_formatters.volatility.VolatilityFormatter,
|
||||
'electricity': data_formatters.electricity.ElectricityFormatter,
|
||||
'traffic': data_formatters.traffic.TrafficFormatter,
|
||||
'favorita': data_formatters.favorita.FavoritaFormatter,
|
||||
'Alpha158': data_formatters.qlib_Alpha158.Alpha158Formatter,
|
||||
}
|
||||
|
||||
return data_formatter_class[self.experiment]()
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Default configs for TFT experiments.
|
||||
|
||||
Contains the default output paths for data, serialised models and predictions
|
||||
for the main experiments used in the publication.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import data_formatters.electricity
|
||||
import data_formatters.favorita
|
||||
import data_formatters.traffic
|
||||
import data_formatters.volatility
|
||||
import data_formatters.qlib_Alpha158
|
||||
|
||||
|
||||
class ExperimentConfig(object):
|
||||
"""Defines experiment configs and paths to outputs.
|
||||
|
||||
Attributes:
|
||||
root_folder: Root folder to contain all experimental outputs.
|
||||
experiment: Name of experiment to run.
|
||||
data_folder: Folder to store data for experiment.
|
||||
model_folder: Folder to store serialised models.
|
||||
results_folder: Folder to store results.
|
||||
data_csv_path: Path to primary data csv file used in experiment.
|
||||
hyperparam_iterations: Default number of random search iterations for
|
||||
experiment.
|
||||
"""
|
||||
|
||||
default_experiments = ["volatility", "electricity", "traffic", "favorita", "Alpha158"]
|
||||
|
||||
def __init__(self, experiment="volatility", root_folder=None):
|
||||
"""Creates configs based on default experiment chosen.
|
||||
|
||||
Args:
|
||||
experiment: Name of experiment.
|
||||
root_folder: Root folder to save all outputs of training.
|
||||
"""
|
||||
|
||||
if experiment not in self.default_experiments:
|
||||
raise ValueError("Unrecognised experiment={}".format(experiment))
|
||||
|
||||
# Defines all relevant paths
|
||||
if root_folder is None:
|
||||
root_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "outputs")
|
||||
print("Using root folder {}".format(root_folder))
|
||||
|
||||
self.root_folder = root_folder
|
||||
self.experiment = experiment
|
||||
self.data_folder = os.path.join(root_folder, "data", experiment)
|
||||
self.model_folder = os.path.join(root_folder, "saved_models", experiment)
|
||||
self.results_folder = os.path.join(root_folder, "results", experiment)
|
||||
|
||||
# Creates folders if they don't exist
|
||||
for relevant_directory in [self.root_folder, self.data_folder, self.model_folder, self.results_folder]:
|
||||
if not os.path.exists(relevant_directory):
|
||||
os.makedirs(relevant_directory)
|
||||
|
||||
@property
|
||||
def data_csv_path(self):
|
||||
csv_map = {
|
||||
"volatility": "formatted_omi_vol.csv",
|
||||
"electricity": "hourly_electricity.csv",
|
||||
"traffic": "hourly_data.csv",
|
||||
"favorita": "favorita_consolidated.csv",
|
||||
"Alpha158": "Alpha158.csv",
|
||||
}
|
||||
|
||||
return os.path.join(self.data_folder, csv_map[self.experiment])
|
||||
|
||||
@property
|
||||
def hyperparam_iterations(self):
|
||||
|
||||
return 240 if self.experiment == "volatility" else 60
|
||||
|
||||
def make_data_formatter(self):
|
||||
"""Gets a data formatter object for experiment.
|
||||
|
||||
Returns:
|
||||
Default DataFormatter per experiment.
|
||||
"""
|
||||
|
||||
data_formatter_class = {
|
||||
"volatility": data_formatters.volatility.VolatilityFormatter,
|
||||
"electricity": data_formatters.electricity.ElectricityFormatter,
|
||||
"traffic": data_formatters.traffic.TrafficFormatter,
|
||||
"favorita": data_formatters.favorita.FavoritaFormatter,
|
||||
"Alpha158": data_formatters.qlib_Alpha158.Alpha158Formatter,
|
||||
}
|
||||
|
||||
return data_formatter_class[self.experiment]()
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
@@ -1,438 +1,430 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Classes used for hyperparameter optimisation.
|
||||
|
||||
Two main classes exist:
|
||||
1) HyperparamOptManager used for optimisation on a single machine/GPU.
|
||||
2) DistributedHyperparamOptManager for multiple GPUs on different machines.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import os
|
||||
import shutil
|
||||
import libs.utils as utils
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
Deque = collections.deque
|
||||
|
||||
|
||||
class HyperparamOptManager:
|
||||
"""Manages hyperparameter optimisation using random search for a single GPU.
|
||||
|
||||
Attributes:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
results: Dataframe of validation results.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
saved_params: Dataframe of parameters trained.
|
||||
best_score: Minimum validation loss observed thus far.
|
||||
optimal_name: Key to best configuration.
|
||||
hyperparam_folder: Where to save optimisation outputs.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
param_ranges,
|
||||
fixed_params,
|
||||
model_folder,
|
||||
override_w_fixed_params=True):
|
||||
"""Instantiates model.
|
||||
|
||||
Args:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
model_folder: Folder to store optimisation artifacts.
|
||||
override_w_fixed_params: Whether to override serialsed fixed model
|
||||
parameters with new supplied values.
|
||||
"""
|
||||
|
||||
self.param_ranges = param_ranges
|
||||
|
||||
self._max_tries = 1000
|
||||
self.results = pd.DataFrame()
|
||||
self.fixed_params = fixed_params
|
||||
self.saved_params = pd.DataFrame()
|
||||
|
||||
self.best_score = np.Inf
|
||||
self.optimal_name = ""
|
||||
|
||||
# Setup
|
||||
# Create folder for saving if its not there
|
||||
self.hyperparam_folder = model_folder
|
||||
utils.create_folder_if_not_exist(self.hyperparam_folder)
|
||||
|
||||
self._override_w_fixed_params = override_w_fixed_params
|
||||
|
||||
def load_results(self):
|
||||
"""Loads results from previous hyperparameter optimisation.
|
||||
|
||||
Returns:
|
||||
A boolean indicating if previous results can be loaded.
|
||||
"""
|
||||
print("Loading results from", self.hyperparam_folder)
|
||||
|
||||
results_file = os.path.join(self.hyperparam_folder, "results.csv")
|
||||
params_file = os.path.join(self.hyperparam_folder, "params.csv")
|
||||
|
||||
if os.path.exists(results_file) and os.path.exists(params_file):
|
||||
|
||||
self.results = pd.read_csv(results_file, index_col=0)
|
||||
self.saved_params = pd.read_csv(params_file, index_col=0)
|
||||
|
||||
if not self.results.empty:
|
||||
self.results.at["loss"] = self.results.loc["loss"].apply(float)
|
||||
self.best_score = self.results.loc["loss"].min()
|
||||
|
||||
is_optimal = self.results.loc["loss"] == self.best_score
|
||||
self.optimal_name = self.results.T[is_optimal].index[0]
|
||||
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _get_params_from_name(self, name):
|
||||
"""Returns previously saved parameters given a key."""
|
||||
params = self.saved_params
|
||||
|
||||
selected_params = dict(params[name])
|
||||
|
||||
if self._override_w_fixed_params:
|
||||
for k in self.fixed_params:
|
||||
selected_params[k] = self.fixed_params[k]
|
||||
|
||||
return selected_params
|
||||
|
||||
def get_best_params(self):
|
||||
"""Returns the optimal hyperparameters thus far."""
|
||||
|
||||
optimal_name = self.optimal_name
|
||||
|
||||
return self._get_params_from_name(optimal_name)
|
||||
|
||||
def clear(self):
|
||||
"""Clears all previous results and saved parameters."""
|
||||
shutil.rmtree(self.hyperparam_folder)
|
||||
os.makedirs(self.hyperparam_folder)
|
||||
self.results = pd.DataFrame()
|
||||
self.saved_params = pd.DataFrame()
|
||||
|
||||
def _check_params(self, params):
|
||||
"""Checks that parameter map is properly defined."""
|
||||
|
||||
valid_fields = list(self.param_ranges.keys()) + list(
|
||||
self.fixed_params.keys())
|
||||
invalid_fields = [k for k in params if k not in valid_fields]
|
||||
missing_fields = [k for k in valid_fields if k not in params]
|
||||
|
||||
if invalid_fields:
|
||||
raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(
|
||||
invalid_fields, valid_fields))
|
||||
if missing_fields:
|
||||
raise ValueError("Missing Fields Found {} - Valid ones are {}".format(
|
||||
missing_fields, valid_fields))
|
||||
|
||||
def _get_name(self, params):
|
||||
"""Returns a unique key for the supplied set of params."""
|
||||
|
||||
self._check_params(params)
|
||||
|
||||
fields = list(params.keys())
|
||||
fields.sort()
|
||||
|
||||
return "_".join([str(params[k]) for k in fields])
|
||||
|
||||
def get_next_parameters(self, ranges_to_skip=None):
|
||||
"""Returns the next set of parameters to optimise.
|
||||
|
||||
Args:
|
||||
ranges_to_skip: Explicitly defines a set of keys to skip.
|
||||
"""
|
||||
if ranges_to_skip is None:
|
||||
ranges_to_skip = set(self.results.index)
|
||||
|
||||
if not isinstance(self.param_ranges, dict):
|
||||
raise ValueError("Only works for random search!")
|
||||
|
||||
param_range_keys = list(self.param_ranges.keys())
|
||||
param_range_keys.sort()
|
||||
|
||||
def _get_next():
|
||||
"""Returns next hyperparameter set per try."""
|
||||
|
||||
parameters = {
|
||||
k: np.random.choice(self.param_ranges[k]) for k in param_range_keys
|
||||
}
|
||||
|
||||
# Adds fixed params
|
||||
for k in self.fixed_params:
|
||||
parameters[k] = self.fixed_params[k]
|
||||
|
||||
return parameters
|
||||
|
||||
for _ in range(self._max_tries):
|
||||
|
||||
parameters = _get_next()
|
||||
name = self._get_name(parameters)
|
||||
|
||||
if name not in ranges_to_skip:
|
||||
return parameters
|
||||
|
||||
raise ValueError("Exceeded max number of hyperparameter searches!!")
|
||||
|
||||
def update_score(self, parameters, loss, model, info=""):
|
||||
"""Updates the results from last optimisation run.
|
||||
|
||||
Args:
|
||||
parameters: Hyperparameters used in optimisation.
|
||||
loss: Validation loss obtained.
|
||||
model: Model to serialised if required.
|
||||
info: Any ancillary information to tag on to results.
|
||||
|
||||
Returns:
|
||||
Boolean flag indicating if the model is the best seen so far.
|
||||
"""
|
||||
|
||||
if np.isnan(loss):
|
||||
loss = np.Inf
|
||||
|
||||
if not os.path.isdir(self.hyperparam_folder):
|
||||
os.makedirs(self.hyperparam_folder)
|
||||
|
||||
name = self._get_name(parameters)
|
||||
|
||||
is_optimal = self.results.empty or loss < self.best_score
|
||||
|
||||
# save the first model
|
||||
if is_optimal:
|
||||
# Try saving first, before updating info
|
||||
if model is not None:
|
||||
print("Optimal model found, updating")
|
||||
model.save(self.hyperparam_folder)
|
||||
self.best_score = loss
|
||||
self.optimal_name = name
|
||||
|
||||
self.results[name] = pd.Series({"loss": loss, "info": info})
|
||||
self.saved_params[name] = pd.Series(parameters)
|
||||
|
||||
self.results.to_csv(os.path.join(self.hyperparam_folder, "results.csv"))
|
||||
self.saved_params.to_csv(os.path.join(self.hyperparam_folder, "params.csv"))
|
||||
|
||||
return is_optimal
|
||||
|
||||
|
||||
class DistributedHyperparamOptManager(HyperparamOptManager):
|
||||
"""Manages distributed hyperparameter optimisation across many gpus."""
|
||||
|
||||
def __init__(self,
|
||||
param_ranges,
|
||||
fixed_params,
|
||||
root_model_folder,
|
||||
worker_number,
|
||||
search_iterations=1000,
|
||||
num_iterations_per_worker=5,
|
||||
clear_serialised_params=False):
|
||||
"""Instantiates optimisation manager.
|
||||
|
||||
This hyperparameter optimisation pre-generates #search_iterations
|
||||
hyperparameter combinations and serialises them
|
||||
at the start. At runtime, each worker goes through their own set of
|
||||
parameter ranges. The pregeneration
|
||||
allows for multiple workers to run in parallel on different machines without
|
||||
resulting in parameter overlaps.
|
||||
|
||||
Args:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
root_model_folder: Folder to store optimisation artifacts.
|
||||
worker_number: Worker index definining which set of hyperparameters to
|
||||
test.
|
||||
search_iterations: Maximum numer of random search iterations.
|
||||
num_iterations_per_worker: How many iterations are handled per worker.
|
||||
clear_serialised_params: Whether to regenerate hyperparameter
|
||||
combinations.
|
||||
"""
|
||||
|
||||
max_workers = int(np.ceil(search_iterations / num_iterations_per_worker))
|
||||
|
||||
# Sanity checks
|
||||
if worker_number > max_workers:
|
||||
raise ValueError(
|
||||
"Worker number ({}) cannot be larger than the total number of workers!"
|
||||
.format(max_workers))
|
||||
if worker_number > search_iterations:
|
||||
raise ValueError(
|
||||
"Worker number ({}) cannot be larger than the max search iterations ({})!"
|
||||
.format(worker_number, search_iterations))
|
||||
|
||||
print("*** Creating hyperparameter manager for worker {} ***".format(
|
||||
worker_number))
|
||||
|
||||
hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
|
||||
super().__init__(
|
||||
param_ranges,
|
||||
fixed_params,
|
||||
hyperparam_folder,
|
||||
override_w_fixed_params=True)
|
||||
|
||||
serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
|
||||
if clear_serialised_params:
|
||||
print("Regenerating hyperparameter list")
|
||||
if os.path.exists(serialised_ranges_folder):
|
||||
shutil.rmtree(serialised_ranges_folder)
|
||||
|
||||
utils.create_folder_if_not_exist(serialised_ranges_folder)
|
||||
|
||||
self.serialised_ranges_path = os.path.join(
|
||||
serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
|
||||
self.hyperparam_folder = hyperparam_folder # override
|
||||
self.worker_num = worker_number
|
||||
self.total_search_iterations = search_iterations
|
||||
self.num_iterations_per_worker = num_iterations_per_worker
|
||||
self.global_hyperparam_df = self.load_serialised_hyperparam_df()
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
@property
|
||||
def optimisation_completed(self):
|
||||
return False if self.worker_search_queue else True
|
||||
|
||||
def get_next_parameters(self):
|
||||
"""Returns next dictionary of hyperparameters to optimise."""
|
||||
param_name = self.worker_search_queue.pop()
|
||||
|
||||
params = self.global_hyperparam_df.loc[param_name, :].to_dict()
|
||||
|
||||
# Always override!
|
||||
for k in self.fixed_params:
|
||||
print("Overriding saved {}: {}".format(k, self.fixed_params[k]))
|
||||
|
||||
params[k] = self.fixed_params[k]
|
||||
|
||||
return params
|
||||
|
||||
def load_serialised_hyperparam_df(self):
|
||||
"""Loads serialsed hyperparameter ranges from file.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
print("Loading params for {} search iterations form {}".format(
|
||||
self.total_search_iterations, self.serialised_ranges_path))
|
||||
|
||||
if os.path.exists(self.serialised_ranges_folder):
|
||||
df = pd.read_csv(self.serialised_ranges_path, index_col=0)
|
||||
else:
|
||||
print("Unable to load - regenerating serach ranges instead")
|
||||
df = self.update_serialised_hyperparam_df()
|
||||
|
||||
return df
|
||||
|
||||
def update_serialised_hyperparam_df(self):
|
||||
"""Regenerates hyperparameter combinations and saves to file.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
search_df = self._generate_full_hyperparam_df()
|
||||
|
||||
print("Serialising params for {} search iterations to {}".format(
|
||||
self.total_search_iterations, self.serialised_ranges_path))
|
||||
|
||||
search_df.to_csv(self.serialised_ranges_path)
|
||||
|
||||
return search_df
|
||||
|
||||
def _generate_full_hyperparam_df(self):
|
||||
"""Generates actual hyperparameter combinations.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
|
||||
np.random.seed(131) # for reproducibility of hyperparam list
|
||||
|
||||
name_list = []
|
||||
param_list = []
|
||||
for _ in range(self.total_search_iterations):
|
||||
params = super().get_next_parameters(name_list)
|
||||
|
||||
name = self._get_name(params)
|
||||
|
||||
name_list.append(name)
|
||||
param_list.append(params)
|
||||
|
||||
full_search_df = pd.DataFrame(param_list, index=name_list)
|
||||
|
||||
return full_search_df
|
||||
|
||||
def clear(self): # reset when cleared
|
||||
"""Clears results for hyperparameter manager and resets."""
|
||||
super().clear()
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
def load_results(self):
|
||||
"""Load results from file and queue parameter combinations to try.
|
||||
|
||||
Returns:
|
||||
Boolean indicating if results were successfully loaded.
|
||||
"""
|
||||
success = super().load_results()
|
||||
|
||||
if success:
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
return success
|
||||
|
||||
def _get_worker_search_queue(self):
|
||||
"""Generates the queue of param combinations for current worker.
|
||||
|
||||
Returns:
|
||||
Queue of hyperparameter combinations outstanding.
|
||||
"""
|
||||
global_df = self.assign_worker_numbers(self.global_hyperparam_df)
|
||||
worker_df = global_df[global_df["worker"] == self.worker_num]
|
||||
|
||||
left_overs = [s for s in worker_df.index if s not in self.results.columns]
|
||||
|
||||
return Deque(left_overs)
|
||||
|
||||
def assign_worker_numbers(self, df):
|
||||
"""Updates parameter combinations with the index of the worker used.
|
||||
|
||||
Args:
|
||||
df: DataFrame of parameter combinations.
|
||||
|
||||
Returns:
|
||||
Updated DataFrame with worker number.
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
n = self.total_search_iterations
|
||||
batch_size = self.num_iterations_per_worker
|
||||
|
||||
max_worker_num = int(np.ceil(n / batch_size))
|
||||
|
||||
worker_idx = np.concatenate([
|
||||
np.tile(i + 1, self.num_iterations_per_worker)
|
||||
for i in range(max_worker_num)
|
||||
])
|
||||
|
||||
output["worker"] = worker_idx[:len(output)]
|
||||
|
||||
return output
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Classes used for hyperparameter optimisation.
|
||||
|
||||
Two main classes exist:
|
||||
1) HyperparamOptManager used for optimisation on a single machine/GPU.
|
||||
2) DistributedHyperparamOptManager for multiple GPUs on different machines.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import os
|
||||
import shutil
|
||||
import libs.utils as utils
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
Deque = collections.deque
|
||||
|
||||
|
||||
class HyperparamOptManager:
|
||||
"""Manages hyperparameter optimisation using random search for a single GPU.
|
||||
|
||||
Attributes:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
results: Dataframe of validation results.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
saved_params: Dataframe of parameters trained.
|
||||
best_score: Minimum validation loss observed thus far.
|
||||
optimal_name: Key to best configuration.
|
||||
hyperparam_folder: Where to save optimisation outputs.
|
||||
"""
|
||||
|
||||
def __init__(self, param_ranges, fixed_params, model_folder, override_w_fixed_params=True):
|
||||
"""Instantiates model.
|
||||
|
||||
Args:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
model_folder: Folder to store optimisation artifacts.
|
||||
override_w_fixed_params: Whether to override serialsed fixed model
|
||||
parameters with new supplied values.
|
||||
"""
|
||||
|
||||
self.param_ranges = param_ranges
|
||||
|
||||
self._max_tries = 1000
|
||||
self.results = pd.DataFrame()
|
||||
self.fixed_params = fixed_params
|
||||
self.saved_params = pd.DataFrame()
|
||||
|
||||
self.best_score = np.Inf
|
||||
self.optimal_name = ""
|
||||
|
||||
# Setup
|
||||
# Create folder for saving if its not there
|
||||
self.hyperparam_folder = model_folder
|
||||
utils.create_folder_if_not_exist(self.hyperparam_folder)
|
||||
|
||||
self._override_w_fixed_params = override_w_fixed_params
|
||||
|
||||
def load_results(self):
|
||||
"""Loads results from previous hyperparameter optimisation.
|
||||
|
||||
Returns:
|
||||
A boolean indicating if previous results can be loaded.
|
||||
"""
|
||||
print("Loading results from", self.hyperparam_folder)
|
||||
|
||||
results_file = os.path.join(self.hyperparam_folder, "results.csv")
|
||||
params_file = os.path.join(self.hyperparam_folder, "params.csv")
|
||||
|
||||
if os.path.exists(results_file) and os.path.exists(params_file):
|
||||
|
||||
self.results = pd.read_csv(results_file, index_col=0)
|
||||
self.saved_params = pd.read_csv(params_file, index_col=0)
|
||||
|
||||
if not self.results.empty:
|
||||
self.results.at["loss"] = self.results.loc["loss"].apply(float)
|
||||
self.best_score = self.results.loc["loss"].min()
|
||||
|
||||
is_optimal = self.results.loc["loss"] == self.best_score
|
||||
self.optimal_name = self.results.T[is_optimal].index[0]
|
||||
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _get_params_from_name(self, name):
|
||||
"""Returns previously saved parameters given a key."""
|
||||
params = self.saved_params
|
||||
|
||||
selected_params = dict(params[name])
|
||||
|
||||
if self._override_w_fixed_params:
|
||||
for k in self.fixed_params:
|
||||
selected_params[k] = self.fixed_params[k]
|
||||
|
||||
return selected_params
|
||||
|
||||
def get_best_params(self):
|
||||
"""Returns the optimal hyperparameters thus far."""
|
||||
|
||||
optimal_name = self.optimal_name
|
||||
|
||||
return self._get_params_from_name(optimal_name)
|
||||
|
||||
def clear(self):
|
||||
"""Clears all previous results and saved parameters."""
|
||||
shutil.rmtree(self.hyperparam_folder)
|
||||
os.makedirs(self.hyperparam_folder)
|
||||
self.results = pd.DataFrame()
|
||||
self.saved_params = pd.DataFrame()
|
||||
|
||||
def _check_params(self, params):
|
||||
"""Checks that parameter map is properly defined."""
|
||||
|
||||
valid_fields = list(self.param_ranges.keys()) + list(self.fixed_params.keys())
|
||||
invalid_fields = [k for k in params if k not in valid_fields]
|
||||
missing_fields = [k for k in valid_fields if k not in params]
|
||||
|
||||
if invalid_fields:
|
||||
raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(invalid_fields, valid_fields))
|
||||
if missing_fields:
|
||||
raise ValueError("Missing Fields Found {} - Valid ones are {}".format(missing_fields, valid_fields))
|
||||
|
||||
def _get_name(self, params):
|
||||
"""Returns a unique key for the supplied set of params."""
|
||||
|
||||
self._check_params(params)
|
||||
|
||||
fields = list(params.keys())
|
||||
fields.sort()
|
||||
|
||||
return "_".join([str(params[k]) for k in fields])
|
||||
|
||||
def get_next_parameters(self, ranges_to_skip=None):
|
||||
"""Returns the next set of parameters to optimise.
|
||||
|
||||
Args:
|
||||
ranges_to_skip: Explicitly defines a set of keys to skip.
|
||||
"""
|
||||
if ranges_to_skip is None:
|
||||
ranges_to_skip = set(self.results.index)
|
||||
|
||||
if not isinstance(self.param_ranges, dict):
|
||||
raise ValueError("Only works for random search!")
|
||||
|
||||
param_range_keys = list(self.param_ranges.keys())
|
||||
param_range_keys.sort()
|
||||
|
||||
def _get_next():
|
||||
"""Returns next hyperparameter set per try."""
|
||||
|
||||
parameters = {k: np.random.choice(self.param_ranges[k]) for k in param_range_keys}
|
||||
|
||||
# Adds fixed params
|
||||
for k in self.fixed_params:
|
||||
parameters[k] = self.fixed_params[k]
|
||||
|
||||
return parameters
|
||||
|
||||
for _ in range(self._max_tries):
|
||||
|
||||
parameters = _get_next()
|
||||
name = self._get_name(parameters)
|
||||
|
||||
if name not in ranges_to_skip:
|
||||
return parameters
|
||||
|
||||
raise ValueError("Exceeded max number of hyperparameter searches!!")
|
||||
|
||||
def update_score(self, parameters, loss, model, info=""):
|
||||
"""Updates the results from last optimisation run.
|
||||
|
||||
Args:
|
||||
parameters: Hyperparameters used in optimisation.
|
||||
loss: Validation loss obtained.
|
||||
model: Model to serialised if required.
|
||||
info: Any ancillary information to tag on to results.
|
||||
|
||||
Returns:
|
||||
Boolean flag indicating if the model is the best seen so far.
|
||||
"""
|
||||
|
||||
if np.isnan(loss):
|
||||
loss = np.Inf
|
||||
|
||||
if not os.path.isdir(self.hyperparam_folder):
|
||||
os.makedirs(self.hyperparam_folder)
|
||||
|
||||
name = self._get_name(parameters)
|
||||
|
||||
is_optimal = self.results.empty or loss < self.best_score
|
||||
|
||||
# save the first model
|
||||
if is_optimal:
|
||||
# Try saving first, before updating info
|
||||
if model is not None:
|
||||
print("Optimal model found, updating")
|
||||
model.save(self.hyperparam_folder)
|
||||
self.best_score = loss
|
||||
self.optimal_name = name
|
||||
|
||||
self.results[name] = pd.Series({"loss": loss, "info": info})
|
||||
self.saved_params[name] = pd.Series(parameters)
|
||||
|
||||
self.results.to_csv(os.path.join(self.hyperparam_folder, "results.csv"))
|
||||
self.saved_params.to_csv(os.path.join(self.hyperparam_folder, "params.csv"))
|
||||
|
||||
return is_optimal
|
||||
|
||||
|
||||
class DistributedHyperparamOptManager(HyperparamOptManager):
|
||||
"""Manages distributed hyperparameter optimisation across many gpus."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
param_ranges,
|
||||
fixed_params,
|
||||
root_model_folder,
|
||||
worker_number,
|
||||
search_iterations=1000,
|
||||
num_iterations_per_worker=5,
|
||||
clear_serialised_params=False,
|
||||
):
|
||||
"""Instantiates optimisation manager.
|
||||
|
||||
This hyperparameter optimisation pre-generates #search_iterations
|
||||
hyperparameter combinations and serialises them
|
||||
at the start. At runtime, each worker goes through their own set of
|
||||
parameter ranges. The pregeneration
|
||||
allows for multiple workers to run in parallel on different machines without
|
||||
resulting in parameter overlaps.
|
||||
|
||||
Args:
|
||||
param_ranges: Discrete hyperparameter range for random search.
|
||||
fixed_params: Fixed model parameters per experiment.
|
||||
root_model_folder: Folder to store optimisation artifacts.
|
||||
worker_number: Worker index definining which set of hyperparameters to
|
||||
test.
|
||||
search_iterations: Maximum numer of random search iterations.
|
||||
num_iterations_per_worker: How many iterations are handled per worker.
|
||||
clear_serialised_params: Whether to regenerate hyperparameter
|
||||
combinations.
|
||||
"""
|
||||
|
||||
max_workers = int(np.ceil(search_iterations / num_iterations_per_worker))
|
||||
|
||||
# Sanity checks
|
||||
if worker_number > max_workers:
|
||||
raise ValueError(
|
||||
"Worker number ({}) cannot be larger than the total number of workers!".format(max_workers)
|
||||
)
|
||||
if worker_number > search_iterations:
|
||||
raise ValueError(
|
||||
"Worker number ({}) cannot be larger than the max search iterations ({})!".format(
|
||||
worker_number, search_iterations
|
||||
)
|
||||
)
|
||||
|
||||
print("*** Creating hyperparameter manager for worker {} ***".format(worker_number))
|
||||
|
||||
hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
|
||||
super().__init__(param_ranges, fixed_params, hyperparam_folder, override_w_fixed_params=True)
|
||||
|
||||
serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
|
||||
if clear_serialised_params:
|
||||
print("Regenerating hyperparameter list")
|
||||
if os.path.exists(serialised_ranges_folder):
|
||||
shutil.rmtree(serialised_ranges_folder)
|
||||
|
||||
utils.create_folder_if_not_exist(serialised_ranges_folder)
|
||||
|
||||
self.serialised_ranges_path = os.path.join(serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
|
||||
self.hyperparam_folder = hyperparam_folder # override
|
||||
self.worker_num = worker_number
|
||||
self.total_search_iterations = search_iterations
|
||||
self.num_iterations_per_worker = num_iterations_per_worker
|
||||
self.global_hyperparam_df = self.load_serialised_hyperparam_df()
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
@property
|
||||
def optimisation_completed(self):
|
||||
return False if self.worker_search_queue else True
|
||||
|
||||
def get_next_parameters(self):
|
||||
"""Returns next dictionary of hyperparameters to optimise."""
|
||||
param_name = self.worker_search_queue.pop()
|
||||
|
||||
params = self.global_hyperparam_df.loc[param_name, :].to_dict()
|
||||
|
||||
# Always override!
|
||||
for k in self.fixed_params:
|
||||
print("Overriding saved {}: {}".format(k, self.fixed_params[k]))
|
||||
|
||||
params[k] = self.fixed_params[k]
|
||||
|
||||
return params
|
||||
|
||||
def load_serialised_hyperparam_df(self):
|
||||
"""Loads serialsed hyperparameter ranges from file.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
print(
|
||||
"Loading params for {} search iterations form {}".format(
|
||||
self.total_search_iterations, self.serialised_ranges_path
|
||||
)
|
||||
)
|
||||
|
||||
if os.path.exists(self.serialised_ranges_folder):
|
||||
df = pd.read_csv(self.serialised_ranges_path, index_col=0)
|
||||
else:
|
||||
print("Unable to load - regenerating serach ranges instead")
|
||||
df = self.update_serialised_hyperparam_df()
|
||||
|
||||
return df
|
||||
|
||||
def update_serialised_hyperparam_df(self):
|
||||
"""Regenerates hyperparameter combinations and saves to file.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
search_df = self._generate_full_hyperparam_df()
|
||||
|
||||
print(
|
||||
"Serialising params for {} search iterations to {}".format(
|
||||
self.total_search_iterations, self.serialised_ranges_path
|
||||
)
|
||||
)
|
||||
|
||||
search_df.to_csv(self.serialised_ranges_path)
|
||||
|
||||
return search_df
|
||||
|
||||
def _generate_full_hyperparam_df(self):
|
||||
"""Generates actual hyperparameter combinations.
|
||||
|
||||
Returns:
|
||||
DataFrame containing hyperparameter combinations.
|
||||
"""
|
||||
|
||||
np.random.seed(131) # for reproducibility of hyperparam list
|
||||
|
||||
name_list = []
|
||||
param_list = []
|
||||
for _ in range(self.total_search_iterations):
|
||||
params = super().get_next_parameters(name_list)
|
||||
|
||||
name = self._get_name(params)
|
||||
|
||||
name_list.append(name)
|
||||
param_list.append(params)
|
||||
|
||||
full_search_df = pd.DataFrame(param_list, index=name_list)
|
||||
|
||||
return full_search_df
|
||||
|
||||
def clear(self): # reset when cleared
|
||||
"""Clears results for hyperparameter manager and resets."""
|
||||
super().clear()
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
def load_results(self):
|
||||
"""Load results from file and queue parameter combinations to try.
|
||||
|
||||
Returns:
|
||||
Boolean indicating if results were successfully loaded.
|
||||
"""
|
||||
success = super().load_results()
|
||||
|
||||
if success:
|
||||
self.worker_search_queue = self._get_worker_search_queue()
|
||||
|
||||
return success
|
||||
|
||||
def _get_worker_search_queue(self):
|
||||
"""Generates the queue of param combinations for current worker.
|
||||
|
||||
Returns:
|
||||
Queue of hyperparameter combinations outstanding.
|
||||
"""
|
||||
global_df = self.assign_worker_numbers(self.global_hyperparam_df)
|
||||
worker_df = global_df[global_df["worker"] == self.worker_num]
|
||||
|
||||
left_overs = [s for s in worker_df.index if s not in self.results.columns]
|
||||
|
||||
return Deque(left_overs)
|
||||
|
||||
def assign_worker_numbers(self, df):
|
||||
"""Updates parameter combinations with the index of the worker used.
|
||||
|
||||
Args:
|
||||
df: DataFrame of parameter combinations.
|
||||
|
||||
Returns:
|
||||
Updated DataFrame with worker number.
|
||||
"""
|
||||
output = df.copy()
|
||||
|
||||
n = self.total_search_iterations
|
||||
batch_size = self.num_iterations_per_worker
|
||||
|
||||
max_worker_num = int(np.ceil(n / batch_size))
|
||||
|
||||
worker_idx = np.concatenate([np.tile(i + 1, self.num_iterations_per_worker) for i in range(max_worker_num)])
|
||||
|
||||
output["worker"] = worker_idx[: len(output)]
|
||||
|
||||
return output
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,236 +1,224 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Generic helper functions used across codebase."""
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
|
||||
|
||||
|
||||
# Generic.
|
||||
def get_single_col_by_input_type(input_type, column_definition):
|
||||
"""Returns name of single column.
|
||||
|
||||
Args:
|
||||
input_type: Input type of column to extract
|
||||
column_definition: Column definition list for experiment
|
||||
"""
|
||||
|
||||
l = [tup[0] for tup in column_definition if tup[2] == input_type]
|
||||
|
||||
if len(l) != 1:
|
||||
raise ValueError('Invalid number of columns for {}'.format(input_type))
|
||||
|
||||
return l[0]
|
||||
|
||||
|
||||
def extract_cols_from_data_type(data_type, column_definition,
|
||||
excluded_input_types):
|
||||
"""Extracts the names of columns that correspond to a define data_type.
|
||||
|
||||
Args:
|
||||
data_type: DataType of columns to extract.
|
||||
column_definition: Column definition to use.
|
||||
excluded_input_types: Set of input types to exclude
|
||||
|
||||
Returns:
|
||||
List of names for columns with data type specified.
|
||||
"""
|
||||
return [
|
||||
tup[0]
|
||||
for tup in column_definition
|
||||
if tup[1] == data_type and tup[2] not in excluded_input_types
|
||||
]
|
||||
|
||||
|
||||
# Loss functions.
|
||||
def tensorflow_quantile_loss(y, y_pred, quantile):
|
||||
"""Computes quantile loss for tensorflow.
|
||||
|
||||
Standard quantile loss as defined in the "Training Procedure" section of
|
||||
the main TFT paper
|
||||
|
||||
Args:
|
||||
y: Targets
|
||||
y_pred: Predictions
|
||||
quantile: Quantile to use for loss calculations (between 0 & 1)
|
||||
|
||||
Returns:
|
||||
Tensor for quantile loss.
|
||||
"""
|
||||
|
||||
# Checks quantile
|
||||
if quantile < 0 or quantile > 1:
|
||||
raise ValueError(
|
||||
'Illegal quantile value={}! Values should be between 0 and 1.'.format(
|
||||
quantile))
|
||||
|
||||
prediction_underflow = y - y_pred
|
||||
q_loss = quantile * tf.maximum(prediction_underflow, 0.) + (
|
||||
1. - quantile) * tf.maximum(-prediction_underflow, 0.)
|
||||
|
||||
return tf.reduce_sum(q_loss, axis=-1)
|
||||
|
||||
|
||||
def numpy_normalised_quantile_loss(y, y_pred, quantile):
|
||||
"""Computes normalised quantile loss for numpy arrays.
|
||||
|
||||
Uses the q-Risk metric as defined in the "Training Procedure" section of the
|
||||
main TFT paper.
|
||||
|
||||
Args:
|
||||
y: Targets
|
||||
y_pred: Predictions
|
||||
quantile: Quantile to use for loss calculations (between 0 & 1)
|
||||
|
||||
Returns:
|
||||
Float for normalised quantile loss.
|
||||
"""
|
||||
prediction_underflow = y - y_pred
|
||||
weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \
|
||||
+ (1. - quantile) * np.maximum(-prediction_underflow, 0.)
|
||||
|
||||
quantile_loss = weighted_errors.mean()
|
||||
normaliser = y.abs().mean()
|
||||
|
||||
return 2 * quantile_loss / normaliser
|
||||
|
||||
|
||||
# OS related functions.
|
||||
def create_folder_if_not_exist(directory):
|
||||
"""Creates folder if it doesn't exist.
|
||||
|
||||
Args:
|
||||
directory: Folder path to create.
|
||||
"""
|
||||
# Also creates directories recursively
|
||||
pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# Tensorflow related functions.
|
||||
def get_default_tensorflow_config(tf_device='gpu', gpu_id=0):
|
||||
"""Creates tensorflow config for graphs to run on CPU or GPU.
|
||||
|
||||
Specifies whether to run graph on gpu or cpu and which GPU ID to use for multi
|
||||
GPU machines.
|
||||
|
||||
Args:
|
||||
tf_device: 'cpu' or 'gpu'
|
||||
gpu_id: GPU ID to use if relevant
|
||||
|
||||
Returns:
|
||||
Tensorflow config.
|
||||
"""
|
||||
|
||||
if tf_device == 'cpu':
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # for training on cpu
|
||||
tf_config = tf.ConfigProto(
|
||||
log_device_placement=False, device_count={'GPU': 0})
|
||||
|
||||
else:
|
||||
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
|
||||
|
||||
print('Selecting GPU ID={}'.format(gpu_id))
|
||||
|
||||
tf_config = tf.ConfigProto(log_device_placement=False)
|
||||
tf_config.gpu_options.allow_growth = True
|
||||
|
||||
return tf_config
|
||||
|
||||
|
||||
def save(tf_session, model_folder, cp_name, scope=None):
|
||||
"""Saves Tensorflow graph to checkpoint.
|
||||
|
||||
Saves all trainiable variables under a given variable scope to checkpoint.
|
||||
|
||||
Args:
|
||||
tf_session: Session containing graph
|
||||
model_folder: Folder to save models
|
||||
cp_name: Name of Tensorflow checkpoint
|
||||
scope: Variable scope containing variables to save
|
||||
"""
|
||||
# Save model
|
||||
if scope is None:
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
|
||||
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
|
||||
|
||||
save_path = saver.save(tf_session,
|
||||
os.path.join(model_folder, '{0}.ckpt'.format(cp_name)))
|
||||
print('Model saved to: {0}'.format(save_path))
|
||||
|
||||
|
||||
def load(tf_session, model_folder, cp_name, scope=None, verbose=False):
|
||||
"""Loads Tensorflow graph from checkpoint.
|
||||
|
||||
Args:
|
||||
tf_session: Session to load graph into
|
||||
model_folder: Folder containing serialised model
|
||||
cp_name: Name of Tensorflow checkpoint
|
||||
scope: Variable scope to use.
|
||||
verbose: Whether to print additional debugging information.
|
||||
"""
|
||||
# Load model proper
|
||||
load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
|
||||
|
||||
print('Loading model from {0}'.format(load_path))
|
||||
|
||||
print_weights_in_checkpoint(model_folder, cp_name)
|
||||
|
||||
initial_vars = set(
|
||||
[v.name for v in tf.get_default_graph().as_graph_def().node])
|
||||
|
||||
# Saver
|
||||
if scope is None:
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
|
||||
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
|
||||
# Load
|
||||
saver.restore(tf_session, load_path)
|
||||
all_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
|
||||
|
||||
if verbose:
|
||||
print('Restored {0}'.format(','.join(initial_vars.difference(all_vars))))
|
||||
print('Existing {0}'.format(','.join(all_vars.difference(initial_vars))))
|
||||
print('All {0}'.format(','.join(all_vars)))
|
||||
|
||||
print('Done.')
|
||||
|
||||
|
||||
def print_weights_in_checkpoint(model_folder, cp_name):
|
||||
"""Prints all weights in Tensorflow checkpoint.
|
||||
|
||||
Args:
|
||||
model_folder: Folder containing checkpoint
|
||||
cp_name: Name of checkpoint
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
|
||||
|
||||
print_tensors_in_checkpoint_file(
|
||||
file_name=load_path,
|
||||
tensor_name='',
|
||||
all_tensors=True,
|
||||
all_tensor_names=True)
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Google Research Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""Generic helper functions used across codebase."""
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
|
||||
|
||||
|
||||
# Generic.
|
||||
def get_single_col_by_input_type(input_type, column_definition):
|
||||
"""Returns name of single column.
|
||||
|
||||
Args:
|
||||
input_type: Input type of column to extract
|
||||
column_definition: Column definition list for experiment
|
||||
"""
|
||||
|
||||
l = [tup[0] for tup in column_definition if tup[2] == input_type]
|
||||
|
||||
if len(l) != 1:
|
||||
raise ValueError("Invalid number of columns for {}".format(input_type))
|
||||
|
||||
return l[0]
|
||||
|
||||
|
||||
def extract_cols_from_data_type(data_type, column_definition, excluded_input_types):
|
||||
"""Extracts the names of columns that correspond to a define data_type.
|
||||
|
||||
Args:
|
||||
data_type: DataType of columns to extract.
|
||||
column_definition: Column definition to use.
|
||||
excluded_input_types: Set of input types to exclude
|
||||
|
||||
Returns:
|
||||
List of names for columns with data type specified.
|
||||
"""
|
||||
return [tup[0] for tup in column_definition if tup[1] == data_type and tup[2] not in excluded_input_types]
|
||||
|
||||
|
||||
# Loss functions.
|
||||
def tensorflow_quantile_loss(y, y_pred, quantile):
|
||||
"""Computes quantile loss for tensorflow.
|
||||
|
||||
Standard quantile loss as defined in the "Training Procedure" section of
|
||||
the main TFT paper
|
||||
|
||||
Args:
|
||||
y: Targets
|
||||
y_pred: Predictions
|
||||
quantile: Quantile to use for loss calculations (between 0 & 1)
|
||||
|
||||
Returns:
|
||||
Tensor for quantile loss.
|
||||
"""
|
||||
|
||||
# Checks quantile
|
||||
if quantile < 0 or quantile > 1:
|
||||
raise ValueError("Illegal quantile value={}! Values should be between 0 and 1.".format(quantile))
|
||||
|
||||
prediction_underflow = y - y_pred
|
||||
q_loss = quantile * tf.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * tf.maximum(
|
||||
-prediction_underflow, 0.0
|
||||
)
|
||||
|
||||
return tf.reduce_sum(q_loss, axis=-1)
|
||||
|
||||
|
||||
def numpy_normalised_quantile_loss(y, y_pred, quantile):
|
||||
"""Computes normalised quantile loss for numpy arrays.
|
||||
|
||||
Uses the q-Risk metric as defined in the "Training Procedure" section of the
|
||||
main TFT paper.
|
||||
|
||||
Args:
|
||||
y: Targets
|
||||
y_pred: Predictions
|
||||
quantile: Quantile to use for loss calculations (between 0 & 1)
|
||||
|
||||
Returns:
|
||||
Float for normalised quantile loss.
|
||||
"""
|
||||
prediction_underflow = y - y_pred
|
||||
weighted_errors = quantile * np.maximum(prediction_underflow, 0.0) + (1.0 - quantile) * np.maximum(
|
||||
-prediction_underflow, 0.0
|
||||
)
|
||||
|
||||
quantile_loss = weighted_errors.mean()
|
||||
normaliser = y.abs().mean()
|
||||
|
||||
return 2 * quantile_loss / normaliser
|
||||
|
||||
|
||||
# OS related functions.
|
||||
def create_folder_if_not_exist(directory):
|
||||
"""Creates folder if it doesn't exist.
|
||||
|
||||
Args:
|
||||
directory: Folder path to create.
|
||||
"""
|
||||
# Also creates directories recursively
|
||||
pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# Tensorflow related functions.
|
||||
def get_default_tensorflow_config(tf_device="gpu", gpu_id=0):
|
||||
"""Creates tensorflow config for graphs to run on CPU or GPU.
|
||||
|
||||
Specifies whether to run graph on gpu or cpu and which GPU ID to use for multi
|
||||
GPU machines.
|
||||
|
||||
Args:
|
||||
tf_device: 'cpu' or 'gpu'
|
||||
gpu_id: GPU ID to use if relevant
|
||||
|
||||
Returns:
|
||||
Tensorflow config.
|
||||
"""
|
||||
|
||||
if tf_device == "cpu":
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # for training on cpu
|
||||
tf_config = tf.ConfigProto(log_device_placement=False, device_count={"GPU": 0})
|
||||
|
||||
else:
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
||||
|
||||
print("Selecting GPU ID={}".format(gpu_id))
|
||||
|
||||
tf_config = tf.ConfigProto(log_device_placement=False)
|
||||
tf_config.gpu_options.allow_growth = True
|
||||
|
||||
return tf_config
|
||||
|
||||
|
||||
def save(tf_session, model_folder, cp_name, scope=None):
|
||||
"""Saves Tensorflow graph to checkpoint.
|
||||
|
||||
Saves all trainiable variables under a given variable scope to checkpoint.
|
||||
|
||||
Args:
|
||||
tf_session: Session containing graph
|
||||
model_folder: Folder to save models
|
||||
cp_name: Name of Tensorflow checkpoint
|
||||
scope: Variable scope containing variables to save
|
||||
"""
|
||||
# Save model
|
||||
if scope is None:
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
|
||||
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
|
||||
|
||||
save_path = saver.save(tf_session, os.path.join(model_folder, "{0}.ckpt".format(cp_name)))
|
||||
print("Model saved to: {0}".format(save_path))
|
||||
|
||||
|
||||
def load(tf_session, model_folder, cp_name, scope=None, verbose=False):
|
||||
"""Loads Tensorflow graph from checkpoint.
|
||||
|
||||
Args:
|
||||
tf_session: Session to load graph into
|
||||
model_folder: Folder containing serialised model
|
||||
cp_name: Name of Tensorflow checkpoint
|
||||
scope: Variable scope to use.
|
||||
verbose: Whether to print additional debugging information.
|
||||
"""
|
||||
# Load model proper
|
||||
load_path = os.path.join(model_folder, "{0}.ckpt".format(cp_name))
|
||||
|
||||
print("Loading model from {0}".format(load_path))
|
||||
|
||||
print_weights_in_checkpoint(model_folder, cp_name)
|
||||
|
||||
initial_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
|
||||
|
||||
# Saver
|
||||
if scope is None:
|
||||
saver = tf.train.Saver()
|
||||
else:
|
||||
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
|
||||
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
|
||||
# Load
|
||||
saver.restore(tf_session, load_path)
|
||||
all_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
|
||||
|
||||
if verbose:
|
||||
print("Restored {0}".format(",".join(initial_vars.difference(all_vars))))
|
||||
print("Existing {0}".format(",".join(all_vars.difference(initial_vars))))
|
||||
print("All {0}".format(",".join(all_vars)))
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
||||
def print_weights_in_checkpoint(model_folder, cp_name):
|
||||
"""Prints all weights in Tensorflow checkpoint.
|
||||
|
||||
Args:
|
||||
model_folder: Folder containing checkpoint
|
||||
cp_name: Name of checkpoint
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
load_path = os.path.join(model_folder, "{0}.ckpt".format(cp_name))
|
||||
|
||||
print_tensors_in_checkpoint_file(file_name=load_path, tensor_name="", all_tensors=True, all_tensor_names=True)
|
||||
|
||||
@@ -1,246 +1,248 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import tensorflow.compat.v1 as tf
|
||||
import data_formatters.base
|
||||
import expt_settings.configs
|
||||
import libs.hyperparam_opt
|
||||
import libs.tft_model
|
||||
import libs.utils as utils
|
||||
import os
|
||||
import datetime as dte
|
||||
|
||||
|
||||
from qlib.model.base import ModelFT
|
||||
from qlib.data.dataset import DatasetH
|
||||
from qlib.data.dataset.handler import DataHandlerLP
|
||||
|
||||
|
||||
|
||||
# To register new datasets, please add them here.
|
||||
ALLOW_DATASET = ['Alpha158']
|
||||
DATASET_SETTING = {
|
||||
'Alpha158': {
|
||||
'feature_col': ['RESI5', 'WVMA5', 'RSQR5', 'KLEN', 'RSQR10', 'CORR5', 'CORD5', 'CORR10', 'ROC60', 'RESI10'],
|
||||
'label_col': ['LABEL0'],
|
||||
},
|
||||
}
|
||||
# To register new datasets, please add their configurations here.
|
||||
|
||||
def get_shifted_label(data_df, shifts=5, col_shift='LABEL0'):
|
||||
return data_df[[col_shift]].groupby('instrument').apply(lambda df: df.shift(shifts))
|
||||
|
||||
def fill_test_na(test_df):
|
||||
test_df_res = test_df.copy()
|
||||
feature_cols = ~test_df_res.columns.str.contains('label', case=False)
|
||||
test_feature_fna = test_df_res.loc[:, feature_cols].groupby('datetime').apply(lambda df: df.fillna(df.mean()))
|
||||
test_df_res.loc[:, feature_cols] = test_feature_fna
|
||||
return test_df_res
|
||||
|
||||
def process_qlib_data(df, dataset, fillna=False):
|
||||
"""Prepare data to fit the TFT model.
|
||||
|
||||
Args:
|
||||
df: Original DataFrame.
|
||||
fillna: Whether to fill the data with the mean values.
|
||||
|
||||
Returns:
|
||||
Transformed DataFrame.
|
||||
|
||||
"""
|
||||
# Several features selected manually
|
||||
feature_col = DATASET_SETTING[dataset]['feature_col']
|
||||
label_col = DATASET_SETTING[dataset]['label_col']
|
||||
temp_df = df.loc[:, feature_col+label_col]
|
||||
if fillna:
|
||||
temp_df = fill_test_na(temp_df)
|
||||
temp_df = temp_df.swaplevel()
|
||||
temp_df = temp_df.sort_index()
|
||||
temp_df = temp_df.reset_index(level=0)
|
||||
dates = pd.to_datetime(temp_df.index)
|
||||
temp_df['date'] = dates
|
||||
temp_df['day_of_week'] = dates.dayofweek
|
||||
temp_df['month'] = dates.month
|
||||
temp_df['year'] = dates.year
|
||||
temp_df['const'] = 1.0
|
||||
return temp_df
|
||||
|
||||
def process_predicted(df, col_name):
|
||||
"""Transform the TFT predicted data into Qlib format.
|
||||
|
||||
Args:
|
||||
df: Original DataFrame.
|
||||
fillna: New column name.
|
||||
|
||||
Returns:
|
||||
Transformed DataFrame.
|
||||
|
||||
"""
|
||||
df_res = df.copy()
|
||||
df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+0": col_name})
|
||||
df_res = df_res.set_index(['datetime','instrument']).sort_index()
|
||||
df_res = df_res[[col_name]]
|
||||
return df_res
|
||||
|
||||
def format_score(forecast_df, col_name='pred', label_shift=5):
|
||||
pred = process_predicted(forecast_df, col_name=col_name)
|
||||
pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
|
||||
pred = pred.dropna()[col_name]
|
||||
return pred
|
||||
|
||||
def transform_df(df, col_name='LABEL0'):
|
||||
df_res = df['feature']
|
||||
df_res[col_name] = df['label']
|
||||
return df_res
|
||||
|
||||
class TFTModel(ModelFT):
|
||||
"""TFT Model"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.model = None
|
||||
|
||||
def _prepare_data(self, dataset: DatasetH):
|
||||
df_train, df_valid = dataset.prepare(
|
||||
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
|
||||
)
|
||||
return transform_df(df_train), transform_df(df_valid)
|
||||
|
||||
def fit(
|
||||
self,
|
||||
dataset: DatasetH,
|
||||
DATASET = 'Alpha158',
|
||||
MODEL_FOLDER = 'qlib_alpha158_model',
|
||||
LABEL_COL = 'LABEL0',
|
||||
LABEL_SHIFT = 5,
|
||||
USE_GPU_ID = 0,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
if DATASET not in ALLOW_DATASET:
|
||||
raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
|
||||
|
||||
dtrain, dvalid = self._prepare_data(dataset)
|
||||
dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
|
||||
dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
|
||||
|
||||
|
||||
train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
|
||||
valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
|
||||
|
||||
ExperimentConfig = expt_settings.configs.ExperimentConfig
|
||||
config = ExperimentConfig(DATASET)
|
||||
self.data_formatter = config.make_data_formatter()
|
||||
self.model_folder = MODEL_FOLDER
|
||||
self.gpu_id = USE_GPU_ID
|
||||
self.label_shift = LABEL_SHIFT
|
||||
self.expt_name = DATASET
|
||||
self.label_col = LABEL_COL
|
||||
|
||||
use_gpu = (True, self.gpu_id)
|
||||
#===========================Training Process===========================
|
||||
ModelClass = libs.tft_model.TemporalFusionTransformer
|
||||
if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
|
||||
raise ValueError(
|
||||
"Data formatters should inherit from" +
|
||||
"AbstractDataFormatter! Type={}".format(type(self.data_formatter)))
|
||||
|
||||
default_keras_session = tf.keras.backend.get_session()
|
||||
|
||||
if use_gpu[0]:
|
||||
self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
|
||||
else:
|
||||
self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
|
||||
|
||||
self.data_formatter.set_scalers(train)
|
||||
|
||||
# Sets up default params
|
||||
fixed_params = self.data_formatter.get_experiment_params()
|
||||
params = self.data_formatter.get_default_model_params()
|
||||
|
||||
# Wendi: 合并调优的参数和非调优的参数
|
||||
params = {**params, **fixed_params}
|
||||
|
||||
if not os.path.exists(self.model_folder):
|
||||
os.makedirs(self.model_folder)
|
||||
params['model_folder'] = self.model_folder
|
||||
|
||||
print("*** Begin training ***")
|
||||
best_loss = np.Inf
|
||||
|
||||
tf.reset_default_graph()
|
||||
|
||||
self.tf_graph = tf.Graph()
|
||||
with self.tf_graph.as_default():
|
||||
self.sess = tf.Session(config=self.tf_config)
|
||||
tf.keras.backend.set_session(self.sess)
|
||||
self.model = ModelClass(params, use_cudnn=use_gpu[0])
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.model.fit(train_df=train, valid_df=valid)
|
||||
print("*** Finished training ***")
|
||||
saved_model_dir = self.model_folder+'/'+'saved_model'
|
||||
if not os.path.exists(saved_model_dir):
|
||||
os.makedirs(saved_model_dir)
|
||||
self.model.save(saved_model_dir)
|
||||
|
||||
def extract_numerical_data(data):
|
||||
"""Strips out forecast time and identifier columns."""
|
||||
return data[[
|
||||
col for col in data.columns
|
||||
if col not in {"forecast_time", "identifier"}
|
||||
]]
|
||||
|
||||
#p50_loss = utils.numpy_normalised_quantile_loss(
|
||||
# extract_numerical_data(targets), extract_numerical_data(p50_forecast),
|
||||
# 0.5)
|
||||
#p90_loss = utils.numpy_normalised_quantile_loss(
|
||||
# extract_numerical_data(targets), extract_numerical_data(p90_forecast),
|
||||
# 0.9)
|
||||
tf.keras.backend.set_session(default_keras_session)
|
||||
print("Training completed.".format(dte.datetime.now()))
|
||||
#===========================Training Process===========================
|
||||
|
||||
def predict(self, dataset):
|
||||
if self.model is None:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
d_test = dataset.prepare("test", col_set=["feature", "label"])
|
||||
d_test = transform_df(d_test)
|
||||
d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
|
||||
test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
|
||||
|
||||
use_gpu = (True, self.gpu_id)
|
||||
#===========================Predicting Process===========================
|
||||
default_keras_session = tf.keras.backend.get_session()
|
||||
|
||||
# Sets up default params
|
||||
fixed_params = self.data_formatter.get_experiment_params()
|
||||
params = self.data_formatter.get_default_model_params()
|
||||
params = {**params, **fixed_params}
|
||||
|
||||
|
||||
print("*** Begin predicting ***")
|
||||
tf.reset_default_graph()
|
||||
|
||||
with self.tf_graph.as_default():
|
||||
tf.keras.backend.set_session(self.sess)
|
||||
output_map = self.model.predict(test, return_targets=True)
|
||||
targets = self.data_formatter.format_predictions(output_map["targets"])
|
||||
p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
|
||||
p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
|
||||
tf.keras.backend.set_session(default_keras_session)
|
||||
|
||||
predict = format_score(p90_forecast, 'pred', self.label_shift)
|
||||
label = format_score(targets, 'label', self.label_shift)
|
||||
#===========================Predicting Process===========================
|
||||
return predict, label
|
||||
|
||||
def finetune(self, dataset: DatasetH):
|
||||
"""
|
||||
finetune model
|
||||
Parameters
|
||||
----------
|
||||
dataset : DatasetH
|
||||
dataset for finetuning
|
||||
"""
|
||||
pass
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import tensorflow.compat.v1 as tf
|
||||
import data_formatters.base
|
||||
import expt_settings.configs
|
||||
import libs.hyperparam_opt
|
||||
import libs.tft_model
|
||||
import libs.utils as utils
|
||||
import os
|
||||
import datetime as dte
|
||||
|
||||
|
||||
from qlib.model.base import ModelFT
|
||||
from qlib.data.dataset import DatasetH
|
||||
from qlib.data.dataset.handler import DataHandlerLP
|
||||
|
||||
|
||||
# To register new datasets, please add them here.
|
||||
ALLOW_DATASET = ["Alpha158"]
|
||||
DATASET_SETTING = {
|
||||
"Alpha158": {
|
||||
"feature_col": ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10", "ROC60", "RESI10"],
|
||||
"label_col": ["LABEL0"],
|
||||
},
|
||||
}
|
||||
# To register new datasets, please add their configurations here.
|
||||
|
||||
|
||||
def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
|
||||
return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
|
||||
|
||||
|
||||
def fill_test_na(test_df):
|
||||
test_df_res = test_df.copy()
|
||||
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
|
||||
test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
|
||||
test_df_res.loc[:, feature_cols] = test_feature_fna
|
||||
return test_df_res
|
||||
|
||||
|
||||
def process_qlib_data(df, dataset, fillna=False):
|
||||
"""Prepare data to fit the TFT model.
|
||||
|
||||
Args:
|
||||
df: Original DataFrame.
|
||||
fillna: Whether to fill the data with the mean values.
|
||||
|
||||
Returns:
|
||||
Transformed DataFrame.
|
||||
|
||||
"""
|
||||
# Several features selected manually
|
||||
feature_col = DATASET_SETTING[dataset]["feature_col"]
|
||||
label_col = DATASET_SETTING[dataset]["label_col"]
|
||||
temp_df = df.loc[:, feature_col + label_col]
|
||||
if fillna:
|
||||
temp_df = fill_test_na(temp_df)
|
||||
temp_df = temp_df.swaplevel()
|
||||
temp_df = temp_df.sort_index()
|
||||
temp_df = temp_df.reset_index(level=0)
|
||||
dates = pd.to_datetime(temp_df.index)
|
||||
temp_df["date"] = dates
|
||||
temp_df["day_of_week"] = dates.dayofweek
|
||||
temp_df["month"] = dates.month
|
||||
temp_df["year"] = dates.year
|
||||
temp_df["const"] = 1.0
|
||||
return temp_df
|
||||
|
||||
|
||||
def process_predicted(df, col_name):
|
||||
"""Transform the TFT predicted data into Qlib format.
|
||||
|
||||
Args:
|
||||
df: Original DataFrame.
|
||||
fillna: New column name.
|
||||
|
||||
Returns:
|
||||
Transformed DataFrame.
|
||||
|
||||
"""
|
||||
df_res = df.copy()
|
||||
df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+0": col_name})
|
||||
df_res = df_res.set_index(["datetime", "instrument"]).sort_index()
|
||||
df_res = df_res[[col_name]]
|
||||
return df_res
|
||||
|
||||
|
||||
def format_score(forecast_df, col_name="pred", label_shift=5):
|
||||
pred = process_predicted(forecast_df, col_name=col_name)
|
||||
pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
|
||||
pred = pred.dropna()[col_name]
|
||||
return pred
|
||||
|
||||
|
||||
def transform_df(df, col_name="LABEL0"):
|
||||
df_res = df["feature"]
|
||||
df_res[col_name] = df["label"]
|
||||
return df_res
|
||||
|
||||
|
||||
class TFTModel(ModelFT):
|
||||
"""TFT Model"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.model = None
|
||||
|
||||
def _prepare_data(self, dataset: DatasetH):
|
||||
df_train, df_valid = dataset.prepare(
|
||||
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
|
||||
)
|
||||
return transform_df(df_train), transform_df(df_valid)
|
||||
|
||||
def fit(
|
||||
self,
|
||||
dataset: DatasetH,
|
||||
DATASET="Alpha158",
|
||||
MODEL_FOLDER="qlib_alpha158_model",
|
||||
LABEL_COL="LABEL0",
|
||||
LABEL_SHIFT=5,
|
||||
USE_GPU_ID=0,
|
||||
**kwargs
|
||||
):
|
||||
|
||||
if DATASET not in ALLOW_DATASET:
|
||||
raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
|
||||
|
||||
dtrain, dvalid = self._prepare_data(dataset)
|
||||
dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
|
||||
dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
|
||||
|
||||
train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
|
||||
valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
|
||||
|
||||
ExperimentConfig = expt_settings.configs.ExperimentConfig
|
||||
config = ExperimentConfig(DATASET)
|
||||
self.data_formatter = config.make_data_formatter()
|
||||
self.model_folder = MODEL_FOLDER
|
||||
self.gpu_id = USE_GPU_ID
|
||||
self.label_shift = LABEL_SHIFT
|
||||
self.expt_name = DATASET
|
||||
self.label_col = LABEL_COL
|
||||
|
||||
use_gpu = (True, self.gpu_id)
|
||||
# ===========================Training Process===========================
|
||||
ModelClass = libs.tft_model.TemporalFusionTransformer
|
||||
if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
|
||||
raise ValueError(
|
||||
"Data formatters should inherit from"
|
||||
+ "AbstractDataFormatter! Type={}".format(type(self.data_formatter))
|
||||
)
|
||||
|
||||
default_keras_session = tf.keras.backend.get_session()
|
||||
|
||||
if use_gpu[0]:
|
||||
self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
|
||||
else:
|
||||
self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
|
||||
|
||||
self.data_formatter.set_scalers(train)
|
||||
|
||||
# Sets up default params
|
||||
fixed_params = self.data_formatter.get_experiment_params()
|
||||
params = self.data_formatter.get_default_model_params()
|
||||
|
||||
# Wendi: 合并调优的参数和非调优的参数
|
||||
params = {**params, **fixed_params}
|
||||
|
||||
if not os.path.exists(self.model_folder):
|
||||
os.makedirs(self.model_folder)
|
||||
params["model_folder"] = self.model_folder
|
||||
|
||||
print("*** Begin training ***")
|
||||
best_loss = np.Inf
|
||||
|
||||
tf.reset_default_graph()
|
||||
|
||||
self.tf_graph = tf.Graph()
|
||||
with self.tf_graph.as_default():
|
||||
self.sess = tf.Session(config=self.tf_config)
|
||||
tf.keras.backend.set_session(self.sess)
|
||||
self.model = ModelClass(params, use_cudnn=use_gpu[0])
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.model.fit(train_df=train, valid_df=valid)
|
||||
print("*** Finished training ***")
|
||||
saved_model_dir = self.model_folder + "/" + "saved_model"
|
||||
if not os.path.exists(saved_model_dir):
|
||||
os.makedirs(saved_model_dir)
|
||||
self.model.save(saved_model_dir)
|
||||
|
||||
def extract_numerical_data(data):
|
||||
"""Strips out forecast time and identifier columns."""
|
||||
return data[[col for col in data.columns if col not in {"forecast_time", "identifier"}]]
|
||||
|
||||
# p50_loss = utils.numpy_normalised_quantile_loss(
|
||||
# extract_numerical_data(targets), extract_numerical_data(p50_forecast),
|
||||
# 0.5)
|
||||
# p90_loss = utils.numpy_normalised_quantile_loss(
|
||||
# extract_numerical_data(targets), extract_numerical_data(p90_forecast),
|
||||
# 0.9)
|
||||
tf.keras.backend.set_session(default_keras_session)
|
||||
print("Training completed.".format(dte.datetime.now()))
|
||||
# ===========================Training Process===========================
|
||||
|
||||
def predict(self, dataset):
|
||||
if self.model is None:
|
||||
raise ValueError("model is not fitted yet!")
|
||||
d_test = dataset.prepare("test", col_set=["feature", "label"])
|
||||
d_test = transform_df(d_test)
|
||||
d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
|
||||
test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
|
||||
|
||||
use_gpu = (True, self.gpu_id)
|
||||
# ===========================Predicting Process===========================
|
||||
default_keras_session = tf.keras.backend.get_session()
|
||||
|
||||
# Sets up default params
|
||||
fixed_params = self.data_formatter.get_experiment_params()
|
||||
params = self.data_formatter.get_default_model_params()
|
||||
params = {**params, **fixed_params}
|
||||
|
||||
print("*** Begin predicting ***")
|
||||
tf.reset_default_graph()
|
||||
|
||||
with self.tf_graph.as_default():
|
||||
tf.keras.backend.set_session(self.sess)
|
||||
output_map = self.model.predict(test, return_targets=True)
|
||||
targets = self.data_formatter.format_predictions(output_map["targets"])
|
||||
p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
|
||||
p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
|
||||
tf.keras.backend.set_session(default_keras_session)
|
||||
|
||||
predict = format_score(p90_forecast, "pred", self.label_shift)
|
||||
label = format_score(targets, "label", self.label_shift)
|
||||
# ===========================Predicting Process===========================
|
||||
return predict, label
|
||||
|
||||
def finetune(self, dataset: DatasetH):
|
||||
"""
|
||||
finetune model
|
||||
Parameters
|
||||
----------
|
||||
dataset : DatasetH
|
||||
dataset for finetuning
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -1,130 +1,132 @@
|
||||
#Copyright (c) Microsoft Corporation.
|
||||
#Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import qlib
|
||||
import pandas as pd
|
||||
from qlib.config import REG_CN
|
||||
from qlib.contrib.model.pytorch_lstm import LSTM
|
||||
from qlib.contrib.data.handler import ALPHA360_Denoise
|
||||
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
|
||||
from qlib.contrib.evaluate import (
|
||||
backtest as normal_backtest,
|
||||
risk_analysis,
|
||||
)
|
||||
from qlib.utils import exists_qlib_data
|
||||
|
||||
# from qlib.model.learner import train_model
|
||||
from qlib.utils import init_instance_by_config
|
||||
|
||||
import pickle
|
||||
from tft import TFTModel
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# use default data
|
||||
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
|
||||
if not exists_qlib_data(provider_uri):
|
||||
print(f"Qlib data is not found in {provider_uri}")
|
||||
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
|
||||
from get_data import GetData
|
||||
|
||||
GetData().qlib_data_cn(target_dir=provider_uri)
|
||||
|
||||
qlib.init(provider_uri=provider_uri, region=REG_CN)
|
||||
|
||||
MARKET = "csi300"
|
||||
BENCHMARK = "SH000300"
|
||||
|
||||
###################################
|
||||
# train model
|
||||
###################################
|
||||
DATA_HANDLER_CONFIG = {
|
||||
"start_time": "2008-01-01",
|
||||
"end_time": "2020-08-01",
|
||||
"fit_start_time": "2008-01-01",
|
||||
"fit_end_time": "2014-12-31",
|
||||
"instruments": MARKET,
|
||||
}
|
||||
|
||||
TRAINER_CONFIG = {
|
||||
"train_start_time": "2008-01-01",
|
||||
"train_end_time": "2014-12-31",
|
||||
"validate_start_time": "2015-01-01",
|
||||
"validate_end_time": "2016-12-31",
|
||||
"test_start_time": "2017-01-01",
|
||||
"test_end_time": "2020-08-01",
|
||||
}
|
||||
|
||||
task = {
|
||||
"dataset": {
|
||||
"class": "DatasetH",
|
||||
"module_path": "qlib.data.dataset",
|
||||
"kwargs": {
|
||||
'handler': {
|
||||
"class": "Alpha158",
|
||||
"module_path": "qlib.contrib.data.handler",
|
||||
"kwargs": DATA_HANDLER_CONFIG
|
||||
},
|
||||
'segments': {
|
||||
'train': ("2008-01-01", "2014-12-31"),
|
||||
'valid': ("2015-01-01", "2016-12-31",),
|
||||
'test': ("2017-01-01", "2020-08-01",),
|
||||
}
|
||||
}
|
||||
}
|
||||
# You shoud record the data in specific sequence
|
||||
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
|
||||
}
|
||||
|
||||
|
||||
model = TFTModel()
|
||||
dataset = init_instance_by_config(task["dataset"])
|
||||
model.fit(dataset)
|
||||
|
||||
pred_score, label_score = model.predict(dataset)
|
||||
|
||||
# save pred_score to file
|
||||
pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
|
||||
pred_score_path.parent.mkdir(exist_ok=True, parents=True)
|
||||
pred_score.to_pickle(pred_score_path)
|
||||
|
||||
|
||||
###################################
|
||||
# backtest
|
||||
###################################
|
||||
STRATEGY_CONFIG = {
|
||||
"topk": 50,
|
||||
"n_drop": 5,
|
||||
}
|
||||
BACKTEST_CONFIG = {
|
||||
"verbose": False,
|
||||
"limit_threshold": 0.095,
|
||||
"account": 100000000,
|
||||
"benchmark": BENCHMARK,
|
||||
"deal_price": "close",
|
||||
"open_cost": 0.0005,
|
||||
"close_cost": 0.0015,
|
||||
"min_cost": 5,
|
||||
}
|
||||
|
||||
# use default strategy
|
||||
# custom Strategy, refer to: TODO: Strategy API url
|
||||
strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
|
||||
report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
|
||||
|
||||
###################################
|
||||
# analyze
|
||||
# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
|
||||
###################################
|
||||
analysis = dict()
|
||||
analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
|
||||
analysis["excess_return_with_cost"] = risk_analysis(
|
||||
report_normal["return"] - report_normal["bench"] - report_normal["cost"]
|
||||
)
|
||||
analysis_df = pd.concat(analysis) # type: pd.DataFrame
|
||||
print(analysis_df)
|
||||
|
||||
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import qlib
|
||||
import pandas as pd
|
||||
from qlib.config import REG_CN
|
||||
from qlib.contrib.model.pytorch_lstm import LSTM
|
||||
from qlib.contrib.data.handler import ALPHA360_Denoise
|
||||
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
|
||||
from qlib.contrib.evaluate import (
|
||||
backtest as normal_backtest,
|
||||
risk_analysis,
|
||||
)
|
||||
from qlib.utils import exists_qlib_data
|
||||
|
||||
# from qlib.model.learner import train_model
|
||||
from qlib.utils import init_instance_by_config
|
||||
|
||||
import pickle
|
||||
from tft import TFTModel
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# use default data
|
||||
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
|
||||
if not exists_qlib_data(provider_uri):
|
||||
print(f"Qlib data is not found in {provider_uri}")
|
||||
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
|
||||
from get_data import GetData
|
||||
|
||||
GetData().qlib_data_cn(target_dir=provider_uri)
|
||||
|
||||
qlib.init(provider_uri=provider_uri, region=REG_CN)
|
||||
|
||||
MARKET = "csi300"
|
||||
BENCHMARK = "SH000300"
|
||||
|
||||
###################################
|
||||
# train model
|
||||
###################################
|
||||
DATA_HANDLER_CONFIG = {
|
||||
"start_time": "2008-01-01",
|
||||
"end_time": "2020-08-01",
|
||||
"fit_start_time": "2008-01-01",
|
||||
"fit_end_time": "2014-12-31",
|
||||
"instruments": MARKET,
|
||||
}
|
||||
|
||||
TRAINER_CONFIG = {
|
||||
"train_start_time": "2008-01-01",
|
||||
"train_end_time": "2014-12-31",
|
||||
"validate_start_time": "2015-01-01",
|
||||
"validate_end_time": "2016-12-31",
|
||||
"test_start_time": "2017-01-01",
|
||||
"test_end_time": "2020-08-01",
|
||||
}
|
||||
|
||||
task = {
|
||||
"dataset": {
|
||||
"class": "DatasetH",
|
||||
"module_path": "qlib.data.dataset",
|
||||
"kwargs": {
|
||||
"handler": {
|
||||
"class": "Alpha158",
|
||||
"module_path": "qlib.contrib.data.handler",
|
||||
"kwargs": DATA_HANDLER_CONFIG,
|
||||
},
|
||||
"segments": {
|
||||
"train": ("2008-01-01", "2014-12-31"),
|
||||
"valid": (
|
||||
"2015-01-01",
|
||||
"2016-12-31",
|
||||
),
|
||||
"test": (
|
||||
"2017-01-01",
|
||||
"2020-08-01",
|
||||
),
|
||||
},
|
||||
},
|
||||
}
|
||||
# You shoud record the data in specific sequence
|
||||
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
|
||||
}
|
||||
|
||||
model = TFTModel()
|
||||
dataset = init_instance_by_config(task["dataset"])
|
||||
model.fit(dataset)
|
||||
|
||||
pred_score, label_score = model.predict(dataset)
|
||||
|
||||
# save pred_score to file
|
||||
pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
|
||||
pred_score_path.parent.mkdir(exist_ok=True, parents=True)
|
||||
pred_score.to_pickle(pred_score_path)
|
||||
|
||||
###################################
|
||||
# backtest
|
||||
###################################
|
||||
STRATEGY_CONFIG = {
|
||||
"topk": 50,
|
||||
"n_drop": 5,
|
||||
}
|
||||
BACKTEST_CONFIG = {
|
||||
"verbose": False,
|
||||
"limit_threshold": 0.095,
|
||||
"account": 100000000,
|
||||
"benchmark": BENCHMARK,
|
||||
"deal_price": "close",
|
||||
"open_cost": 0.0005,
|
||||
"close_cost": 0.0015,
|
||||
"min_cost": 5,
|
||||
}
|
||||
|
||||
# use default strategy
|
||||
# custom Strategy, refer to: TODO: Strategy API url
|
||||
strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
|
||||
report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
|
||||
|
||||
###################################
|
||||
# analyze
|
||||
# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
|
||||
###################################
|
||||
analysis = dict()
|
||||
analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
|
||||
analysis["excess_return_with_cost"] = risk_analysis(
|
||||
report_normal["return"] - report_normal["bench"] - report_normal["cost"]
|
||||
)
|
||||
analysis_df = pd.concat(analysis) # type: pd.DataFrame
|
||||
print(analysis_df)
|
||||
|
||||
Reference in New Issue
Block a user