# coding=utf-8 # Copyright 2020 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Lint as: python3 """Custom formatting functions for Alpha158 dataset. Defines dataset specific column definitions and data transformations. """ import data_formatters.base import libs.utils as utils import sklearn.preprocessing GenericDataFormatter = data_formatters.base.GenericDataFormatter DataTypes = data_formatters.base.DataTypes InputTypes = data_formatters.base.InputTypes class Alpha158Formatter(GenericDataFormatter): """Defines and formats data for the Alpha158 dataset. Attributes: column_definition: Defines input and data type of column used in the experiment. identifiers: Entity identifiers used in experiments. """ _column_definition = [ ("instrument", DataTypes.CATEGORICAL, InputTypes.ID), ("LABEL0", DataTypes.REAL_VALUED, InputTypes.TARGET), ("date", DataTypes.DATE, InputTypes.TIME), ("month", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), ("day_of_week", DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT), # Selected features ("RESI5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("WVMA5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("RSQR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("KLEN", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("RSQR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORR5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORR10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("ROC60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("RESI10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("VSTD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("RSQR60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORR60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("WVMA60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("STD5", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("RSQR20", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORD60", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORD10", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("CORR20", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("KLOW", DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT), ("const", DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT), ] def __init__(self): """Initialises formatter.""" self.identifiers = None self._real_scalers = None self._cat_scalers = None self._target_scaler = None self._num_classes_per_cat_input = None def split_data(self, df, valid_boundary=2016, test_boundary=2018): """Splits data frame into training-validation-test data frames. This also calibrates scaling object, and transforms data for each split. Args: df: Source data frame to split. valid_boundary: Starting year for validation data test_boundary: Starting year for test data Returns: Tuple of transformed (train, valid, test) data. """ print("Formatting train-valid-test splits.") index = df["year"] train = df.loc[index < valid_boundary] valid = df.loc[(index >= valid_boundary) & (index < test_boundary)] test = df.loc[index >= test_boundary] self.set_scalers(train) return (self.transform_inputs(data) for data in [train, valid, test]) def set_scalers(self, df): """Calibrates scalers using the data supplied. Args: df: Data to use to calibrate scalers. """ print("Setting scalers with training data...") column_definitions = self.get_column_definition() id_column = utils.get_single_col_by_input_type(InputTypes.ID, column_definitions) target_column = utils.get_single_col_by_input_type(InputTypes.TARGET, column_definitions) # Extract identifiers in case required self.identifiers = list(df[id_column].unique()) # Format real scalers real_inputs = utils.extract_cols_from_data_type( DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME} ) data = df[real_inputs].values self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data) self._target_scaler = sklearn.preprocessing.StandardScaler().fit( df[[target_column]].values ) # used for predictions # Format categorical scalers categorical_inputs = utils.extract_cols_from_data_type( DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME} ) categorical_scalers = {} num_classes = [] for col in categorical_inputs: # Set all to str so that we don't have mixed integer/string columns srs = df[col].apply(str) categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values) num_classes.append(srs.nunique()) # Set categorical scaler outputs self._cat_scalers = categorical_scalers self._num_classes_per_cat_input = num_classes def transform_inputs(self, df): """Performs feature transformations. This includes both feature engineering, preprocessing and normalisation. Args: df: Data frame to transform. Returns: Transformed data frame. """ output = df.copy() if self._real_scalers is None and self._cat_scalers is None: raise ValueError("Scalers have not been set!") column_definitions = self.get_column_definition() real_inputs = utils.extract_cols_from_data_type( DataTypes.REAL_VALUED, column_definitions, {InputTypes.ID, InputTypes.TIME} ) categorical_inputs = utils.extract_cols_from_data_type( DataTypes.CATEGORICAL, column_definitions, {InputTypes.ID, InputTypes.TIME} ) # Format real inputs output[real_inputs] = self._real_scalers.transform(df[real_inputs].values) # Format categorical inputs for col in categorical_inputs: string_df = df[col].apply(str) output[col] = self._cat_scalers[col].transform(string_df) return output def format_predictions(self, predictions): """Reverts any normalisation to give predictions in original scale. Args: predictions: Dataframe of model predictions. Returns: Data frame of unnormalised predictions. """ output = predictions.copy() column_names = predictions.columns for col in column_names: if col not in {"forecast_time", "identifier"}: output[col] = self._target_scaler.inverse_transform(predictions[col]) return output # Default params def get_fixed_params(self): """Returns fixed model parameters for experiments.""" fixed_params = { "total_time_steps": 6 + 6, "num_encoder_steps": 6, "num_epochs": 100, "early_stopping_patience": 10, "multiprocessing_workers": 5, } return fixed_params def get_default_model_params(self): """Returns default optimised model parameters.""" model_params = { "dropout_rate": 0.4, "hidden_layer_size": 160, "learning_rate": 0.0001, "minibatch_size": 128, "max_gradient_norm": 0.0135, "num_heads": 1, "stack_size": 1, } return model_params