1
0
mirror of https://github.com/microsoft/qlib.git synced 2026-06-06 05:51:17 +08:00

Add TFT benchmark

This commit is contained in:
Wendi Li
2020-11-23 15:54:27 +08:00
committed by you-n-g
parent c897ecac33
commit 93323ed6b3
15 changed files with 3971 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,235 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Default data formatting functions for experiments.
For new datasets, inherit form GenericDataFormatter and implement
all abstract functions.
These dataset-specific methods:
1) Define the column and input types for tabular dataframes used by model
2) Perform the necessary input feature engineering & normalisation steps
3) Reverts the normalisation for predictions
4) Are responsible for train, validation and test splits
"""
import abc
import enum
# Type defintions
class DataTypes(enum.IntEnum):
"""Defines numerical types of each column."""
REAL_VALUED = 0
CATEGORICAL = 1
DATE = 2
class InputTypes(enum.IntEnum):
"""Defines input types of each column."""
TARGET = 0
OBSERVED_INPUT = 1
KNOWN_INPUT = 2
STATIC_INPUT = 3
ID = 4 # Single column used as an entity identifier
TIME = 5 # Single column exclusively used as a time index
class GenericDataFormatter(abc.ABC):
"""Abstract base class for all data formatters.
User can implement the abstract methods below to perform dataset-specific
manipulations.
"""
@abc.abstractmethod
def set_scalers(self, df):
"""Calibrates scalers using the data supplied."""
raise NotImplementedError()
@abc.abstractmethod
def transform_inputs(self, df):
"""Performs feature transformation."""
raise NotImplementedError()
@abc.abstractmethod
def format_predictions(self, df):
"""Reverts any normalisation to give predictions in original scale."""
raise NotImplementedError()
@abc.abstractmethod
def split_data(self, df):
"""Performs the default train, validation and test splits."""
raise NotImplementedError()
@property
@abc.abstractmethod
def _column_definition(self):
"""Defines order, input type and data type of each column."""
raise NotImplementedError()
@abc.abstractmethod
def get_fixed_params(self):
"""Defines the fixed parameters used by the model for training.
Requires the following keys:
'total_time_steps': Defines the total number of time steps used by TFT
'num_encoder_steps': Determines length of LSTM encoder (i.e. history)
'num_epochs': Maximum number of epochs for training
'early_stopping_patience': Early stopping param for keras
'multiprocessing_workers': # of cpus for data processing
Returns:
A dictionary of fixed parameters, e.g.:
fixed_params = {
'total_time_steps': 252 + 5,
'num_encoder_steps': 252,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5,
}
"""
raise NotImplementedError
# Shared functions across data-formatters
@property
def num_classes_per_cat_input(self):
"""Returns number of categories per relevant input.
This is seqeuently required for keras embedding layers.
"""
return self._num_classes_per_cat_input
def get_num_samples_for_calibration(self):
"""Gets the default number of training and validation samples.
Use to sub-sample the data for network calibration and a value of -1 uses
all available samples.
Returns:
Tuple of (training samples, validation samples)
"""
return -1, -1
def get_column_definition(self):
""""Returns formatted column definition in order expected by the TFT."""
column_definition = self._column_definition
# Sanity checks first.
# Ensure only one ID and time column exist
def _check_single_column(input_type):
length = len([tup for tup in column_definition if tup[2] == input_type])
if length != 1:
raise ValueError('Illegal number of inputs ({}) of type {}'.format(
length, input_type))
_check_single_column(InputTypes.ID)
_check_single_column(InputTypes.TIME)
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
real_inputs = [
tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
categorical_inputs = [
tup for tup in column_definition if tup[1] == DataTypes.CATEGORICAL and
tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
return identifier + time + real_inputs + categorical_inputs
def _get_input_columns(self):
"""Returns names of all input columns."""
return [
tup[0]
for tup in self.get_column_definition()
if tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
def _get_tft_input_indices(self):
"""Returns the relevant indexes and input sizes required by TFT."""
# Functions
def _extract_tuples_from_data_type(data_type, defn):
return [
tup for tup in defn if tup[1] == data_type and
tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
def _get_locations(input_types, defn):
return [i for i, tup in enumerate(defn) if tup[2] in input_types]
# Start extraction
column_definition = [
tup for tup in self.get_column_definition()
if tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
categorical_inputs = _extract_tuples_from_data_type(DataTypes.CATEGORICAL,
column_definition)
real_inputs = _extract_tuples_from_data_type(DataTypes.REAL_VALUED,
column_definition)
locations = {
'input_size':
len(self._get_input_columns()),
'output_size':
len(_get_locations({InputTypes.TARGET}, column_definition)),
'category_counts':
self.num_classes_per_cat_input,
'input_obs_loc':
_get_locations({InputTypes.TARGET}, column_definition),
'static_input_loc':
_get_locations({InputTypes.STATIC_INPUT}, column_definition),
'known_regular_inputs':
_get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
real_inputs),
'known_categorical_inputs':
_get_locations({InputTypes.STATIC_INPUT, InputTypes.KNOWN_INPUT},
categorical_inputs),
}
return locations
def get_experiment_params(self):
"""Returns fixed model parameters for experiments."""
required_keys = [
'total_time_steps', 'num_encoder_steps', 'num_epochs',
'early_stopping_patience', 'multiprocessing_workers'
]
fixed_params = self.get_fixed_params()
for k in required_keys:
if k not in fixed_params:
raise ValueError('Field {}'.format(k) +
' missing from fixed parameter definitions!')
fixed_params['column_definition'] = self.get_column_definition()
fixed_params.update(self._get_tft_input_indices())
return fixed_params

View File

@@ -0,0 +1,261 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Electricity dataset.
Defines dataset specific column definitions and data transformations. Uses
entity specific z-score normalization.
"""
import data_formatters.base
import libs.utils as utils
import pandas as pd
import sklearn.preprocessing
GenericDataFormatter = data_formatters.base.GenericDataFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class ElectricityFormatter(GenericDataFormatter):
"""Defines and formats data for the electricity dataset.
Note that per-entity z-score normalization is used here, and is implemented
across functions.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
('id', DataTypes.REAL_VALUED, InputTypes.ID),
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
('power_usage', DataTypes.REAL_VALUED, InputTypes.TARGET),
('hour', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
self._time_steps = self.get_fixed_params()['total_time_steps']
def split_data(self, df, valid_boundary=1315, test_boundary=1339):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print('Formatting train-valid-test splits.')
index = df['days_from_start']
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
test = df.loc[index >= test_boundary - 7]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df):
"""Calibrates scalers using the data supplied.
Args:
df: Data to use to calibrate scalers.
"""
print('Setting scalers with training data...')
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
column_definitions)
# Format real scalers
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
# Initialise scaler caches
self._real_scalers = {}
self._target_scaler = {}
identifiers = []
for identifier, sliced in df.groupby(id_column):
if len(sliced) >= self._time_steps:
data = sliced[real_inputs].values
targets = sliced[[target_column]].values
self._real_scalers[identifier] \
= sklearn.preprocessing.StandardScaler().fit(data)
self._target_scaler[identifier] \
= sklearn.preprocessing.StandardScaler().fit(targets)
identifiers.append(identifier)
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_scalers = {}
num_classes = []
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str)
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
# Extract identifiers in case required
self.identifiers = identifiers
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError('Scalers have not been set!')
# Extract relevant columns
column_definitions = self.get_column_definition()
id_col = utils.get_single_col_by_input_type(InputTypes.ID,
column_definitions)
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
# Transform real inputs per entity
df_list = []
for identifier, sliced in df.groupby(id_col):
# Filter out any trajectories that are too short
if len(sliced) >= self._time_steps:
sliced_copy = sliced.copy()
sliced_copy[real_inputs] = self._real_scalers[identifier].transform(
sliced_copy[real_inputs].values)
df_list.append(sliced_copy)
output = pd.concat(df_list, axis=0)
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
if self._target_scaler is None:
raise ValueError('Scalers have not been set!')
column_names = predictions.columns
df_list = []
for identifier, sliced in predictions.groupby('identifier'):
sliced_copy = sliced.copy()
target_scaler = self._target_scaler[identifier]
for col in column_names:
if col not in {'forecast_time', 'identifier'}:
sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
df_list.append(sliced_copy)
output = pd.concat(df_list, axis=0)
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
'total_time_steps': 8 * 24,
'num_encoder_steps': 7 * 24,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
'dropout_rate': 0.1,
'hidden_layer_size': 160,
'learning_rate': 0.001,
'minibatch_size': 64,
'max_gradient_norm': 0.01,
'num_heads': 4,
'stack_size': 1
}
return model_params
def get_num_samples_for_calibration(self):
"""Gets the default number of training and validation samples.
Use to sub-sample the data for network calibration and a value of -1 uses
all available samples.
Returns:
Tuple of (training samples, validation samples)
"""
return 450000, 50000

View File

@@ -0,0 +1,327 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Favorita dataset.
Defines dataset specific column definitions and data transformations.
"""
import data_formatters.base
import libs.utils as utils
import pandas as pd
import sklearn.preprocessing
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class FavoritaFormatter(data_formatters.base.GenericDataFormatter):
"""Defines and formats data for the Favorita dataset.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
('traj_id', DataTypes.REAL_VALUED, InputTypes.ID),
('date', DataTypes.DATE, InputTypes.TIME),
('log_sales', DataTypes.REAL_VALUED, InputTypes.TARGET),
('onpromotion', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('transactions', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('oil', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('day_of_month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('month', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('national_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('regional_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('local_hol', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('open', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('item_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('store_nbr', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('city', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('state', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('type', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('cluster', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('family', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('class', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
('perishable', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT)
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
def split_data(self, df, valid_boundary=None, test_boundary=None):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print('Formatting train-valid-test splits.')
if valid_boundary is None:
valid_boundary = pd.datetime(2015, 12, 1)
fixed_params = self.get_fixed_params()
time_steps = fixed_params['total_time_steps']
lookback = fixed_params['num_encoder_steps']
forecast_horizon = time_steps - lookback
df['date'] = pd.to_datetime(df['date'])
df_lists = {'train': [], 'valid': [], 'test': []}
for _, sliced in df.groupby('traj_id'):
index = sliced['date']
train = sliced.loc[index < valid_boundary]
train_len = len(train)
valid_len = train_len + forecast_horizon
valid = sliced.iloc[train_len - lookback:valid_len, :]
test = sliced.iloc[valid_len - lookback:valid_len + forecast_horizon, :]
sliced_map = {'train': train, 'valid': valid, 'test': test}
for k in sliced_map:
item = sliced_map[k]
if len(item) >= time_steps:
df_lists[k].append(item)
dfs = {k: pd.concat(df_lists[k], axis=0) for k in df_lists}
train = dfs['train']
self.set_scalers(train, set_real=True)
# Use all data for label encoding to handle labels not present in training.
self.set_scalers(df, set_real=False)
# Filter out identifiers not present in training (i.e. cold-started items).
def filter_ids(frame):
identifiers = set(self.identifiers)
index = frame['traj_id']
return frame.loc[index.apply(lambda x: x in identifiers)]
valid = filter_ids(dfs['valid'])
test = filter_ids(dfs['test'])
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df, set_real=True):
"""Calibrates scalers using the data supplied.
Label encoding is applied to the entire dataset (i.e. including test),
so that unseen labels can be handled at run-time.
Args:
df: Data to use to calibrate scalers.
set_real: Whether to fit set real-valued or categorical scalers
"""
print('Setting scalers with training data...')
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
column_definitions)
if set_real:
# Extract identifiers in case required
self.identifiers = list(df[id_column].unique())
# Format real scalers
self._real_scalers = {}
for col in ['oil', 'transactions', 'log_sales']:
self._real_scalers[col] = (df[col].mean(), df[col].std())
self._target_scaler = (df[target_column].mean(), df[target_column].std())
else:
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_scalers = {}
num_classes = []
if self.identifiers is None:
raise ValueError('Scale real-valued inputs first!')
id_set = set(self.identifiers)
valid_idx = df['traj_id'].apply(lambda x: x in id_set)
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str).loc[valid_idx]
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
output = df.copy()
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError('Scalers have not been set!')
column_definitions = self.get_column_definition()
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
# Format real inputs
for col in ['log_sales', 'oil', 'transactions']:
mean, std = self._real_scalers[col]
output[col] = (df[col] - mean) / std
if col == 'log_sales':
output[col] = output[col].fillna(0.) # mean imputation
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
output = predictions.copy()
column_names = predictions.columns
mean, std = self._target_scaler
for col in column_names:
if col not in {'forecast_time', 'identifier'}:
output[col] = (predictions[col] * std) + mean
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
'total_time_steps': 120,
'num_encoder_steps': 90,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
'dropout_rate': 0.1,
'hidden_layer_size': 240,
'learning_rate': 0.001,
'minibatch_size': 128,
'max_gradient_norm': 100.,
'num_heads': 4,
'stack_size': 1
}
return model_params
def get_num_samples_for_calibration(self):
"""Gets the default number of training and validation samples.
Use to sub-sample the data for network calibration and a value of -1 uses
all available samples.
Returns:
Tuple of (training samples, validation samples)
"""
return 450000, 50000
def get_column_definition(self):
""""Formats column definition in order expected by the TFT.
Modified for Favorita to match column order of original experiment.
Returns:
Favorita-specific column definition
"""
column_definition = self._column_definition
# Sanity checks first.
# Ensure only one ID and time column exist
def _check_single_column(input_type):
length = len([tup for tup in column_definition if tup[2] == input_type])
if length != 1:
raise ValueError('Illegal number of inputs ({}) of type {}'.format(
length, input_type))
_check_single_column(InputTypes.ID)
_check_single_column(InputTypes.TIME)
identifier = [tup for tup in column_definition if tup[2] == InputTypes.ID]
time = [tup for tup in column_definition if tup[2] == InputTypes.TIME]
real_inputs = [
tup for tup in column_definition if tup[1] == DataTypes.REAL_VALUED and
tup[2] not in {InputTypes.ID, InputTypes.TIME}
]
col_definition_map = {tup[0]: tup for tup in column_definition}
col_order = [
'item_nbr', 'store_nbr', 'city', 'state', 'type', 'cluster', 'family',
'class', 'perishable', 'onpromotion', 'day_of_week', 'national_hol',
'regional_hol', 'local_hol'
]
categorical_inputs = [
col_definition_map[k] for k in col_order if k in col_definition_map
]
return identifier + time + real_inputs + categorical_inputs

View File

@@ -0,0 +1,220 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Alpha158 dataset.
Defines dataset specific column definitions and data transformations.
"""
import data_formatters.base
import libs.utils as utils
import sklearn.preprocessing
GenericDataFormatter = data_formatters.base.GenericDataFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class Alpha158Formatter(GenericDataFormatter):
"""Defines and formats data for the Alpha158 dataset.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
('instrument', DataTypes.CATEGORICAL, InputTypes.ID),
('LABEL0', DataTypes.REAL_VALUED, InputTypes.TARGET),
('date', DataTypes.DATE, InputTypes.TIME),
('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
# Selected 10 features
('RESI5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('WVMA5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('RSQR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('KLEN', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('RSQR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('CORR5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('CORD5', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('CORR10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('ROC60', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('RESI10', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('const', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print('Formatting train-valid-test splits.')
index = df['year']
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
test = df.loc[index >= test_boundary]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df):
"""Calibrates scalers using the data supplied.
Args:
df: Data to use to calibrate scalers.
"""
print('Setting scalers with training data...')
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
column_definitions)
# Extract identifiers in case required
self.identifiers = list(df[id_column].unique())
# Format real scalers
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
data = df[real_inputs].values
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
df[[target_column]].values) # used for predictions
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_scalers = {}
num_classes = []
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str)
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
output = df.copy()
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError('Scalers have not been set!')
column_definitions = self.get_column_definition()
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
# Format real inputs
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
output = predictions.copy()
column_names = predictions.columns
for col in column_names:
if col not in {'forecast_time', 'identifier'}:
output[col] = self._target_scaler.inverse_transform(predictions[col])
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
'total_time_steps': 16 + 6,
'num_encoder_steps': 16,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5,
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
'dropout_rate': 0.3,
'hidden_layer_size': 160,
'learning_rate': 0.01,
'minibatch_size': 64,
'max_gradient_norm': 0.01,
'num_heads': 1,
'stack_size': 1
}
return model_params

View File

@@ -0,0 +1,117 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Traffic dataset.
Defines dataset specific column definitions and data transformations. This also
performs z-score normalization across the entire dataset, hence re-uses most of
the same functions as volatility.
"""
import data_formatters.base
import data_formatters.volatility
VolatilityFormatter = data_formatters.volatility.VolatilityFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class TrafficFormatter(VolatilityFormatter):
"""Defines and formats data for the traffic dataset.
This also performs z-score normalization across the entire dataset, hence
re-uses most of the same functions as volatility.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
('id', DataTypes.REAL_VALUED, InputTypes.ID),
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
('values', DataTypes.REAL_VALUED, InputTypes.TARGET),
('time_on_day', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def split_data(self, df, valid_boundary=151, test_boundary=166):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print('Formatting train-valid-test splits.')
index = df['sensor_day']
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
test = df.loc[index >= test_boundary - 7]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
'total_time_steps': 8 * 24,
'num_encoder_steps': 7 * 24,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
'dropout_rate': 0.3,
'hidden_layer_size': 320,
'learning_rate': 0.001,
'minibatch_size': 128,
'max_gradient_norm': 100.,
'num_heads': 4,
'stack_size': 1
}
return model_params
def get_num_samples_for_calibration(self):
"""Gets the default number of training and validation samples.
Use to sub-sample the data for network calibration and a value of -1 uses
all available samples.
Returns:
Tuple of (training samples, validation samples)
"""
return 450000, 50000

View File

@@ -0,0 +1,214 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Custom formatting functions for Volatility dataset.
Defines dataset specific column definitions and data transformations.
"""
import data_formatters.base
import libs.utils as utils
import sklearn.preprocessing
GenericDataFormatter = data_formatters.base.GenericDataFormatter
DataTypes = data_formatters.base.DataTypes
InputTypes = data_formatters.base.InputTypes
class VolatilityFormatter(GenericDataFormatter):
"""Defines and formats data for the volatility dataset.
Attributes:
column_definition: Defines input and data type of column used in the
experiment.
identifiers: Entity identifiers used in experiments.
"""
_column_definition = [
('Symbol', DataTypes.CATEGORICAL, InputTypes.ID),
('date', DataTypes.DATE, InputTypes.TIME),
('log_vol', DataTypes.REAL_VALUED, InputTypes.TARGET),
('open_to_close', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
('days_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
('day_of_week', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('day_of_month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('week_of_year', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('month', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
('Region', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
]
def __init__(self):
"""Initialises formatter."""
self.identifiers = None
self._real_scalers = None
self._cat_scalers = None
self._target_scaler = None
self._num_classes_per_cat_input = None
def split_data(self, df, valid_boundary=2016, test_boundary=2018):
"""Splits data frame into training-validation-test data frames.
This also calibrates scaling object, and transforms data for each split.
Args:
df: Source data frame to split.
valid_boundary: Starting year for validation data
test_boundary: Starting year for test data
Returns:
Tuple of transformed (train, valid, test) data.
"""
print('Formatting train-valid-test splits.')
index = df['year']
train = df.loc[index < valid_boundary]
valid = df.loc[(index >= valid_boundary) & (index < test_boundary)]
test = df.loc[index >= test_boundary]
self.set_scalers(train)
return (self.transform_inputs(data) for data in [train, valid, test])
def set_scalers(self, df):
"""Calibrates scalers using the data supplied.
Args:
df: Data to use to calibrate scalers.
"""
print('Setting scalers with training data...')
column_definitions = self.get_column_definition()
id_column = utils.get_single_col_by_input_type(InputTypes.ID,
column_definitions)
target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
column_definitions)
# Extract identifiers in case required
self.identifiers = list(df[id_column].unique())
# Format real scalers
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
data = df[real_inputs].values
self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
df[[target_column]].values) # used for predictions
# Format categorical scalers
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_scalers = {}
num_classes = []
for col in categorical_inputs:
# Set all to str so that we don't have mixed integer/string columns
srs = df[col].apply(str)
categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
srs.values)
num_classes.append(srs.nunique())
# Set categorical scaler outputs
self._cat_scalers = categorical_scalers
self._num_classes_per_cat_input = num_classes
def transform_inputs(self, df):
"""Performs feature transformations.
This includes both feature engineering, preprocessing and normalisation.
Args:
df: Data frame to transform.
Returns:
Transformed data frame.
"""
output = df.copy()
if self._real_scalers is None and self._cat_scalers is None:
raise ValueError('Scalers have not been set!')
column_definitions = self.get_column_definition()
real_inputs = utils.extract_cols_from_data_type(
DataTypes.REAL_VALUED, column_definitions,
{InputTypes.ID, InputTypes.TIME})
categorical_inputs = utils.extract_cols_from_data_type(
DataTypes.CATEGORICAL, column_definitions,
{InputTypes.ID, InputTypes.TIME})
# Format real inputs
output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
# Format categorical inputs
for col in categorical_inputs:
string_df = df[col].apply(str)
output[col] = self._cat_scalers[col].transform(string_df)
return output
def format_predictions(self, predictions):
"""Reverts any normalisation to give predictions in original scale.
Args:
predictions: Dataframe of model predictions.
Returns:
Data frame of unnormalised predictions.
"""
output = predictions.copy()
column_names = predictions.columns
for col in column_names:
if col not in {'forecast_time', 'identifier'}:
output[col] = self._target_scaler.inverse_transform(predictions[col])
return output
# Default params
def get_fixed_params(self):
"""Returns fixed model parameters for experiments."""
fixed_params = {
'total_time_steps': 252 + 5,
'num_encoder_steps': 252,
'num_epochs': 100,
'early_stopping_patience': 5,
'multiprocessing_workers': 5,
}
return fixed_params
def get_default_model_params(self):
"""Returns default optimised model parameters."""
model_params = {
'dropout_rate': 0.3,
'hidden_layer_size': 160,
'learning_rate': 0.01,
'minibatch_size': 64,
'max_gradient_norm': 0.01,
'num_heads': 1,
'stack_size': 1
}
return model_params

View File

@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,111 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Default configs for TFT experiments.
Contains the default output paths for data, serialised models and predictions
for the main experiments used in the publication.
"""
import os
import data_formatters.electricity
import data_formatters.favorita
import data_formatters.traffic
import data_formatters.volatility
import data_formatters.qlib_Alpha158
class ExperimentConfig(object):
"""Defines experiment configs and paths to outputs.
Attributes:
root_folder: Root folder to contain all experimental outputs.
experiment: Name of experiment to run.
data_folder: Folder to store data for experiment.
model_folder: Folder to store serialised models.
results_folder: Folder to store results.
data_csv_path: Path to primary data csv file used in experiment.
hyperparam_iterations: Default number of random search iterations for
experiment.
"""
default_experiments = ['volatility', 'electricity', 'traffic', 'favorita', 'Alpha158']
def __init__(self, experiment='volatility', root_folder=None):
"""Creates configs based on default experiment chosen.
Args:
experiment: Name of experiment.
root_folder: Root folder to save all outputs of training.
"""
if experiment not in self.default_experiments:
raise ValueError('Unrecognised experiment={}'.format(experiment))
# Defines all relevant paths
if root_folder is None:
root_folder = os.path.join(
os.path.dirname(os.path.realpath(__file__)), '..', 'outputs')
print('Using root folder {}'.format(root_folder))
self.root_folder = root_folder
self.experiment = experiment
self.data_folder = os.path.join(root_folder, 'data', experiment)
self.model_folder = os.path.join(root_folder, 'saved_models', experiment)
self.results_folder = os.path.join(root_folder, 'results', experiment)
# Creates folders if they don't exist
for relevant_directory in [
self.root_folder, self.data_folder, self.model_folder,
self.results_folder
]:
if not os.path.exists(relevant_directory):
os.makedirs(relevant_directory)
@property
def data_csv_path(self):
csv_map = {
'volatility': 'formatted_omi_vol.csv',
'electricity': 'hourly_electricity.csv',
'traffic': 'hourly_data.csv',
'favorita': 'favorita_consolidated.csv',
'Alpha158': 'Alpha158.csv',
}
return os.path.join(self.data_folder, csv_map[self.experiment])
@property
def hyperparam_iterations(self):
return 240 if self.experiment == 'volatility' else 60
def make_data_formatter(self):
"""Gets a data formatter object for experiment.
Returns:
Default DataFormatter per experiment.
"""
data_formatter_class = {
'volatility': data_formatters.volatility.VolatilityFormatter,
'electricity': data_formatters.electricity.ElectricityFormatter,
'traffic': data_formatters.traffic.TrafficFormatter,
'favorita': data_formatters.favorita.FavoritaFormatter,
'Alpha158': data_formatters.qlib_Alpha158.Alpha158Formatter,
}
return data_formatter_class[self.experiment]()

View File

@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -0,0 +1,438 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Classes used for hyperparameter optimisation.
Two main classes exist:
1) HyperparamOptManager used for optimisation on a single machine/GPU.
2) DistributedHyperparamOptManager for multiple GPUs on different machines.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import shutil
import libs.utils as utils
import numpy as np
import pandas as pd
Deque = collections.deque
class HyperparamOptManager:
"""Manages hyperparameter optimisation using random search for a single GPU.
Attributes:
param_ranges: Discrete hyperparameter range for random search.
results: Dataframe of validation results.
fixed_params: Fixed model parameters per experiment.
saved_params: Dataframe of parameters trained.
best_score: Minimum validation loss observed thus far.
optimal_name: Key to best configuration.
hyperparam_folder: Where to save optimisation outputs.
"""
def __init__(self,
param_ranges,
fixed_params,
model_folder,
override_w_fixed_params=True):
"""Instantiates model.
Args:
param_ranges: Discrete hyperparameter range for random search.
fixed_params: Fixed model parameters per experiment.
model_folder: Folder to store optimisation artifacts.
override_w_fixed_params: Whether to override serialsed fixed model
parameters with new supplied values.
"""
self.param_ranges = param_ranges
self._max_tries = 1000
self.results = pd.DataFrame()
self.fixed_params = fixed_params
self.saved_params = pd.DataFrame()
self.best_score = np.Inf
self.optimal_name = ""
# Setup
# Create folder for saving if its not there
self.hyperparam_folder = model_folder
utils.create_folder_if_not_exist(self.hyperparam_folder)
self._override_w_fixed_params = override_w_fixed_params
def load_results(self):
"""Loads results from previous hyperparameter optimisation.
Returns:
A boolean indicating if previous results can be loaded.
"""
print("Loading results from", self.hyperparam_folder)
results_file = os.path.join(self.hyperparam_folder, "results.csv")
params_file = os.path.join(self.hyperparam_folder, "params.csv")
if os.path.exists(results_file) and os.path.exists(params_file):
self.results = pd.read_csv(results_file, index_col=0)
self.saved_params = pd.read_csv(params_file, index_col=0)
if not self.results.empty:
self.results.at["loss"] = self.results.loc["loss"].apply(float)
self.best_score = self.results.loc["loss"].min()
is_optimal = self.results.loc["loss"] == self.best_score
self.optimal_name = self.results.T[is_optimal].index[0]
return True
return False
def _get_params_from_name(self, name):
"""Returns previously saved parameters given a key."""
params = self.saved_params
selected_params = dict(params[name])
if self._override_w_fixed_params:
for k in self.fixed_params:
selected_params[k] = self.fixed_params[k]
return selected_params
def get_best_params(self):
"""Returns the optimal hyperparameters thus far."""
optimal_name = self.optimal_name
return self._get_params_from_name(optimal_name)
def clear(self):
"""Clears all previous results and saved parameters."""
shutil.rmtree(self.hyperparam_folder)
os.makedirs(self.hyperparam_folder)
self.results = pd.DataFrame()
self.saved_params = pd.DataFrame()
def _check_params(self, params):
"""Checks that parameter map is properly defined."""
valid_fields = list(self.param_ranges.keys()) + list(
self.fixed_params.keys())
invalid_fields = [k for k in params if k not in valid_fields]
missing_fields = [k for k in valid_fields if k not in params]
if invalid_fields:
raise ValueError("Invalid Fields Found {} - Valid ones are {}".format(
invalid_fields, valid_fields))
if missing_fields:
raise ValueError("Missing Fields Found {} - Valid ones are {}".format(
missing_fields, valid_fields))
def _get_name(self, params):
"""Returns a unique key for the supplied set of params."""
self._check_params(params)
fields = list(params.keys())
fields.sort()
return "_".join([str(params[k]) for k in fields])
def get_next_parameters(self, ranges_to_skip=None):
"""Returns the next set of parameters to optimise.
Args:
ranges_to_skip: Explicitly defines a set of keys to skip.
"""
if ranges_to_skip is None:
ranges_to_skip = set(self.results.index)
if not isinstance(self.param_ranges, dict):
raise ValueError("Only works for random search!")
param_range_keys = list(self.param_ranges.keys())
param_range_keys.sort()
def _get_next():
"""Returns next hyperparameter set per try."""
parameters = {
k: np.random.choice(self.param_ranges[k]) for k in param_range_keys
}
# Adds fixed params
for k in self.fixed_params:
parameters[k] = self.fixed_params[k]
return parameters
for _ in range(self._max_tries):
parameters = _get_next()
name = self._get_name(parameters)
if name not in ranges_to_skip:
return parameters
raise ValueError("Exceeded max number of hyperparameter searches!!")
def update_score(self, parameters, loss, model, info=""):
"""Updates the results from last optimisation run.
Args:
parameters: Hyperparameters used in optimisation.
loss: Validation loss obtained.
model: Model to serialised if required.
info: Any ancillary information to tag on to results.
Returns:
Boolean flag indicating if the model is the best seen so far.
"""
if np.isnan(loss):
loss = np.Inf
if not os.path.isdir(self.hyperparam_folder):
os.makedirs(self.hyperparam_folder)
name = self._get_name(parameters)
is_optimal = self.results.empty or loss < self.best_score
# save the first model
if is_optimal:
# Try saving first, before updating info
if model is not None:
print("Optimal model found, updating")
model.save(self.hyperparam_folder)
self.best_score = loss
self.optimal_name = name
self.results[name] = pd.Series({"loss": loss, "info": info})
self.saved_params[name] = pd.Series(parameters)
self.results.to_csv(os.path.join(self.hyperparam_folder, "results.csv"))
self.saved_params.to_csv(os.path.join(self.hyperparam_folder, "params.csv"))
return is_optimal
class DistributedHyperparamOptManager(HyperparamOptManager):
"""Manages distributed hyperparameter optimisation across many gpus."""
def __init__(self,
param_ranges,
fixed_params,
root_model_folder,
worker_number,
search_iterations=1000,
num_iterations_per_worker=5,
clear_serialised_params=False):
"""Instantiates optimisation manager.
This hyperparameter optimisation pre-generates #search_iterations
hyperparameter combinations and serialises them
at the start. At runtime, each worker goes through their own set of
parameter ranges. The pregeneration
allows for multiple workers to run in parallel on different machines without
resulting in parameter overlaps.
Args:
param_ranges: Discrete hyperparameter range for random search.
fixed_params: Fixed model parameters per experiment.
root_model_folder: Folder to store optimisation artifacts.
worker_number: Worker index definining which set of hyperparameters to
test.
search_iterations: Maximum numer of random search iterations.
num_iterations_per_worker: How many iterations are handled per worker.
clear_serialised_params: Whether to regenerate hyperparameter
combinations.
"""
max_workers = int(np.ceil(search_iterations / num_iterations_per_worker))
# Sanity checks
if worker_number > max_workers:
raise ValueError(
"Worker number ({}) cannot be larger than the total number of workers!"
.format(max_workers))
if worker_number > search_iterations:
raise ValueError(
"Worker number ({}) cannot be larger than the max search iterations ({})!"
.format(worker_number, search_iterations))
print("*** Creating hyperparameter manager for worker {} ***".format(
worker_number))
hyperparam_folder = os.path.join(root_model_folder, str(worker_number))
super().__init__(
param_ranges,
fixed_params,
hyperparam_folder,
override_w_fixed_params=True)
serialised_ranges_folder = os.path.join(root_model_folder, "hyperparams")
if clear_serialised_params:
print("Regenerating hyperparameter list")
if os.path.exists(serialised_ranges_folder):
shutil.rmtree(serialised_ranges_folder)
utils.create_folder_if_not_exist(serialised_ranges_folder)
self.serialised_ranges_path = os.path.join(
serialised_ranges_folder, "ranges_{}.csv".format(search_iterations))
self.hyperparam_folder = hyperparam_folder # override
self.worker_num = worker_number
self.total_search_iterations = search_iterations
self.num_iterations_per_worker = num_iterations_per_worker
self.global_hyperparam_df = self.load_serialised_hyperparam_df()
self.worker_search_queue = self._get_worker_search_queue()
@property
def optimisation_completed(self):
return False if self.worker_search_queue else True
def get_next_parameters(self):
"""Returns next dictionary of hyperparameters to optimise."""
param_name = self.worker_search_queue.pop()
params = self.global_hyperparam_df.loc[param_name, :].to_dict()
# Always override!
for k in self.fixed_params:
print("Overriding saved {}: {}".format(k, self.fixed_params[k]))
params[k] = self.fixed_params[k]
return params
def load_serialised_hyperparam_df(self):
"""Loads serialsed hyperparameter ranges from file.
Returns:
DataFrame containing hyperparameter combinations.
"""
print("Loading params for {} search iterations form {}".format(
self.total_search_iterations, self.serialised_ranges_path))
if os.path.exists(self.serialised_ranges_folder):
df = pd.read_csv(self.serialised_ranges_path, index_col=0)
else:
print("Unable to load - regenerating serach ranges instead")
df = self.update_serialised_hyperparam_df()
return df
def update_serialised_hyperparam_df(self):
"""Regenerates hyperparameter combinations and saves to file.
Returns:
DataFrame containing hyperparameter combinations.
"""
search_df = self._generate_full_hyperparam_df()
print("Serialising params for {} search iterations to {}".format(
self.total_search_iterations, self.serialised_ranges_path))
search_df.to_csv(self.serialised_ranges_path)
return search_df
def _generate_full_hyperparam_df(self):
"""Generates actual hyperparameter combinations.
Returns:
DataFrame containing hyperparameter combinations.
"""
np.random.seed(131) # for reproducibility of hyperparam list
name_list = []
param_list = []
for _ in range(self.total_search_iterations):
params = super().get_next_parameters(name_list)
name = self._get_name(params)
name_list.append(name)
param_list.append(params)
full_search_df = pd.DataFrame(param_list, index=name_list)
return full_search_df
def clear(self): # reset when cleared
"""Clears results for hyperparameter manager and resets."""
super().clear()
self.worker_search_queue = self._get_worker_search_queue()
def load_results(self):
"""Load results from file and queue parameter combinations to try.
Returns:
Boolean indicating if results were successfully loaded.
"""
success = super().load_results()
if success:
self.worker_search_queue = self._get_worker_search_queue()
return success
def _get_worker_search_queue(self):
"""Generates the queue of param combinations for current worker.
Returns:
Queue of hyperparameter combinations outstanding.
"""
global_df = self.assign_worker_numbers(self.global_hyperparam_df)
worker_df = global_df[global_df["worker"] == self.worker_num]
left_overs = [s for s in worker_df.index if s not in self.results.columns]
return Deque(left_overs)
def assign_worker_numbers(self, df):
"""Updates parameter combinations with the index of the worker used.
Args:
df: DataFrame of parameter combinations.
Returns:
Updated DataFrame with worker number.
"""
output = df.copy()
n = self.total_search_iterations
batch_size = self.num_iterations_per_worker
max_worker_num = int(np.ceil(n / batch_size))
worker_idx = np.concatenate([
np.tile(i + 1, self.num_iterations_per_worker)
for i in range(max_worker_num)
])
output["worker"] = worker_idx[:len(output)]
return output

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,236 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Generic helper functions used across codebase."""
import os
import pathlib
import numpy as np
import tensorflow as tf
from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
# Generic.
def get_single_col_by_input_type(input_type, column_definition):
"""Returns name of single column.
Args:
input_type: Input type of column to extract
column_definition: Column definition list for experiment
"""
l = [tup[0] for tup in column_definition if tup[2] == input_type]
if len(l) != 1:
raise ValueError('Invalid number of columns for {}'.format(input_type))
return l[0]
def extract_cols_from_data_type(data_type, column_definition,
excluded_input_types):
"""Extracts the names of columns that correspond to a define data_type.
Args:
data_type: DataType of columns to extract.
column_definition: Column definition to use.
excluded_input_types: Set of input types to exclude
Returns:
List of names for columns with data type specified.
"""
return [
tup[0]
for tup in column_definition
if tup[1] == data_type and tup[2] not in excluded_input_types
]
# Loss functions.
def tensorflow_quantile_loss(y, y_pred, quantile):
"""Computes quantile loss for tensorflow.
Standard quantile loss as defined in the "Training Procedure" section of
the main TFT paper
Args:
y: Targets
y_pred: Predictions
quantile: Quantile to use for loss calculations (between 0 & 1)
Returns:
Tensor for quantile loss.
"""
# Checks quantile
if quantile < 0 or quantile > 1:
raise ValueError(
'Illegal quantile value={}! Values should be between 0 and 1.'.format(
quantile))
prediction_underflow = y - y_pred
q_loss = quantile * tf.maximum(prediction_underflow, 0.) + (
1. - quantile) * tf.maximum(-prediction_underflow, 0.)
return tf.reduce_sum(q_loss, axis=-1)
def numpy_normalised_quantile_loss(y, y_pred, quantile):
"""Computes normalised quantile loss for numpy arrays.
Uses the q-Risk metric as defined in the "Training Procedure" section of the
main TFT paper.
Args:
y: Targets
y_pred: Predictions
quantile: Quantile to use for loss calculations (between 0 & 1)
Returns:
Float for normalised quantile loss.
"""
prediction_underflow = y - y_pred
weighted_errors = quantile * np.maximum(prediction_underflow, 0.) \
+ (1. - quantile) * np.maximum(-prediction_underflow, 0.)
quantile_loss = weighted_errors.mean()
normaliser = y.abs().mean()
return 2 * quantile_loss / normaliser
# OS related functions.
def create_folder_if_not_exist(directory):
"""Creates folder if it doesn't exist.
Args:
directory: Folder path to create.
"""
# Also creates directories recursively
pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
# Tensorflow related functions.
def get_default_tensorflow_config(tf_device='gpu', gpu_id=0):
"""Creates tensorflow config for graphs to run on CPU or GPU.
Specifies whether to run graph on gpu or cpu and which GPU ID to use for multi
GPU machines.
Args:
tf_device: 'cpu' or 'gpu'
gpu_id: GPU ID to use if relevant
Returns:
Tensorflow config.
"""
if tf_device == 'cpu':
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # for training on cpu
tf_config = tf.ConfigProto(
log_device_placement=False, device_count={'GPU': 0})
else:
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
print('Selecting GPU ID={}'.format(gpu_id))
tf_config = tf.ConfigProto(log_device_placement=False)
tf_config.gpu_options.allow_growth = True
return tf_config
def save(tf_session, model_folder, cp_name, scope=None):
"""Saves Tensorflow graph to checkpoint.
Saves all trainiable variables under a given variable scope to checkpoint.
Args:
tf_session: Session containing graph
model_folder: Folder to save models
cp_name: Name of Tensorflow checkpoint
scope: Variable scope containing variables to save
"""
# Save model
if scope is None:
saver = tf.train.Saver()
else:
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
save_path = saver.save(tf_session,
os.path.join(model_folder, '{0}.ckpt'.format(cp_name)))
print('Model saved to: {0}'.format(save_path))
def load(tf_session, model_folder, cp_name, scope=None, verbose=False):
"""Loads Tensorflow graph from checkpoint.
Args:
tf_session: Session to load graph into
model_folder: Folder containing serialised model
cp_name: Name of Tensorflow checkpoint
scope: Variable scope to use.
verbose: Whether to print additional debugging information.
"""
# Load model proper
load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
print('Loading model from {0}'.format(load_path))
print_weights_in_checkpoint(model_folder, cp_name)
initial_vars = set(
[v.name for v in tf.get_default_graph().as_graph_def().node])
# Saver
if scope is None:
saver = tf.train.Saver()
else:
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
saver = tf.train.Saver(var_list=var_list, max_to_keep=100000)
# Load
saver.restore(tf_session, load_path)
all_vars = set([v.name for v in tf.get_default_graph().as_graph_def().node])
if verbose:
print('Restored {0}'.format(','.join(initial_vars.difference(all_vars))))
print('Existing {0}'.format(','.join(all_vars.difference(initial_vars))))
print('All {0}'.format(','.join(all_vars)))
print('Done.')
def print_weights_in_checkpoint(model_folder, cp_name):
"""Prints all weights in Tensorflow checkpoint.
Args:
model_folder: Folder containing checkpoint
cp_name: Name of checkpoint
Returns:
"""
load_path = os.path.join(model_folder, '{0}.ckpt'.format(cp_name))
print_tensors_in_checkpoint_file(
file_name=load_path,
tensor_name='',
all_tensors=True,
all_tensor_names=True)

View File

@@ -0,0 +1,246 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
import data_formatters.base
import expt_settings.configs
import libs.hyperparam_opt
import libs.tft_model
import libs.utils as utils
import os
import datetime as dte
from qlib.model.base import ModelFT
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
# To register new datasets, please add them here.
ALLOW_DATASET = ['Alpha158']
DATASET_SETTING = {
'Alpha158': {
'feature_col': ['RESI5', 'WVMA5', 'RSQR5', 'KLEN', 'RSQR10', 'CORR5', 'CORD5', 'CORR10', 'ROC60', 'RESI10'],
'label_col': ['LABEL0'],
},
}
# To register new datasets, please add their configurations here.
def get_shifted_label(data_df, shifts=5, col_shift='LABEL0'):
return data_df[[col_shift]].groupby('instrument').apply(lambda df: df.shift(shifts))
def fill_test_na(test_df):
test_df_res = test_df.copy()
feature_cols = ~test_df_res.columns.str.contains('label', case=False)
test_feature_fna = test_df_res.loc[:, feature_cols].groupby('datetime').apply(lambda df: df.fillna(df.mean()))
test_df_res.loc[:, feature_cols] = test_feature_fna
return test_df_res
def process_qlib_data(df, dataset, fillna=False):
"""Prepare data to fit the TFT model.
Args:
df: Original DataFrame.
fillna: Whether to fill the data with the mean values.
Returns:
Transformed DataFrame.
"""
# Several features selected manually
feature_col = DATASET_SETTING[dataset]['feature_col']
label_col = DATASET_SETTING[dataset]['label_col']
temp_df = df.loc[:, feature_col+label_col]
if fillna:
temp_df = fill_test_na(temp_df)
temp_df = temp_df.swaplevel()
temp_df = temp_df.sort_index()
temp_df = temp_df.reset_index(level=0)
dates = pd.to_datetime(temp_df.index)
temp_df['date'] = dates
temp_df['day_of_week'] = dates.dayofweek
temp_df['month'] = dates.month
temp_df['year'] = dates.year
temp_df['const'] = 1.0
return temp_df
def process_predicted(df, col_name):
"""Transform the TFT predicted data into Qlib format.
Args:
df: Original DataFrame.
fillna: New column name.
Returns:
Transformed DataFrame.
"""
df_res = df.copy()
df_res = df_res.rename(columns={"forecast_time": "datetime", "identifier": "instrument", "t+0": col_name})
df_res = df_res.set_index(['datetime','instrument']).sort_index()
df_res = df_res[[col_name]]
return df_res
def format_score(forecast_df, col_name='pred', label_shift=5):
pred = process_predicted(forecast_df, col_name=col_name)
pred = get_shifted_label(pred, shifts=-label_shift, col_shift=col_name)
pred = pred.dropna()[col_name]
return pred
def transform_df(df, col_name='LABEL0'):
df_res = df['feature']
df_res[col_name] = df['label']
return df_res
class TFTModel(ModelFT):
"""TFT Model"""
def __init__(self, **kwargs):
self.model = None
def _prepare_data(self, dataset: DatasetH):
df_train, df_valid = dataset.prepare(
["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
)
return transform_df(df_train), transform_df(df_valid)
def fit(
self,
dataset: DatasetH,
DATASET = 'Alpha158',
MODEL_FOLDER = 'qlib_alpha158_model',
LABEL_COL = 'LABEL0',
LABEL_SHIFT = 5,
USE_GPU_ID = 0,
**kwargs
):
if DATASET not in ALLOW_DATASET:
raise AssertionError("The dataset is not supported, please make a new formatter to fit this dataset")
dtrain, dvalid = self._prepare_data(dataset)
dtrain.loc[:, LABEL_COL] = get_shifted_label(dtrain, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
dvalid.loc[:, LABEL_COL] = get_shifted_label(dvalid, shifts=LABEL_SHIFT, col_shift=LABEL_COL)
train = process_qlib_data(dtrain, DATASET, fillna=True).dropna()
valid = process_qlib_data(dvalid, DATASET, fillna=True).dropna()
ExperimentConfig = expt_settings.configs.ExperimentConfig
config = ExperimentConfig(DATASET)
self.data_formatter = config.make_data_formatter()
self.model_folder = MODEL_FOLDER
self.gpu_id = USE_GPU_ID
self.label_shift = LABEL_SHIFT
self.expt_name = DATASET
self.label_col = LABEL_COL
use_gpu = (True, self.gpu_id)
#===========================Training Process===========================
ModelClass = libs.tft_model.TemporalFusionTransformer
if not isinstance(self.data_formatter, data_formatters.base.GenericDataFormatter):
raise ValueError(
"Data formatters should inherit from" +
"AbstractDataFormatter! Type={}".format(type(self.data_formatter)))
default_keras_session = tf.keras.backend.get_session()
if use_gpu[0]:
self.tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id=use_gpu[1])
else:
self.tf_config = utils.get_default_tensorflow_config(tf_device="cpu")
self.data_formatter.set_scalers(train)
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
# Wendi: 合并调优的参数和非调优的参数
params = {**params, **fixed_params}
if not os.path.exists(self.model_folder):
os.makedirs(self.model_folder)
params['model_folder'] = self.model_folder
print("*** Begin training ***")
best_loss = np.Inf
tf.reset_default_graph()
self.tf_graph = tf.Graph()
with self.tf_graph.as_default():
self.sess = tf.Session(config=self.tf_config)
tf.keras.backend.set_session(self.sess)
self.model = ModelClass(params, use_cudnn=use_gpu[0])
self.sess.run(tf.global_variables_initializer())
self.model.fit(train_df=train, valid_df=valid)
print("*** Finished training ***")
saved_model_dir = self.model_folder+'/'+'saved_model'
if not os.path.exists(saved_model_dir):
os.makedirs(saved_model_dir)
self.model.save(saved_model_dir)
def extract_numerical_data(data):
"""Strips out forecast time and identifier columns."""
return data[[
col for col in data.columns
if col not in {"forecast_time", "identifier"}
]]
#p50_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p50_forecast),
# 0.5)
#p90_loss = utils.numpy_normalised_quantile_loss(
# extract_numerical_data(targets), extract_numerical_data(p90_forecast),
# 0.9)
tf.keras.backend.set_session(default_keras_session)
print("Training completed.".format(dte.datetime.now()))
#===========================Training Process===========================
def predict(self, dataset):
if self.model is None:
raise ValueError("model is not fitted yet!")
d_test = dataset.prepare("test", col_set=["feature", "label"])
d_test = transform_df(d_test)
d_test.loc[:, self.label_col] = get_shifted_label(d_test, shifts=self.label_shift, col_shift=self.label_col)
test = process_qlib_data(d_test, self.expt_name, fillna=True).dropna()
use_gpu = (True, self.gpu_id)
#===========================Predicting Process===========================
default_keras_session = tf.keras.backend.get_session()
# Sets up default params
fixed_params = self.data_formatter.get_experiment_params()
params = self.data_formatter.get_default_model_params()
params = {**params, **fixed_params}
print("*** Begin predicting ***")
tf.reset_default_graph()
with self.tf_graph.as_default():
tf.keras.backend.set_session(self.sess)
output_map = self.model.predict(test, return_targets=True)
targets = self.data_formatter.format_predictions(output_map["targets"])
p50_forecast = self.data_formatter.format_predictions(output_map["p50"])
p90_forecast = self.data_formatter.format_predictions(output_map["p90"])
tf.keras.backend.set_session(default_keras_session)
predict = format_score(p90_forecast, 'pred', self.label_shift)
label = format_score(targets, 'label', self.label_shift)
#===========================Predicting Process===========================
return predict, label
def finetune(self, dataset: DatasetH):
"""
finetune model
Parameters
----------
dataset : DatasetH
dataset for finetuning
"""
pass

View File

@@ -0,0 +1,130 @@
#Copyright (c) Microsoft Corporation.
#Licensed under the MIT License.
import sys
from pathlib import Path
import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.pytorch_lstm import LSTM
from qlib.contrib.data.handler import ALPHA360_Denoise
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
from qlib.contrib.evaluate import (
backtest as normal_backtest,
risk_analysis,
)
from qlib.utils import exists_qlib_data
# from qlib.model.learner import train_model
from qlib.utils import init_instance_by_config
import pickle
from tft import TFTModel
if __name__ == "__main__":
# use default data
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
if not exists_qlib_data(provider_uri):
print(f"Qlib data is not found in {provider_uri}")
sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
from get_data import GetData
GetData().qlib_data_cn(target_dir=provider_uri)
qlib.init(provider_uri=provider_uri, region=REG_CN)
MARKET = "csi300"
BENCHMARK = "SH000300"
###################################
# train model
###################################
DATA_HANDLER_CONFIG = {
"start_time": "2008-01-01",
"end_time": "2020-08-01",
"fit_start_time": "2008-01-01",
"fit_end_time": "2014-12-31",
"instruments": MARKET,
}
TRAINER_CONFIG = {
"train_start_time": "2008-01-01",
"train_end_time": "2014-12-31",
"validate_start_time": "2015-01-01",
"validate_end_time": "2016-12-31",
"test_start_time": "2017-01-01",
"test_end_time": "2020-08-01",
}
task = {
"dataset": {
"class": "DatasetH",
"module_path": "qlib.data.dataset",
"kwargs": {
'handler': {
"class": "Alpha158",
"module_path": "qlib.contrib.data.handler",
"kwargs": DATA_HANDLER_CONFIG
},
'segments': {
'train': ("2008-01-01", "2014-12-31"),
'valid': ("2015-01-01", "2016-12-31",),
'test': ("2017-01-01", "2020-08-01",),
}
}
}
# You shoud record the data in specific sequence
# "record": ['SignalRecord', 'SigAnaRecord', 'PortAnaRecord'],
}
model = TFTModel()
dataset = init_instance_by_config(task["dataset"])
model.fit(dataset)
pred_score, label_score = model.predict(dataset)
# save pred_score to file
pred_score_path = Path("~/tmp/qlib/pred_score.pkl").expanduser()
pred_score_path.parent.mkdir(exist_ok=True, parents=True)
pred_score.to_pickle(pred_score_path)
###################################
# backtest
###################################
STRATEGY_CONFIG = {
"topk": 50,
"n_drop": 5,
}
BACKTEST_CONFIG = {
"verbose": False,
"limit_threshold": 0.095,
"account": 100000000,
"benchmark": BENCHMARK,
"deal_price": "close",
"open_cost": 0.0005,
"close_cost": 0.0015,
"min_cost": 5,
}
# use default strategy
# custom Strategy, refer to: TODO: Strategy API url
strategy = TopkDropoutStrategy(**STRATEGY_CONFIG)
report_normal, positions_normal = normal_backtest(pred_score, strategy=strategy, **BACKTEST_CONFIG)
###################################
# analyze
# If need a more detailed analysis, refer to: examples/train_and_bakctest.ipynb
###################################
analysis = dict()
analysis["excess_return_without_cost"] = risk_analysis(report_normal["return"] - report_normal["bench"])
analysis["excess_return_with_cost"] = risk_analysis(
report_normal["return"] - report_normal["bench"] - report_normal["cost"]
)
analysis_df = pd.concat(analysis) # type: pd.DataFrame
print(analysis_df)