"""
Evaulation object for classification
"""
import torch
import numpy as np
import numpy.typing as npt
from typing import Type, Any, Tuple, Dict, Optional
import datetime
from carate.evaluation.base import Evaluation
from carate.loader.load_data import DatasetObject
from carate.utils.model_files import save_model_parameters
from carate.models.base_model import Model
import logging
[docs]
class RegressionEvaluation(Evaluation):
"""
Module that implements the Regression evaluation
"""
def __init__(
self,
dataset_name: str,
dataset_save_path: str,
result_save_dir: str,
model_net: Model,
optimizer: torch.optim.Optimizer,
data_set: DatasetObject,
device: torch.device,
logger: Any,
resume: bool,
test_ratio: int,
num_epoch: int = 150,
num_cv: int = 5,
num_classes: int = 2,
out_dir: str = r"./out",
batch_size: int = 64,
shuffle: bool = True,
model_save_freq: int = 100,
override: bool = True,
normalize: bool = False,
custom_size: Optional[int] = None,
):
"""
The __init__ function is called when the class is instantiated.
It sets up all the parameters needed for training and testing.
:param self: Used to Refer to the current object.
:param dataset_name:str: Used to Name the dataset.
:param dataset_save_path:str: Used to Save the dataset object
to a file.
:param result_save_dir:str: Used to Save the results of the
cross validation.
:param model_net:Model: Used to Specify the model architecture.
:param optimizer:torch.optim.Optimizer: Used to Define the optimizer
used for training.
:param data_set:DatasetObject: Used to Pass the dataset object
to the class.
:param test_ratio:int: Used to Determine the ratio of test data to
training data.
:param num_epoch:int=150: Used to Set the number of epochs to train for.
:param num_cv:int=5: Used to Set the number of cross-validation folds.
:param num_classes:int=2: Used to Set the number of classes in the dataset.
:param out_dir:str=r"./out": Used to Define the directory where
all results are saved.
:param batch_size:int=64: Used to Set the batch size of the training data.
:param shuffle:bool=True: Used to Shuffle the data set before splitting
it into training and test sets.
:param model_save_freq:int=100: Used to Save the model every 100 epochs.
:param override:bool=True: Used to Override the results if they already exist.
:param : Used to Set the number of epochs for training.
:return: The object itself, which is then assigned to the variable "model_trainer".
:doc-author: Julian M. Kleber
"""
self.dataset_name = dataset_name
self.dataset_save_path = dataset_save_path
self.test_ratio = test_ratio
self.num_epoch = num_epoch
self.model_net = model_net
self.optimizer = optimizer
self.num_classes = num_classes
self.num_cv = num_cv
self.out_dir = out_dir
self.data_set = data_set
self.logger = logger
self.batch_size = batch_size
self.shuffle = shuffle
self.device = device
self.result_save_dir = result_save_dir
self.model_save_freq = model_save_freq
self.override = override
self.resume = resume
self.normalize = normalize
self.custom_size = custom_size
[docs]
def cv(
self,
num_cv: int,
num_epoch: int,
num_classes: int,
dataset_name: str,
dataset_save_path: str,
test_ratio: int,
resume: bool,
data_set: DatasetObject,
logger:Any,
shuffle: bool,
batch_size: int,
model_net: Model,
optimizer: torch.optim.Optimizer,
device: torch.device,
result_save_dir: str,
model_save_freq: int,
override: bool = True,
normalize: bool = True,
custom_size: Optional[int] = None,
) -> Dict[str, Any]:
# initialize
(
num_cv,
num_epoch,
num_classes,
dataset_name,
dataset_save_path,
test_ratio,
resume,
data_set,
logger,
shuffle,
batch_size,
model_net,
optimizer,
device,
result_save_dir,
model_save_freq,
override,
normalize,
custom_size,
) = self._get_defaults(locals())
# data container
result = {}
save_model_parameters(model_net=model_net, save_dir=result_save_dir)
logger.logger.info("Starting " + str(num_cv) + " CVs for "+ dataset_name +
" with data_stored in " + dataset_save_path )
for i in range(num_cv):
tmp = {}
test_mse = []
train_mae = []
train_mse = []
logger.logger.info("Starting CV "+str(i))
loaded_dataset: torch.utils.data.Dataset
(
test_dataset,
train_dataset,
test_loader,
train_loader,
loaded_dataset,
) = data_set.load_data(
dataset_name=dataset_name,
dataset_save_path=dataset_save_path,
test_ratio=test_ratio,
batch_size=batch_size,
shuffle=shuffle,
custom_size=custom_size,
)
del train_dataset, test_dataset
if normalize is True:
norm_factor = self.__normalization_factor(
data_set=loaded_dataset, num_classes=num_classes
)
else:
norm_factor = 1.0
for epoch in range(1, num_epoch + 1):
train_mae_loss, train_loss = self.train(
model_net=model_net,
epoch=epoch,
optimizer=optimizer,
train_loader=train_loader,
device=device,
norm_factor=norm_factor,
num_classes=num_classes,
)
train_mae_val, train_mse_val = self.test(
model_net=model_net,
test_loader=train_loader,
device=device,
norm_factor=norm_factor,
epoch=epoch,
)
test_mae_val, test_mse_val = self.test(
model_net=model_net,
test_loader=test_loader,
device=device,
norm_factor=norm_factor,
epoch=epoch,
)
train_mae.append(train_mae_val)
train_mse.append(train_mse_val)
test_mse.append(test_mae_val)
test_mse.append(test_mse_val)
logger.log({
"Epoch": epoch,
"Train_Loss": "{:16f}".format(train_loss),
"Train_MAE_Loss": "{:16f}".format(train_mae_loss),
"Train_MAE": "{:16f}".format(train_mae_val),
"Train_MSE": "{:16f}".format(train_mse_val),
"Test_MAE": "{:16f}".format(test_mae_val),
"Test_MSE": "{:16f}".format(test_mse_val),
})
torch.cuda.empty_cache()
tmp["MAE Train"] = list(train_mae)
tmp["MSE Train"] = list(train_mse)
tmp["MAE Test"] = list(train_mae)
tmp["MSE Test"] = list(train_mse)
if epoch % model_save_freq == 0:
self.save_whole_checkpoint(
result_save_dir=result_save_dir,
dataset_name=dataset_name,
num_cv=i,
num_epoch=epoch,
model_net=model_net,
data=tmp,
optimizer=optimizer,
loss=train_mae_loss,
override=override,
)
result[str(i)] = tmp
return result
[docs]
def train(
self,
epoch: int,
model_net: Model,
device: torch.device,
train_loader: torch.utils.data.DataLoader,
optimizer: torch.optim.Optimizer,
num_classes: int,
**kwargs: Any,
) -> float:
norm_factor: npt.NDArray[np.float64] = kwargs["norm_factor"]
model_net.train()
mse = 0
for data in train_loader:
data.x = data.x.type(torch.FloatTensor)
y_prev = data.y.clone().detach()
if type(norm_factor) == float:
y_prev_div= torch.div(y_prev, norm_factor)
else:
y_prev_div= torch.div(y_prev, torch.from_numpy(norm_factor))
data.y = y_prev_div
data.y = torch.nan_to_num(data.y.type(torch.FloatTensor))
data = data.to(device)
optimizer.zero_grad()
output_probs = model_net(data.x, data.edge_index, data.batch)
loss = torch.nn.MSELoss()
if len(data.y.size()) == 1:
loss = loss(output_probs, data.y[:])
else:
loss = loss(output_probs, data.y[:, :])
mse += loss.item()
loss.backward()
optimizer.step()
torch.cuda.empty_cache()
del y_prev, y_prev_div
return mse / len(train_loader), loss
[docs]
def test(
self,
test_loader: Type[torch.utils.data.DataLoader],
epoch: int,
model_net: Model,
device: torch.device,
**kwargs: Any,
) -> Tuple[float, float]:
norm_factor: npt.NDArray[np.float64] = kwargs["norm_factor"]
model_net.eval()
mae = 0
mse = 0
for data in test_loader:
data.x = data.x.type(torch.FloatTensor)
data.y = data.y / norm_factor
data.y = torch.nan_to_num(data.y.type(torch.FloatTensor))
data = data.to(device)
output_probs = model_net(data.x, data.edge_index, data.batch)
loss_mae = torch.nn.L1Loss()
mae += loss_mae(output_probs, data.y).item()
loss = torch.nn.MSELoss()
loss = loss(output_probs, data.y)
mse += loss.item()
torch.cuda.empty_cache()
return mae / len(test_loader), mse / len(test_loader)
def __normalization_factor(
self, data_set: Any, num_classes: int
) -> npt.NDArray[np.float64]:
"""
The __normalization_factor function is used to calculate the normalization factor
for each class.The normalization factor is calculated by taking the L2 norm of all of the
labels in a given class.
This function returns an array containing one value for each class, where each value is
equal to the L2 norm of all labels belonging to that particular class.
:param self: Used to Refer to the object itself.
:param data_set:Any: Used to Pass in the data set that we want to normalize.
:param num_classes:int: Used to Specify the number of classes in the data set.
:return: A vector of length num_classes,.
:doc-author: Trelent
"""
y = np.zeros((len(data_set), 1, num_classes))
for i in range(len(data_set)):
y[i, :, :] = data_set[i].y
norm_factor = np.zeros((num_classes))
for i in range(num_classes):
norm = np.linalg.norm(y[:, 0, i], ord=2)
norm_factor[i] = norm
return norm_factor
def __repr__(self) -> str:
return "Regression Evaluation Object"
def __str(self) ->str:
return "Regression Evaluation Object"