Source code for carate.loader.load_data

"""
Data loading for the standard datasets implemented in the pytorch_geometric
library. The dataset loader is implemented as a base class; its subclasses include loaders for standardized benchmarks
as well as custom datasets.

:author: Julian M. Kleber
"""
from typing import Type, Optional, List
from abc import ABC, abstractclassmethod, abstractmethod

import torch
import torch_geometric
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MoleculeNet, TUDataset

import rdkit as rdkit

from carate.default_interface import DefaultObject

import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger when this module is imported: records of level
# DEBUG and above go to the file "carate.log" (UTF-8 encoded), formatted as
# "<timestamp> <message>".
# NOTE(review): this is an import-time side effect on the process-wide root
# logger; the ``encoding`` argument of ``basicConfig`` needs Python 3.9+ -
# confirm that matches the supported interpreter versions.
logging.basicConfig(
    filename="carate.log",
    encoding="utf-8",
    level=logging.DEBUG,
    format="%(asctime)s %(message)s",
)


class DatasetObject(ABC, DefaultObject, torch.utils.data.Dataset):
    """
    Interface for data loading objects.

    Concrete loaders must provide their own ``__init__``, ``load_data`` and
    ``__repr__``; the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(
        self,
        dataset_name: str,
        dataset_save_path: str,
        test_ratio: int,
        batch_size: int,
        shuffle: bool,
        custom_size: Optional[int],
    ) -> None:
        """
        Constructor placeholder that documents the expected arguments.

        :param dataset_name:str: Name of the dataset to load.
        :param dataset_save_path:str: Path handed to the dataset class.
        :param test_ratio:int: Controls the train/test split.
        :param batch_size:int: Batch size used for the data loaders.
        :param shuffle:bool: Whether the dataset is shuffled before splitting.
        :param custom_size:Optional[int]: Optional cap on the number of samples.
        :raises NotImplementedError: Always; subclasses define their own constructor.
        """
        raise NotImplementedError  # pragma: no cover

    # ``abstractclassmethod`` is deprecated since Python 3.3 and would turn this
    # into a classmethod even though it takes ``self``; a plain
    # ``abstractmethod`` expresses the intended instance-level contract.
    @abstractmethod
    def load_data(
        self,
        dataset_name: str,
        dataset_save_path: str,
        test_ratio: int,
        batch_size: int,
        shuffle: bool,
        custom_size: Optional[int] = None,
    ) -> None:
        """
        Load the dataset and prepare it for training and testing.

        NOTE(review): annotated as returning ``None``, while the concrete
        implementation in this module returns datasets and loaders - confirm
        the intended return contract.

        :param dataset_name:str: Name of the dataset to load.
        :param dataset_save_path:str: Path handed to the dataset class.
        :param test_ratio:int: Controls the train/test split.
        :param batch_size:int: Batch size used for the data loaders.
        :param shuffle:bool: Whether the dataset is shuffled before splitting.
        :param custom_size:Optional[int]: Optional cap on the number of samples.
        :raises NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError  # pragma: no cover

    @abstractmethod
    def __repr__(self):
        """Return a short name identifying the loader; must be overridden."""
        raise NotImplementedError  # pragma: no cover
class StandardPytorchGeometricDataset(DatasetObject):
    """
    Shared loading and splitting logic for datasets provided by PyTorch Geometric.

    Subclasses set the ``DataSet`` class attribute to the dataset class that
    ``load_data`` instantiates as ``DataSet(dataset_save_path, name=dataset_name)``.
    """

    # Dataset class instantiated by ``load_data``; assigned by subclasses.
    DataSet: torch.utils.data.Dataset

    def load_data(
        self,
        dataset_name: str,
        test_ratio: int,
        dataset_save_path: str,
        batch_size: int = 64,
        shuffle: bool = True,
        custom_size: Optional[int] = None,
    ) -> tuple[torch.utils.data.DataLoader | torch.utils.data.Dataset, ...]:
        """
        Load a dataset, split it into a test and a training part and build a
        data loader for each part.

        :param dataset_name:str: Name of the dataset, passed to ``DataSet`` as ``name``.
        :param test_ratio:int: Integer divisor for the split; see ``make_split``.
        :param dataset_save_path:str: Path passed to ``DataSet`` as first argument.
        :param batch_size:int: Batch size for both data loaders.
        :param shuffle:bool: If true, ``shuffle()`` is called on the dataset before splitting.
        :param custom_size:Optional[int]: If given, only the first ``custom_size`` samples are used.
        :return: The tuple ``(test_dataset, train_dataset, test_loader, train_loader, dataset)``,
            where ``dataset`` is the full (optionally shuffled) dataset before truncation.
        :doc-author: Julian M. Kleber
        """
        # Build the dataset once; shuffling is an optional extra step.
        dataset = self.DataSet(dataset_save_path, name=dataset_name)
        if shuffle:
            dataset = dataset.shuffle()

        test_dataset, train_dataset, test_loader, train_loader = self.make_split(
            dataset=dataset,
            test_ratio=test_ratio,
            batch_size=batch_size,
            custom_size=custom_size,
        )
        # Log through the module logger and report the name of the dataset that
        # was actually loaded (the argument), not an instance attribute.
        logger.info(
            "%s has num_features: %s and classes: %s",
            dataset_name,
            train_dataset.num_features,
            train_dataset.num_classes,
        )
        return test_dataset, train_dataset, test_loader, train_loader, dataset

    def make_split(
        self,
        dataset: torch.utils.data.Dataset,
        test_ratio: int,
        batch_size: int,
        custom_size: Optional[int] = None,
    ) -> List[torch.utils.data.DataLoader | torch.utils.data.Dataset]:
        """
        Split a dataset into a leading test part and a trailing training part.

        The dataset is first truncated to ``custom_size`` samples (all samples
        when ``custom_size`` is ``None``). The first ``len(dataset) // test_ratio``
        samples form the test set and the remainder the training set, so
        ``test_ratio`` acts as a divisor (e.g. 10 keeps roughly one tenth for
        testing), not as a percentage.

        :param dataset: Sliceable dataset to split.
        :param test_ratio:int: Integer divisor that determines the test set size.
        :param batch_size:int: Batch size for both data loaders.
        :param custom_size:Optional[int]: Optional cap on the number of samples used.
        :return: The list ``[test_dataset, train_dataset, test_loader, train_loader]``.
        :raises ZeroDivisionError: If ``test_ratio`` is 0.
        """
        if custom_size is None:
            custom_size = len(dataset)
        dataset = dataset[: int(custom_size)]  # make sure the size is an int

        # Compute the split index once and reuse it for both slices.
        n_test = len(dataset) // test_ratio
        test_dataset = dataset[:n_test]
        train_dataset = dataset[n_test:]

        test_loader = DataLoader(test_dataset, batch_size=batch_size)
        train_loader = DataLoader(train_dataset, batch_size=batch_size)
        return [test_dataset, train_dataset, test_loader, train_loader]
class StandardDatasetMoleculeNet(StandardPytorchGeometricDataset):
    """
    Implementation of the Dataset interface with focus on the models implemented in pytorch_geometric
    and provided by the MoleculeNet collection of datasets.
    """

    DataSet = MoleculeNet

    def __init__(
        self,
        dataset_save_path: str,
        dataset_name: str,
        test_ratio: int,
        batch_size: int,
        shuffle: bool = True,
        custom_size: Optional[int] = None,
    ) -> None:
        """
        Store the loader configuration on the instance.

        :param dataset_save_path:str: Path of the dataset.
        :param dataset_name:str: Name of the data set.
        :param test_ratio:int: Used to split the data set into a training and testing set.
        :param batch_size:int: Batch size.
        :param shuffle:bool: Whether the data set should be shuffled.
        :param custom_size:Optional[int]: Optional cap on the number of samples.
        :doc-author: Julian M. Kleber
        """
        # The base-class constructor only raises NotImplementedError, so it is
        # deliberately not called here.
        self.dataset_save_path = dataset_save_path
        self.dataset_name = dataset_name
        self.test_ratio = test_ratio
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Previously accepted but discarded; keep it so callers can read it back.
        self.custom_size = custom_size

    def __repr__(self) -> str:
        """Return the fixed identifier of this loader."""
        return "StandardMoleculeNet"
class StandardDatasetTUDataset(StandardPytorchGeometricDataset):
    """
    Class for loading standard datasets from the TU Dataset collection implemented
    by PyTorch Geometric.

    author: Julian M. Kleber
    """

    DataSet = TUDataset

    def __init__(
        self,
        dataset_save_path: str,
        dataset_name: str,
        test_ratio: int,
        batch_size: int,
        shuffle: bool = True,
        custom_size: Optional[int] = None,
    ) -> None:
        """
        Store the loader configuration on the instance.

        :param dataset_save_path:str: Path of the dataset.
        :param dataset_name:str: Name of the data set.
        :param test_ratio:int: Used to split the data set into a training and testing set.
        :param batch_size:int: Batch size.
        :param shuffle:bool: Whether the data set should be shuffled.
        :param custom_size:Optional[int]: Optional cap on the number of samples;
            defaults to ``None`` so existing callers are unaffected.
        :doc-author: Julian M. Kleber
        """
        # The base-class constructor only raises NotImplementedError, so it is
        # deliberately not called here.
        self.dataset_save_path = dataset_save_path
        self.dataset_name = dataset_name
        self.test_ratio = test_ratio
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Accepted for parity with the DatasetObject constructor signature.
        self.custom_size = custom_size

    def __repr__(self) -> str:
        """Return the fixed identifier of this loader."""
        return "StandardTUDataset"