Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the bug of tensors not on the same device when running on CUDA device #59

Merged
merged 10 commits into from
Apr 19, 2023
Merged
Prev Previous commit
Next Next commit
fix: map data to the proper device in functions _assemble_input_for*;
  • Loading branch information
WenjieDu committed Apr 19, 2023
commit 1a22fd872e6d2f97801d945e56ba2b5bc641989b
2 changes: 1 addition & 1 deletion pypots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Created by Wenjie Du <[email protected]>
# License: GPL-v3

from .__version__ import version as __version__
from pypots.__version__ import version as __version__

__all__ = [
"data",
Expand Down
70 changes: 55 additions & 15 deletions pypots/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Base class for main models in PyPOTS.
The base (abstract) classes for models in PyPOTS.
"""

# Created by Wenjie Du <[email protected]>
Expand All @@ -17,17 +17,34 @@


class BaseModel(ABC):
"""Base model class for all model implementations.
"""The base model class for all model implementations.

Parameters
----------
device : str or `torch.device`, default = None,
The device for the model to run on.
If not given, will try to use CUDA devices first, then CPUs. CUDA and CPU are so far the main devices for people
to train ML models. Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
If not given, will try to use CUDA devices first (will use the GPU with device number 0 only by default),
then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.

tb_file_saving_path : str, default = None,
The path to save the tensorboard file, which contains the loss values recorded during training.
The path to save the training logs (i.e. loss values recorded during training) into a tensorboard file.
Will not save if not given.

Attributes
----------
model : object, default = None,
The underlying model or algorithm to finish the task.

summary_writer : None or torch.utils.tensorboard.SummaryWriter, default = None,
The event writer to save training logs. Default as None. It only works when parameter `tb_file_saving_path` is
given, otherwise the training events won't be saved.

It is designed to be set up while initializing the model because it is created to
1) help visualize the model's training procedure (during training, not after) and
2) assist users in tuning the model's hyper-parameters.
If it were only set up after training with a function like setter(), it could not achieve the 1st purpose.

"""

def __init__(
Expand All @@ -36,6 +53,8 @@ def __init__(
tb_file_saving_path: str = None,
):
self.model = None
self.summary_writer = None
self.device = None

# set up the device for model running below
if device is None:
Expand All @@ -57,6 +76,7 @@ def __init__(
)

# set up the summary writer for training log saving below
# initialize self.summary_writer if tb_file_saving_path is given and not None, otherwise don't save the log
if isinstance(tb_file_saving_path, str):

from datetime import datetime
Expand All @@ -73,28 +93,29 @@ def __init__(
self.summary_writer = SummaryWriter(
actual_tb_file_saving_path, filename_suffix=".pypots"
)
else:
# don't save the log if tb_file_saving_path isn't given, set summary_writer as None
self.summary_writer = None

def save_log_into_tb_file(self, step: int, stage: str, loss_dict: dict) -> None:
"""Saving training logs into the tensorboard file.
"""Saving training logs into the tensorboard file specified by the given path `tb_file_saving_path`.

Parameters
----------
step : int,
The current training step number.
One step for one batch processing, so the number of steps means how many batches the model has processed.

stage : str,
The stage of the current operation, 'training' or 'validating'.
The stage of the current operation, e.g. 'pretraining', 'training', 'validating'.

loss_dict : dict,
A dictionary containing items to log, should have at least one item, e.g. {'imputation loss': 0.05}
A dictionary containing items to log. It should have at least one item, and only items whose names
include "loss" or "error" will be logged, e.g. {'imputation_loss': 0.05, "classification_error": 0.32}.

"""
while len(loss_dict) > 0:
(item_name, loss) = loss_dict.popitem()
if "loss" in item_name: # save all items containing word "loss" in the name
# save all items containing "loss" or "error" in the name
# WDU: may enable customization keywords in the future
if ("loss" in item_name) or ("error" in item_name):
self.summary_writer.add_scalar(f"{stage}/{item_name}", loss, step)

def save_model(
Expand All @@ -103,7 +124,7 @@ def save_model(
file_name: str,
overwrite: bool = False,
) -> None:
"""Save the model to a disk file.
"""Save the model with current parameters to a disk file.

A .pypots extension will be appended to the filename if it does not already have one.
Please note that such an extension is not necessary, but to indicate the saved model is from PyPOTS framework
Expand Down Expand Up @@ -138,7 +159,9 @@ def save_model(
torch.save(self.model, saving_path)
logger.info(f"Saved successfully to {saving_path}.")
except Exception as e:
raise RuntimeError(f'{e} Failed to save the model to "{saving_path}"!')
raise RuntimeError(
f'Failed to save the model to "{saving_path}" because of the below error! \n{e}'
)

def load_model(self, model_path: str) -> None:
"""Load the saved model from a disk file.
Expand Down Expand Up @@ -166,7 +189,7 @@ def load_model(self, model_path: str) -> None:


class BaseNNModel(BaseModel):
"""Abstract class for all neural-network models.
"""The abstract class for all neural-network models.

Parameters
----------
Expand Down Expand Up @@ -197,6 +220,22 @@ class BaseNNModel(BaseModel):

tb_file_saving_path : str, default = None,
The path to save the tensorboard file, which contains the loss values recorded during training.


Attributes
----------
optimizer : torch.optim.Optimizer, default = None,
The optimizer used to back-propagate losses for model optimization. Default as None, will be implemented
when the concrete implementation model gets initialized.

best_model_dict : dict, default = None,
A dictionary containing the trained model that achieves the best performance according to the defined loss,
i.e. the lowest loss.

best_loss : float, default = inf,
The criterion used to judge whether the model's performance is the best so far.
Usually the lower, the better.

"""

def __init__(
Expand Down Expand Up @@ -224,6 +263,7 @@ def __init__(
self.model = None
self.optimizer = None
self.best_model_dict = None
# WDU: may enable users to customize the criteria in the future
self.best_loss = float("inf")

def _print_model_size(self) -> None:
Expand Down
10 changes: 8 additions & 2 deletions pypots/classification/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
The base class for classification models.
The base classes for PyPOTS classification models.
"""

# Created by Wenjie Du <[email protected]>
Expand All @@ -18,7 +18,13 @@


class BaseClassifier(BaseModel):
"""Abstract class for all classification models."""
"""The abstract class for all PyPOTS classification models.
Parameters
---
device
tb_file_saving_path

"""

def __init__(
self,
Expand Down
4 changes: 2 additions & 2 deletions pypots/classification/brits.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def _assemble_input_for_training(self, data: dict) -> dict:
back_missing_mask,
back_deltas,
label,
) = data
) = map(lambda x: x.to(self.device), data)

# assemble input data
inputs = {
Expand Down Expand Up @@ -278,7 +278,7 @@ def _assemble_input_for_testing(self, data: dict) -> dict:
back_X,
back_missing_mask,
back_deltas,
) = data
) = map(lambda x: x.to(self.device), data)

# assemble input data
inputs = {
Expand Down
8 changes: 6 additions & 2 deletions pypots/classification/grud.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ def _assemble_input_for_training(self, data: dict) -> dict:
A dictionary with data assembled.
"""
# fetch data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean, label = data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean, label = map(
lambda x: x.to(self.device), data
)

# assemble input data
inputs = {
Expand Down Expand Up @@ -232,7 +234,9 @@ def _assemble_input_for_testing(self, data: dict) -> dict:
inputs : dict,
A python dictionary contains the input data for model testing.
"""
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean = data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean = map(
lambda x: x.to(self.device), data
)

inputs = {
"indices": indices,
Expand Down
8 changes: 6 additions & 2 deletions pypots/classification/raindrop.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,7 +690,9 @@ def _assemble_input_for_training(self, data: dict) -> dict:
A dictionary with data assembled.
"""
# fetch data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean, label = data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean, label = map(
lambda x: x.to(self.device), data
)

bz, n_steps, n_features = X.shape
lengths = torch.tensor([n_steps] * bz, dtype=torch.float)
Expand Down Expand Up @@ -743,7 +745,9 @@ def _assemble_input_for_testing(self, data: dict) -> dict:
inputs : dict,
A python dictionary contains the input data for model testing.
"""
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean = data
indices, X, X_filledLOCF, missing_mask, deltas, empirical_mean = map(
lambda x: x.to(self.device), data
)
bz, n_steps, n_features = X.shape
lengths = torch.tensor([n_steps] * bz, dtype=torch.float)
times = torch.tensor(range(n_steps), dtype=torch.float).repeat(bz, 1)
Expand Down
2 changes: 1 addition & 1 deletion pypots/clustering/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
The base class for clustering models.
The base classes for PyPOTS clustering models.
"""

# Created by Wenjie Du <[email protected]>
Expand Down
2 changes: 1 addition & 1 deletion pypots/clustering/crli.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def _assemble_input_for_training(self, data: list) -> dict:
"""

# fetch data
indices, X, _, missing_mask, _, _ = data
indices, X, _, missing_mask, _, _ = map(lambda x: x.to(self.device), data)

inputs = {
"X": X,
Expand Down
2 changes: 1 addition & 1 deletion pypots/clustering/vader.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def _assemble_input_for_training(self, data: list) -> dict:
"""

# fetch data
indices, X, _, missing_mask, _, _ = data
indices, X, _, missing_mask, _, _ = map(lambda x: x.to(self.device), data)

inputs = {
"X": X,
Expand Down
2 changes: 1 addition & 1 deletion pypots/data/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Utilities for data manipulation
The base class for PyPOTS datasets.
"""

# Created by Wenjie Du <[email protected]>
Expand Down
10 changes: 5 additions & 5 deletions pypots/data/dataset_for_mit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ class DatasetForMIT(BaseDataset):
"""

def __init__(
self,
data: Union[dict, str],
file_type: str = "h5py",
rate: float = 0.2,
self,
data: Union[dict, str],
file_type: str = "h5py",
rate: float = 0.2,
):
super().__init__(data, file_type)
self.rate = rate
Expand Down Expand Up @@ -124,7 +124,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable:
]

if (
"y" in self.file_handle.keys()
"y" in self.file_handle.keys()
): # if the dataset has labels, then fetch it from the file
sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long))

Expand Down
2 changes: 1 addition & 1 deletion pypots/forecasting/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
The base class for forecasting models.
The base classes for PyPOTS forecasting models.
"""

# Created by Wenjie Du <[email protected]>
Expand Down
4 changes: 2 additions & 2 deletions pypots/imputation/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
The base class for imputation models.
The base class for PyPOTS imputation models.
"""

# Created by Wenjie Du <[email protected]>
Expand Down Expand Up @@ -258,7 +258,7 @@ def _train_model(
imputation_collector.append(imputed_data)

imputation_collector = torch.cat(imputation_collector)
imputation_collector = imputation_collector.numpy()
imputation_collector = imputation_collector.cpu().detach().numpy()

mean_val_loss = cal_mae(
imputation_collector,
Expand Down
4 changes: 3 additions & 1 deletion pypots/imputation/brits.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,9 @@ def _assemble_input_for_training(self, data: list) -> dict:
"""

# fetch data
indices, X, missing_mask, deltas, back_X, back_missing_mask, back_deltas = data
indices, X, missing_mask, deltas, back_X, back_missing_mask, back_deltas = map(
lambda x: x.to(self.device), data
)

# assemble input data
inputs = {
Expand Down
6 changes: 4 additions & 2 deletions pypots/imputation/saits.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ def _assemble_input_for_training(self, data: list) -> dict:
A python dictionary contains the input data for model training.
"""

indices, X_intact, X, missing_mask, indicating_mask = data
indices, X_intact, X, missing_mask, indicating_mask = map(
lambda x: x.to(self.device), data
)

inputs = {
"X": X,
Expand Down Expand Up @@ -275,7 +277,7 @@ def _assemble_input_for_validating(self, data) -> dict:
inputs : dict,
A python dictionary contains the input data for model validating.
"""
indices, X, missing_mask = data
indices, X, missing_mask = map(lambda x: x.to(self.device), data)

inputs = {
"X": X,
Expand Down
6 changes: 4 additions & 2 deletions pypots/imputation/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,9 @@ def _assemble_input_for_training(self, data: dict) -> dict:
A python dictionary contains the input data for model training.
"""

indices, X_intact, X, missing_mask, indicating_mask = data
indices, X_intact, X, missing_mask, indicating_mask = map(
lambda x: x.to(self.device), data
)

inputs = {
"X": X,
Expand Down Expand Up @@ -385,7 +387,7 @@ def _assemble_input_for_validating(self, data: list) -> dict:
inputs : dict,
A python dictionary contains the input data for model validating.
"""
indices, X, missing_mask = data
indices, X, missing_mask = map(lambda x: x.to(self.device), data)

inputs = {
"X": X,
Expand Down