Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the model CSDI #208

Merged
merged 2 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pypots/imputation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .saits import SAITS
from .transformer import Transformer
from .usgan import USGAN
from .csdi import CSDI

__all__ = [
"SAITS",
Expand All @@ -21,4 +22,5 @@
"LOCF",
"GPVAE",
"USGAN",
"CSDI",
]
12 changes: 12 additions & 0 deletions pypots/imputation/csdi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""

"""

# Created by Wenjie Du <[email protected]>
# License: GPL-v3

from .model import CSDI

# Names exported by this package on `from ... import *`; only the CSDI model
# class imported above is listed.
__all__ = [
"CSDI",
]
148 changes: 148 additions & 0 deletions pypots/imputation/csdi/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""

"""

# Created by Wenjie Du <[email protected]>
# License: GPL-v3

from typing import Union, Iterable

import torch
from pycorruptor import mcar

from ...data.base import BaseDataset


class DatasetForCSDI(BaseDataset):
    """Dataset for CSDI model.

    Every fetched sample is passed through ``mcar`` so that a portion of its
    values is artificially masked, and is then packed into the list layout
    built by :meth:`_assemble_sample`.

    Parameters
    ----------
    data : dict or str,
        The dataset itself (a dict) or the path of a data file (a str).
        A dict may additionally carry the optional per-sample entries
        ``"time_points"``, ``"for_pattern_mask"`` and ``"cut_length"``.

    return_labels : bool, default = True,
        Whether to append the label to each sample when labels are available.

    file_type : str, default = "h5py",
        Forwarded unchanged to ``BaseDataset``.

    rate : float, default = 0.1,
        Forwarded to ``mcar`` as its ``rate`` argument for every sample.
    """

    def __init__(
        self,
        data: Union[dict, str],
        return_labels: bool = True,
        file_type: str = "h5py",
        rate: float = 0.1,
    ):
        super().__init__(data, return_labels, file_type)
        self.rate = rate

        if isinstance(data, str):
            # ``data`` is a file path: it has no ``keys()``, so the optional
            # arrays cannot be looked up here and fall back to their defaults.
            # TODO(review): read them lazily from the file handle if the file
            # schema is meant to carry them.
            self.time_points = None
            self.for_pattern_mask = None
            self.cut_length = None
        else:
            time_points = data["time_points"] if "time_points" in data.keys() else None
            _, self.time_points = self._check_input(self.X, time_points)
            for_pattern_mask = (
                data["for_pattern_mask"] if "for_pattern_mask" in data.keys() else None
            )
            _, self.for_pattern_mask = self._check_input(self.X, for_pattern_mask)
            cut_length = data["cut_length"] if "cut_length" in data.keys() else None
            _, self.cut_length = self._check_input(self.X, cut_length)

    def _assemble_sample(self, idx: int, X: torch.Tensor) -> list:
        """Mask one sample with ``mcar`` and build the list fed to the model.

        Parameters
        ----------
        idx : int,
            The index of the sample, also used to pick the matching entry of
            the optional per-sample arrays.

        X : tensor,
            The float32 time-series of the sample.

        Returns
        -------
        sample : list,
            A list contains

            index : int tensor,
                The index of the sample.

            observed_data : tensor,
                The first value returned by ``mcar`` (``X_intact``).

            observed_mask : tensor,
                The sum of the missing mask and the indicating mask returned
                by ``mcar``.

            observed_tp : float32 tensor,
                The time points of the sample if they were given, otherwise
                ``0, 1, ..., n_steps - 1``.

            gt_mask : tensor,
                The indicating mask returned by ``mcar``.

            for_pattern_mask : tensor,
                The given pattern mask of the sample if any, otherwise ``gt_mask``.

            cut_length : tensor,
                The given cut length of the sample if any, otherwise a long
                tensor of zeros with ``len(observed_data)`` entries.
        """
        # the corrupted copy of X returned by mcar is not needed here
        X_intact, _, missing_mask, indicating_mask = mcar(X, rate=self.rate)

        observed_data = X_intact
        observed_mask = missing_mask + indicating_mask

        if self.time_points is None:
            observed_tp = torch.arange(0, self.n_steps, dtype=torch.float32)
        else:
            observed_tp = self.time_points[idx].to(torch.float32)

        # NOTE(review): gt_mask is the mask of the artificially removed values;
        # confirm this is the convention the CSDI model expects.
        gt_mask = indicating_mask

        if self.for_pattern_mask is None:
            for_pattern_mask = gt_mask
        else:
            for_pattern_mask = self.for_pattern_mask[idx]

        if self.cut_length is None:
            cut_length = torch.zeros(len(observed_data)).long()
        else:
            cut_length = self.cut_length[idx]

        return [
            torch.tensor(idx),
            observed_data,
            observed_mask,
            observed_tp,
            gt_mask,
            for_pattern_mask,
            cut_length,
        ]

    def _fetch_data_from_array(self, idx: int) -> Iterable:
        """Fetch data according to index.

        Parameters
        ----------
        idx : int,
            The index to fetch the specified sample.

        Returns
        -------
        sample : list,
            The list built by ``_assemble_sample`` (index, observed_data,
            observed_mask, observed_tp, gt_mask, for_pattern_mask, cut_length),
            followed by the label as a long tensor when labels exist and
            ``return_labels`` is set.
        """
        X = self.X[idx].to(torch.float32)
        sample = self._assemble_sample(idx, X)

        if self.y is not None and self.return_labels:
            sample.append(self.y[idx].to(torch.long))

        return sample

    def _fetch_data_from_file(self, idx: int) -> Iterable:
        """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples.
        Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice.

        Parameters
        ----------
        idx : int,
            The index of the sample to be return.

        Returns
        -------
        sample : list,
            The collated data sample, laid out exactly like the one returned
            by ``_fetch_data_from_array``.
        """
        # open the file on first access only and keep the handle for later calls
        if self.file_handle is None:
            self.file_handle = self._open_file_handle()

        X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32)
        sample = self._assemble_sample(idx, X)

        # if the dataset has labels and is for training, then fetch it from the file
        if "y" in self.file_handle.keys() and self.return_labels:
            sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long))

        return sample
Loading