-
-
Notifications
You must be signed in to change notification settings - Fork 211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Tarfiles #27
base: master
Are you sure you want to change the base?
Tarfiles #27
Changes from 5 commits
9f3eb7b
0414355
289747d
303832c
da2490b
8fb3f03
395e5e4
fa69973
abbbfec
2ba6f5d
cacc3ad
ecbf302
04f1a32
90129ab
0aa52a7
349babb
6999bd3
842ddf8
37afa4e
bbd4007
34daddb
727d9ff
d2ae7c0
41b50c4
48d8170
5c4117e
2773c69
7ffc252
68a71ae
ec9777b
29498a6
3674357
ccec9ae
c091734
e56a565
1dde4b7
1628db4
124467c
28816e5
ce38e57
281935c
9c2c9d2
7c6aebb
cb97e70
950ae96
300c9d7
218fa75
b22cead
ae09bc9
30c043b
bfdebf2
0f7ea51
71c7a50
1715b9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,8 @@ | |
from torchvision import transforms | ||
from tqdm import tqdm | ||
import numpy as np | ||
from io import BytesIO | ||
import tarfile | ||
import os,sys,os.path | ||
import pandas as pd | ||
import pickle | ||
|
@@ -195,7 +197,24 @@ def __len__(self): | |
def __getitem__(self, idx): | ||
return self.dataset[self.idxs[idx]] | ||
|
||
|
||
|
||
class TarDataset(Dataset): | ||
def __init__(self, imgpath): | ||
if imgpath.endswith(".tar"): | ||
self.tarred = tarfile.open(imgpath) | ||
self.tar_paths = self.tarred.getmembers() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So you said this takes a lot of time. I see how this approach is super robust. I did some tests for time and it seems like just 30 seconds for the MIMIC data. What about caching this using a dict based on the file path? It would speed things up if multiple objects are created. But it seems like a reasonable price to pay. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, using a dict, the second load is around 10x faster on my machine. The dict could also be pickled so there is only one slow load. That option would lead to issues if someone wanted to change the tarfile, although I don't know why they would do that. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Doing hashing and caching a file could be nice, but it could also be annoying to debug and create more issues (for example, if there are no write permissions). |
||
else: | ||
self.tarred = None | ||
def get_image(self, path): | ||
if self.tarred is None: | ||
return imread(os.path.join(self.imgpath, path)) | ||
else: | ||
for tar_path in self.tar_paths: | ||
name = tar_path.name | ||
if name.endswith(path): | ||
bytes = self.tarred.extractfile(name).read() | ||
return np.array(Image.open(BytesIO(bytes))) | ||
|
||
class NIH_Dataset(Dataset): | ||
""" | ||
NIH ChestX-ray8 dataset | ||
|
@@ -293,7 +312,7 @@ def __getitem__(self, idx): | |
|
||
|
||
imgid = self.csv['Image Index'].iloc[idx] | ||
img_path = os.path.join(self.imgpath, imgid) | ||
#img_path = os.path.join(self.imgpath, imgid) | ||
#print(img_path) | ||
img = imread(img_path) | ||
if self.normalize: | ||
|
@@ -838,7 +857,7 @@ def __getitem__(self, idx): | |
|
||
return {"img":img, "lab":self.labels[idx], "idx":idx} | ||
|
||
class MIMIC_Dataset(Dataset): | ||
class MIMIC_Dataset(TarDataset): | ||
""" | ||
Johnson AE, Pollard TJ, Berkowitz S, Greenbaum NR, Lungren MP, Deng CY, Mark RG, Horng S. MIMIC-CXR: A large publicly available database of labeled chest radiographs. arXiv preprint arXiv:1901.07042. 2019 Jan 21. | ||
|
||
|
@@ -850,7 +869,7 @@ class MIMIC_Dataset(Dataset): | |
def __init__(self, imgpath, csvpath,metacsvpath, views=["PA"], transform=None, data_aug=None, | ||
flat_dir=True, seed=0, unique_patients=True): | ||
|
||
super(MIMIC_Dataset, self).__init__() | ||
super(MIMIC_Dataset, self).__init__(imgpath) | ||
np.random.seed(seed) # Reset the seed so all runs are the same. | ||
self.MAXVAL = 255 | ||
|
||
|
@@ -877,8 +896,10 @@ def __init__(self, imgpath, csvpath,metacsvpath, views=["PA"], transform=None, d | |
self.csv = pd.read_csv(self.csvpath) | ||
self.metacsvpath = metacsvpath | ||
self.metacsv = pd.read_csv(self.metacsvpath) | ||
|
||
|
||
|
||
self.csv = self.csv.set_index(['subject_id', 'study_id']) | ||
|
||
self.metacsv = self.metacsv.set_index(['subject_id', 'study_id']) | ||
|
||
self.csv = self.csv.join(self.metacsv).reset_index() | ||
|
@@ -926,9 +947,9 @@ def __getitem__(self, idx): | |
studyid = str(self.csv.iloc[idx]["study_id"]) | ||
dicom_id = str(self.csv.iloc[idx]["dicom_id"]) | ||
|
||
img_path = os.path.join(self.imgpath, "p" + subjectid[:2], "p" + subjectid, "s" + studyid, dicom_id + ".jpg") | ||
img = imread(img_path) | ||
img = normalize(img, self.MAXVAL) | ||
img_fname = os.path.join("p" + subjectid[:2], "p" + subjectid, "s" + studyid, dicom_id + ".jpg") | ||
img = self.get_image(img_fname) | ||
img = normalize(img, self.MAXVAL) | ||
|
||
# Check that images are 2D arrays | ||
if len(img.shape) > 2: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
What about using `tarfile.is_tarfile`? This would allow for compressed files, which people may want to use.
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Good idea, thanks