Tarfiles #27

Open
bganglia wants to merge 54 commits into master
Changes from 1 commit
Commits (54)
9f3eb7b
Open MIMIC from tarfile
bganglia Aug 3, 2020
0414355
Merge branch 'master' of https://github.com/ieee8023/torchxrayvision …
bganglia Aug 6, 2020
289747d
Merge branch 'master' of https://github.com/ieee8023/torchxrayvision …
bganglia Aug 6, 2020
303832c
revert whitespace
bganglia Aug 8, 2020
da2490b
don't use get_image() in NIH_Dataset
bganglia Aug 8, 2020
8fb3f03
NIH_Dataset extends TarDataset
bganglia Aug 8, 2020
395e5e4
Store tarfiles in dictionary
bganglia Aug 8, 2020
fa69973
use getnames instead of getmembers
bganglia Aug 8, 2020
abbbfec
use O(n) method for determining imgid from tar_path
bganglia Aug 9, 2020
2ba6f5d
random data in MIMIC format
bganglia Aug 9, 2020
cacc3ad
script for generating random MIMIC data
bganglia Aug 9, 2020
ecbf302
track random MIMIC data
bganglia Aug 9, 2020
04f1a32
tarfile test using random MIMIC data
bganglia Aug 9, 2020
90129ab
fix test directory
bganglia Aug 9, 2020
0aa52a7
use .close() on tarfile and regenerate test directory
bganglia Aug 9, 2020
349babb
support for tarfiles in NIH dataset
bganglia Aug 9, 2020
6999bd3
Inherit from TarDataset in PC_Dataset
bganglia Aug 10, 2020
842ddf8
Storage-agnostic dataset
bganglia Aug 10, 2020
37afa4e
Inherit from storage agnostic loader
bganglia Aug 10, 2020
bbd4007
tidy up tarfile code
bganglia Aug 10, 2020
34daddb
remove previous TarDataset, ZipDataset classes
bganglia Aug 10, 2020
727d9ff
Scripts for generating test data
bganglia Aug 13, 2020
d2ae7c0
Test data
bganglia Aug 13, 2020
41b50c4
Tests for zip, tar in MIMIC, NIH, and PC
bganglia Aug 13, 2020
48d8170
clean up storage classes
bganglia Aug 13, 2020
5c4117e
save progress
bganglia Aug 26, 2020
2773c69
inherit from Dataset in NIH_Dataset
bganglia Aug 26, 2020
7ffc252
Add code for automated tests with script-generated data
bganglia Aug 26, 2020
68a71ae
script for writing random data
bganglia Aug 26, 2020
ec9777b
fall back on .index() instead of trying to load a cached version in .…
bganglia Aug 26, 2020
29498a6
support multiprocessing
bganglia Aug 27, 2020
3674357
Clean up new code for tests and format interfaces
bganglia Aug 27, 2020
ccec9ae
write partial metadata files with subset of columns
bganglia Aug 27, 2020
c091734
Improve caching
bganglia Aug 27, 2020
e56a565
fix tests
bganglia Aug 28, 2020
1dde4b7
fix error in data-generation script
bganglia Aug 28, 2020
1628db4
create .torchxrayvision if it does not already exist
bganglia Aug 28, 2020
124467c
fix line adding .torchxrayvision
bganglia Aug 28, 2020
28816e5
Commit sample data for testing NLM_TB datasets, instead of auto-gener…
bganglia Aug 28, 2020
ce38e57
Commit covid test cases
bganglia Aug 28, 2020
281935c
Include parallel tests again
bganglia Aug 28, 2020
9c2c9d2
trycatch on reading/writing stored_mappings, with disk_unwriteable_ou…
bganglia Aug 28, 2020
7c6aebb
work when .torchxrayvision is not writeable
bganglia Aug 28, 2020
cb97e70
remove some print statements
bganglia Aug 28, 2020
950ae96
add test simulating an unwriteable disk
bganglia Aug 28, 2020
300c9d7
use filesystem instead of dictionary
bganglia Aug 28, 2020
218fa75
rewrite data generation scripts as python, not bash scripts; add para…
bganglia Aug 30, 2020
b22cead
cleanup: better variable names and use blake2b instead of hash (works…
bganglia Aug 31, 2020
ae09bc9
Add test for asserting a dataset loads faster the second time
bganglia Aug 31, 2020
30c043b
Don't invoke duration test, to avoid spurious errors
bganglia Aug 31, 2020
bfdebf2
Call on new data generation script
bganglia Aug 31, 2020
0f7ea51
simplify and improve documentation
bganglia Sep 5, 2020
71c7a50
reorganize
bganglia Sep 19, 2020
1715b9d
Fix path length in CheX_Dataset
bganglia Sep 19, 2020
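
Taken together, the commits above make imgpath storage-agnostic: the same dataset classes read from a plain folder, a tarfile, or a zipfile. A hypothetical usage sketch, assuming the branch's API matches the commit messages (the archive path is a placeholder, and tarfile support exists only on this branch, not upstream master):

import torchxrayvision as xrv

# On this branch, imgpath may be a directory, a .tar archive, or a
# .zip archive; the dataset selects the matching storage Interface
# (FolderInterface, TarInterface, ZipInterface).
d = xrv.datasets.NIH_Dataset(imgpath="NIH/images-224.tar")
sample = d[0]               # the image is extracted from the archive on demand
print(sample["img"].shape)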
support multiprocessing
bganglia committed Aug 27, 2020
commit 29498a6f32d45c969eb138b3fd79cb630eb117b8
48 changes: 32 additions & 16 deletions torchxrayvision/datasets.py
@@ -2,14 +2,15 @@
 from os.path import join
 from skimage.io import imread, imsave
 from torch import nn
+from copy import copy
 from torch.nn.modules.linear import Linear
 from torch.utils.data import Dataset
 from torchvision import transforms
 from tqdm import tqdm
 import numpy as np
+import multiprocessing
 from io import BytesIO
 import os,sys,os.path
-import pdb
 import pandas as pd
 import pickle
 import pydicom
@@ -242,15 +243,20 @@ def convert_to_image(self, filename, bytes):
         if str(filename).endswith(".dcm"):
             return pydicom.filereader.dcmread(BytesIO(bytes), force=True).pixel_array
         else:
-            return np.array(Image.open(BytesIO(bytes)))
+            out = np.array(Image.open(BytesIO(bytes)))
+            print(type(out))
+            print(out)
+            return out

 class TarInterface(Interface):
     @classmethod
     def matches(cls, filename):
         return not os.path.isdir(filename) and tarfile.is_tarfile(filename)
-    def __init__(self, filename, path_length):
+    def __init__(self, imgpath, path_length):
         self.path_length = path_length
-        self.compressed, self.filename_mapping = self.load_dataset(filename)
+        self.imgpath = imgpath
+        compressed, self.filename_mapping = self.load_dataset(imgpath)
+        self.compressed = {multiprocessing.current_process():compressed}
     def get_image(self, imgid):
         archive_path = self.filename_mapping[imgid]
         return self.extract_from_file(archive_path)
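
The new __init__ seeds a dictionary of archive handles keyed by multiprocessing.current_process(). The key works because each DataLoader or Pool worker is a distinct Process object, so every worker lands in its own slot; a small standalone illustration (not part of the PR):

import multiprocessing

def worker_name(_):
    # Each pool worker sees a different current_process() object,
    # so it can serve as a per-worker dictionary key.
    return multiprocessing.current_process().name

if __name__ == "__main__":
    with multiprocessing.Pool(processes=2) as pool:
        names = set(pool.map(worker_name, range(8)))
    print(sorted(names))  # e.g. ['ForkPoolWorker-1', 'ForkPoolWorker-2']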
@@ -265,18 +271,25 @@ def index(self, imgpath):
             filename_mapping[imgid] = tar_path
         return compressed, filename_mapping
     def close(self):
-        self.compressed.close()
+        for compressed in self.compressed.values():
+            compressed.close()
     def extract_from_file(self, tar_path):
-        bytes = self.compressed.extractfile(tar_path).read()
+        pid = multiprocessing.current_process()
+        if not pid in self.compressed:
+            print("Opening tar file on thread:",pid)
+            self.compressed[pid] = tarfile.open(self.imgpath)
+        bytes = self.compressed[pid].extractfile(tar_path).read()
         return self.convert_to_image(tar_path, bytes)

 class ZipInterface(Interface):
     @classmethod
     def matches(cls, filename):
         return not os.path.isdir(filename) and zipfile.is_zipfile(filename)
-    def __init__(self, filename, path_length):
+    def __init__(self, imgpath, path_length):
         self.path_length = path_length
-        self.compressed, self.filename_mapping = self.load_dataset(filename)
+        self.imgpath = imgpath
+        compressed, self.filename_mapping = self.load_dataset(imgpath)
+        self.compressed = {multiprocessing.current_process():compressed}
     def get_image(self, imgid):
         archive_path = self.filename_mapping[imgid]
         return self.extract_from_file(archive_path)
@@ -285,25 +298,28 @@ def index(self, imgpath):
         zip_infos = compressed.infolist()
         filename_mapping = {}
         for zip_info in zip_infos:
-            # print(zip_info)
             if not zip_info.is_dir():
                 zip_path = zip_info.filename
                 imgid = last_n_in_filepath(zip_path, self.path_length)
                 filename_mapping[imgid] = zip_path
         return compressed, filename_mapping
     def extract_from_file(self, zip_path):
-        bytes = self.compressed.open(zip_path).read()
+        if not multiprocessing.current_process() in self.compressed:
+            print("Opening zip file on thread:",multiprocessing.current_process())
+            self.compressed[multiprocessing.current_process()] = zipfile.ZipFile(self.imgpath)
+        bytes = self.compressed[multiprocessing.current_process()].open(zip_path).read()
         return self.convert_to_image(zip_path, bytes)
     def close(self):
-        self.compressed.close()
+        for compressed in self.compressed.values():
+            compressed.close()

 class FolderInterface(Interface):
     @classmethod
     def matches(cls, filename):
         return os.path.isdir(filename)
-    def __init__(self, filename, path_length):
+    def __init__(self, imgpath, path_length):
         self.path_length = path_length
-        self.path, self.filename_mapping = self.load_dataset(filename)
+        self.path, self.filename_mapping = self.load_dataset(imgpath)
     def get_image(self, imgid):
         archive_path = self.filename_mapping[imgid]
         with open(archive_path,"rb") as handle:
@@ -316,7 +332,7 @@ def index(self, imgpath):
             if not os.path.isdir(path):
                 imgid = last_n_in_filepath(path, self.path_length)
                 filename_mapping[imgid] = path
-        #print(filename_mapping)
+        print(filename_mapping)
         return imgpath, filename_mapping
     def close(self):
         pass
@@ -340,9 +356,9 @@ def matches(cls, filename):
             if is_archive(item):
                 return True
         return False
-    def __init__(self, filename, path_length):
+    def __init__(self, imgpath, path_length):
         self.path_length = path_length
-        self.archives, self.filename_mapping = self.load_dataset(filename)
+        self.archives, self.filename_mapping = self.load_dataset(imgpath)
     def get_image(self, imgid):
         path_to_archive = self.filename_mapping[imgid]
         return self.archives[path_to_archive].get_image(imgid)
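
Both interfaces apply the same pattern: a handle is opened lazily the first time a process touches the archive, because a tarfile.TarFile or zipfile.ZipFile wraps an open file descriptor with seek state that forked DataLoader workers cannot safely share. A minimal self-contained sketch of the pattern (the class name PerProcessTar is illustrative, not from the PR):

import multiprocessing
import tarfile

class PerProcessTar:
    """One lazily opened tarfile handle per process."""

    def __init__(self, imgpath):
        self.imgpath = imgpath
        # Seed with a handle for the process that built the object.
        self.handles = {multiprocessing.current_process(): tarfile.open(imgpath)}

    def read(self, member):
        proc = multiprocessing.current_process()
        if proc not in self.handles:
            # First access from a forked worker: the inherited handle
            # belongs to the parent, so open a fresh one here.
            self.handles[proc] = tarfile.open(self.imgpath)
        return self.handles[proc].extractfile(member).read()

    def close(self):
        for handle in self.handles.values():
            handle.close()

With this in place, a torch.utils.data.DataLoader with num_workers > 0 can read from the archive without handle contention, at the cost of one open file descriptor per worker.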