Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tarfiles #27

Open
wants to merge 54 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
9f3eb7b
Open MIMIC from tarfile
bganglia Aug 3, 2020
0414355
Merge branch 'master' of https://github.com/ieee8023/torchxrayvision …
bganglia Aug 6, 2020
289747d
Merge branch 'master' of https://github.com/ieee8023/torchxrayvision …
bganglia Aug 6, 2020
303832c
revert whitespace
bganglia Aug 8, 2020
da2490b
don't use get_image() in NIH_Dataset
bganglia Aug 8, 2020
8fb3f03
NIH_Dataset extends TarDataset
bganglia Aug 8, 2020
395e5e4
Store tarfiles in dictionary
bganglia Aug 8, 2020
fa69973
use getnames instead of getmembers
bganglia Aug 8, 2020
abbbfec
use O(n) method for determining imgid from tar_path
bganglia Aug 9, 2020
2ba6f5d
random data in MIMIC format
bganglia Aug 9, 2020
cacc3ad
script for generating random MIMIC data
bganglia Aug 9, 2020
ecbf302
track random MIMIC data
bganglia Aug 9, 2020
04f1a32
tarfile test using random MIMIC data
bganglia Aug 9, 2020
90129ab
fix test directory
bganglia Aug 9, 2020
0aa52a7
use .close() on tarfile and regenerate test directory
bganglia Aug 9, 2020
349babb
support for tarfiles in NIH dataset
bganglia Aug 9, 2020
6999bd3
Inherit from TarDataset in PC_Dataset
bganglia Aug 10, 2020
842ddf8
Storage-agnostic dataset
bganglia Aug 10, 2020
37afa4e
Inherit from storage agnostic loader
bganglia Aug 10, 2020
bbd4007
tidy up tarfile code
bganglia Aug 10, 2020
34daddb
remove previous TarDataset, ZipDataset classes
bganglia Aug 10, 2020
727d9ff
Scripts for generating test data
bganglia Aug 13, 2020
d2ae7c0
Test data
bganglia Aug 13, 2020
41b50c4
Tests for zip, tar in MIMIC, NIH, and PC
bganglia Aug 13, 2020
48d8170
clean up storage classes
bganglia Aug 13, 2020
5c4117e
save progress
bganglia Aug 26, 2020
2773c69
inherit from Dataset in NIH_Dataset
bganglia Aug 26, 2020
7ffc252
Add code for automated tests with script-generated data
bganglia Aug 26, 2020
68a71ae
script for writing random data
bganglia Aug 26, 2020
ec9777b
fall back on .index() instead of trying to load a cached version in .…
bganglia Aug 26, 2020
29498a6
support multiprocessing
bganglia Aug 27, 2020
3674357
Clean up new code for tests and format interfaces
bganglia Aug 27, 2020
ccec9ae
write partial metadata files with subset of columns
bganglia Aug 27, 2020
c091734
Improve caching
bganglia Aug 27, 2020
e56a565
fix tests
bganglia Aug 28, 2020
1dde4b7
fix error in data-generation script
bganglia Aug 28, 2020
1628db4
create .torchxrayvision if it does not already exist
bganglia Aug 28, 2020
124467c
fix line adding .torchxrayvision
bganglia Aug 28, 2020
28816e5
Commit sample data for testing NLM_TB datasets, instead of auto-gener…
bganglia Aug 28, 2020
ce38e57
Commit covid test cases
bganglia Aug 28, 2020
281935c
Include parallel tests again
bganglia Aug 28, 2020
9c2c9d2
trycatch on reading/writing stored_mappings, with disk_unwriteable_ou…
bganglia Aug 28, 2020
7c6aebb
work when .torchxrayvision is not writeable
bganglia Aug 28, 2020
cb97e70
remove some print statements
bganglia Aug 28, 2020
950ae96
add test simulating an unwriteable disk
bganglia Aug 28, 2020
300c9d7
use filesystem instead of dictionary
bganglia Aug 28, 2020
218fa75
rewrite data generation scripts as python, not bash scripts; add para…
bganglia Aug 30, 2020
b22cead
cleanup: better variable names and use blake2b instead of hash (works…
bganglia Aug 31, 2020
ae09bc9
Add test for asserting a dataset loads faster the second time
bganglia Aug 31, 2020
30c043b
Don't invoke duration test, to avoid spurious errors
bganglia Aug 31, 2020
bfdebf2
Call on new data generation script
bganglia Aug 31, 2020
0f7ea51
simplify and improve documentation
bganglia Sep 5, 2020
71c7a50
reorganize
bganglia Sep 19, 2020
1715b9d
Fix path length in CheX_Dataset
bganglia Sep 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
script for writing random data
  • Loading branch information
bganglia committed Aug 26, 2020
commit 68a71ae631ae20d764eb2b1400acdce01bf14578
253 changes: 253 additions & 0 deletions tests/random_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import tarfile
import datetime
import numpy as np
import zipfile
import os
import shutil
import random
from PIL import Image
import glob
import pdb
import pydicom
import io
from pydicom.dataset import FileMetaDataset, FileDataset
from pydicom.encaps import encapsulate
import pydicom
import tempfile

import io
from PIL import Image, ImageDraw
from pydicom.dataset import Dataset
from pydicom.uid import generate_uid, JPEGExtended
from pydicom._storage_sopclass_uids import SecondaryCaptureImageStorage
from pydicom import dcmread
from pydicom.encaps import encapsulate
import numpy as np

def np_to_dcm(image, filename):
    """Write *image* (PIL Image or H x W x 3 uint8 array) to *filename* as a
    minimal Secondary Capture DICOM dataset with RGB pixel data.

    NOTE(review): the pixel bytes are stored raw while the transfer syntax is
    declared as JPEGExtended, so a strict reader may mis-decode them; this is
    tolerated here because the output is only random test fixture data.
    """
    image = np.array(image)
    # np.array(PIL.Image) yields (rows, cols, channels): height is axis 0.
    # The original read HEIGHT from shape[2] (the channel count), making
    # ds.Rows always 3 regardless of the actual image size.
    HEIGHT = image.shape[0]
    WIDTH = image.shape[1]
    ds = Dataset()
    ds.is_little_endian = True
    ds.is_implicit_VR = True
    ds.SOPClassUID = SecondaryCaptureImageStorage
    ds.SOPInstanceUID = generate_uid()
    ds.fix_meta_info()
    ds.Modality = "OT"
    ds.SamplesPerPixel = 3
    ds.BitsAllocated = 8
    ds.BitsStored = 8
    ds.HighBit = 7
    ds.PixelRepresentation = 0
    ds.PlanarConfiguration = 1
    ds.PhotometricInterpretation = "RGB"
    ds.Rows = HEIGHT
    ds.Columns = WIDTH
    # Encapsulate the raw frame; undefined length is required for
    # encapsulated PixelData.
    ds.PixelData = encapsulate([image.tobytes()])
    ds["PixelData"].is_undefined_length = True
    ds.PhotometricInterpretation = "YBR_FULL_422"
    ds.file_meta.TransferSyntaxUID = JPEGExtended
    ds.save_as(filename, write_like_original=False)

#def np_to_dcm(arr, filename):
# #Create object corresponding to file
# meta = FileMetaDataset()
# meta.MediaStorageSOPClassUID = '1.2.840.10008.5.1.4.1.1.2'
# meta.MediaStorageSOPInstanceUID = "1.2.3"
# meta.ImplementationClassUID = "1.2.3.4"
# dataset = FileDataset(filename, {}, file_meta = meta, preamble=b"\0" * 128)
# dataset.PatientName = "Test^Firstname"
# dataset.PatientID = "123456"
# #ds.is_little_endian = True
# dataset.is_little_endian = True
# dataset.is_implicit_VR = True
# dataset.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRBigEndian
# # Set creation date/time
# dt = datetime.datetime.now()
# dataset.ContentDate = dt.strftime('%Y%m%d')
# timeStr = dt.strftime('%H%M%S.%f') # long format with micro seconds
# dataset.ContentTime = timeStr

# dataset.Rows, dataset.Columns = arr.size

# #ds.SOPClassUID = pydicom._storage_sopclass_uids.MRImageStorage
# dataset.PatientName = "Test^Firstname"
# dataset.PatientID = "123456"

# #ds.Modality = "CT"
# #ds.SeriesInstanceUID = pydicom.uid.generate_uid()
# #ds.StudyInstanceUID = pydicom.uid.generate_uid()
# #ds.FrameOfReferenceUID = pydicom.uid.generate_uid()

# dataset.BitsStored = 16
# dataset.BitsAllocated = 16
# dataset.SamplesPerPixel = 1
# dataset.HighBit = 15
# #ds.SliceLocation = DCM_SliceLocation
# #ds.SpacingBetweenSlices = 1
# #ds.SliceThickness = 4
# #ds.ScanLength = length

# dataset.ImagesInAcquisition = "1"

# dataset.InstanceNumber = 1

# #ds.ImagePositionPatient = r"-159\-174"+ "\\-" + str(DCM_SliceLocation*4) #default of 6, sometimes 1
# #ds.ImageOrientationPatient = r"1\0\0\0\-1\0"
# #ds.ImageType = r"ORIGINAL\PRIMARY\AXIAL"

# dataset.RescaleIntercept = "0"
# dataset.RescaleSlope = "1"
# dataset.PixelSpacing = r"0.683594\0.683594"# r"1\1"
# dataset.PhotometricInterpretation = "MONOCHROME2"
# dataset.PixelRepresentation = 1

# #Store image as bytes
# bytes_img = io.BytesIO()
# arr.save(bytes_img, format="PNG")
# #Add byte image to file
# dataset.PixelData = encapsulate(bytes_img.read())
# pdb.set_trace()
# #dataset.pixel_data = np.array(arr)
# #Write file
# dataset.save_as(filename)

def save_as_dicom(arr, filename):
    """Save image array *arr* to *filename* as a bare-bones DICOM file.

    NOTE(review): apparently unused in this module (write_random_images calls
    np_to_dcm instead); fixed and kept for compatibility.
    """
    # FileDataset requires the filename/dataset arguments; the original
    # called FileDataset() bare, which raises TypeError.
    file_ds = FileDataset(filename, {}, file_meta=FileMetaDataset(),
                          preamble=b"\0" * 128)
    file_ds.binary_data = arr
    img_bytes = io.BytesIO()
    Image.fromarray(arr).save(img_bytes, format="PNG")
    # Store the PNG *bytes*; the original assigned the BytesIO object
    # itself, which pydicom cannot serialize.
    file_ds.PixelData = img_bytes.getvalue()
    file_ds.save_as(filename)

def generate_random_image(dimensions):
    """Return a PIL Image filled with uniformly random uint8 pixels.

    A 2-D ``dimensions`` is promoted to 3-D by appending an RGB channel axis.
    """
    shape = tuple(dimensions)
    if len(shape) == 2:
        shape += (3,)
    pixels = (np.random.random(shape) * 255).astype("uint8")
    return Image.fromarray(pixels)

class FolderOfArchive:
    """Writer that spreads files across several archives under one root.

    Every ``archive_size`` files a new archive is started at
    ``root/folder<n>/.../<archive name>`` where the folder component is
    repeated ``depth`` times.  Subclasses supply ``archive_format`` plus the
    ``get_new_archive`` / ``add_to_archive`` hooks.
    """
    folder_format = "folder{}"
    depth = 1
    def __init__(self, root, depth, archive_size=3):
        self.root = root
        self.depth = depth
        self.archive_size = archive_size
        # Start in the "full" state so the first write() opens archive 0.
        self.current_archive = -1
        self.archive_position = archive_size - 1
        self.archives = []
    def get_path_from_root(self, n):
        """Relative path of archive *n*: depth nested folders + file name."""
        return os.path.join(*(
            [self.folder_format.format(n)] * self.depth + \
            [self.archive_format.format(n)]
        ))
    def get_current_archive(self):
        """Return the archive to write into, rotating to a new one when full."""
        self.archive_position += 1
        if self.archive_position == self.archive_size:
            # Bump the index *before* naming the new archive.  The original
            # incremented afterwards, so the first archive was named with
            # index -1 (e.g. "folder-1/tar-1.tar").
            self.current_archive += 1
            new_path = os.path.join(
                self.root,
                self.get_path_from_root(self.current_archive)
            )
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            self.archives.append(self.get_new_archive(new_path))
            self.archive_position = 0
        return self.archives[-1]
    def close(self):
        """Close every archive opened so far."""
        for archive in self.archives:
            archive.close()
    def write(self, content):
        """Add *content* to the current archive, rotating if it is full."""
        curr = self.get_current_archive()
        self.add_to_archive(curr, content)
class FolderOfTar(FolderOfArchive):
    """FolderOfArchive variant that writes plain (uncompressed) tarfiles."""
    archive_format = "tar{}.tar"
    def get_new_archive(self, new_path):
        # Open a fresh tar archive for writing.
        return tarfile.open(new_path, "w")
    def add_to_archive(self, archive, content):
        # tarfile uses .add() to append a path.
        archive.add(content)

class FolderOfTarGz(FolderOfArchive):
    """FolderOfArchive variant that writes gzip-compressed tarfiles."""
    archive_format = "tar{}.tar.gz"
    def get_new_archive(self, new_path):
        # "w:gz" opens a gzip-compressed tar stream for writing.
        return tarfile.open(new_path, "w:gz")
    def add_to_archive(self, archive, content):
        # tarfile uses .add() to append a path.
        archive.add(content)

class FolderOfZip(FolderOfArchive):
    """FolderOfArchive variant that writes zip archives."""
    archive_format = "zip{}.zip"
    def get_new_archive(self, new_path):
        # Open a fresh zip archive for writing.
        return zipfile.ZipFile(new_path, "w")
    def add_to_archive(self, archive, content):
        # zipfile uses .write() to append a path.
        archive.write(content)

def write_random_images(paths, extracted, tarname, zipname, folder_of_zip_name, folder_of_tar_gz_name, dimensions, subfolder="."):
    """Generate random images at *paths* under *extracted*, then mirror them
    into a tar, a zip, and depth-1/depth-2 folders of zip and tar.gz archives.

    paths: iterable of image file names; names ending in ".dcm" are written
        as DICOM via np_to_dcm, everything else via PIL.
    extracted: pathlib.Path root of the plain-files output tree.
    dimensions: image shape forwarded to generate_random_image().
    """
    folder_of_zip_d1_name = str(folder_of_zip_name) + "_1"
    folder_of_zip_d2_name = str(folder_of_zip_name) + "_2"
    folder_of_tar_gz_d1_name = str(folder_of_tar_gz_name) + "_1"
    folder_of_tar_gz_d2_name = str(folder_of_tar_gz_name) + "_2"
    # Remove stale outputs first.  The original omitted the tar.gz folders
    # here, so re-running appended duplicate members into old archives.
    for path in [extracted, tarname, zipname,
                 folder_of_zip_d1_name, folder_of_zip_d2_name,
                 folder_of_tar_gz_d1_name, folder_of_tar_gz_d2_name]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:  # directory
                shutil.rmtree(path)
    # Write the raw image tree.
    for img_fname in paths:
        print(img_fname)
        img_path = extracted/subfolder/img_fname
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        random_image = generate_random_image(dimensions)
        if str(img_fname).endswith(".dcm"):
            np_to_dcm(random_image, img_path)
        else:
            random_image.save(img_path)
    tarred = tarfile.TarFile.open(tarname, "w")
    zipped = zipfile.ZipFile(zipname, "w")
    folder_of_zip_d1 = FolderOfZip(folder_of_zip_d1_name, 0)
    folder_of_zip_d2 = FolderOfZip(folder_of_zip_d2_name, 1)
    folder_of_tar_gz_d1 = FolderOfTarGz(folder_of_tar_gz_d1_name, 0)
    folder_of_tar_gz_d2 = FolderOfTarGz(folder_of_tar_gz_d2_name, 1)
    # Mirror every file in the extracted tree into each archive flavor.
    # (Renamed loop variable: "file" shadowed the builtin.)
    for entry in extracted.rglob("*"):
        if not os.path.isdir(entry):
            tarred.add(entry)
            zipped.write(entry)
            folder_of_zip_d1.write(entry)
            folder_of_zip_d2.write(entry)
            folder_of_tar_gz_d1.write(entry)
            folder_of_tar_gz_d2.write(entry)
    tarred.close()
    zipped.close()
    folder_of_zip_d1.close()
    folder_of_zip_d2.close()
    folder_of_tar_gz_d1.close()
    folder_of_tar_gz_d2.close()

def gen_hex(n):
    """Return a random string of *n* lowercase hexadecimal characters."""
    alphabet = list("0123456789abcdef")
    return "".join(np.random.choice(alphabet, n))

def gen_int(n):
    """Return a random string of *n* decimal digits.

    The original alphabet was copy-pasted from gen_hex and included
    "abcdef", so "integer" identifiers could contain hex letters.
    """
    digits = list("0123456789")
    return "".join(np.random.choice(digits, n))

def random_pred():
    """Return one random label value: positive, negative, zero, or blank."""
    options = ("1.0", "-1.0", "0.0", "")
    return random.choice(options)

def random_preds():
    """Return a dict mapping each pathology label used in the test metadata
    to a random prediction string from random_pred()."""
    labels = [
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "No Finding",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    ]
    return {label: random_pred() for label in labels}