Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parameterize dtype for h5path with SlideData constructor #335

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/source/h5path.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ Whole-slide masks are stored in the ``masks/`` Group. All masks are enforced to
However, when running a pipeline, these masks are moved to the tile-level and stored within the tile groups.
The slide-level masks are therefore not saved when calling :meth:`SlideData.write() <pathml.core.SlideData.write>`.

We use ``float16`` as the data type for all Datasets.
By default, we use ``float16`` as the data type for all image Datasets and HDF5's
`ENUM type <https://docs.h5py.org/en/stable/special.html#enumerated-types>`_
for masks, which are stored as 8-bit integers. Changing the ``dtype`` in
the ``SlideData`` constructor changes the data type used to store images.

.. note:: Be aware that the ``h5path`` format specification may change between major versions

Expand Down
15 changes: 11 additions & 4 deletions pathml/core/h5managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
import pathml.core
import pathml.core.masks
import pathml.core.tile
from pathml.core.utils import readcounts
from pathml.core.utils import readcounts, get_tiles_dtype


class h5pathManager:
"""
Interface between slidedata object and data management on disk by h5py.
Interface between SlideData object and data management on disk by h5py.
"""

def __init__(self, h5path=None, slidedata=None):
Expand Down Expand Up @@ -49,9 +49,12 @@ def __init__(self, h5path=None, slidedata=None):
self.counts.filename = (
str(self.countspath.name) + "/tmpfile.h5ad"
)
# Default to float16 if there are no tiles
self.dtype = get_tiles_dtype(h5path) or np.dtype("float16")

else:
assert slidedata, "must pass slidedata object to create h5path"
self.dtype = slidedata.dtype
# initialize h5path file hierarchy
# fields
fieldsgroup = self.h5.create_group("fields")
Expand Down Expand Up @@ -136,7 +139,7 @@ def add_tile(self, tile):
compression="gzip",
compression_opts=5,
shuffle=True,
dtype="float16",
dtype=self.dtype,
)

# save tile_shape as an attribute to enforce consistency
Expand All @@ -156,7 +159,11 @@ def add_tile(self, tile):
self.h5["tiles"][str(tile.coords)]["masks"].create_dataset(
str(key),
data=mask,
dtype="float16",
chunks=True,
compression="gzip",
compression_opts=9, # masks should be highly compressible
shuffle=True,
dtype=mask.dtype,
)

# add coords
Expand Down
7 changes: 7 additions & 0 deletions pathml/core/slide_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from loguru import logger

import pathml.core
from pathml.core.utils import get_tiles_dtype
import pathml.preprocessing.pipeline
from pathml.core.slide_types import SlideType

Expand Down Expand Up @@ -74,6 +75,7 @@ class SlideData:
time_series (bool, optional): Flag indicating whether the image is a time series.
Defaults to ``None``. Ignored if ``slide_type`` is specified.
counts (anndata.AnnData): object containing counts matrix associated with image quantification
dtype (np.dtype): datatype for image storage
"""

def __init__(
Expand Down Expand Up @@ -177,11 +179,15 @@ def __init__(
self.name = name
self.labels = labels
self.slide_type = slide_type
self.dtype = np.dtype('float16') if dtype is None else np.dtype(dtype)

if _load_from_h5path:
# populate the SlideData object from existing h5path file
with h5py.File(filepath, "r") as f:
self.h5manager = pathml.core.h5managers.h5pathManager(h5path=f)
self.dtype = get_tiles_dtype(f)
if dtype != self.dtype and dtype is not None:
logger.info(f"using dtype {self.dtype} from h5path instead of {dtype}")
self.name = self.h5manager.h5["fields"].attrs["name"]
self.labels = {
key: val
Expand Down Expand Up @@ -212,6 +218,7 @@ def __repr__(self):
if self.backend:
out.append(f"backend={repr(self.backend)}")
out.append(f"image shape: {self.shape}")
out.append(f"image dtype: {self.dtype}")
try:
nlevels = self.slide.level_count
# TODO: change to specific exception
Expand Down
19 changes: 19 additions & 0 deletions pathml/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,22 @@ def readcounts(h5):
for ds in h5.keys():
h5.copy(ds, f)
return anndata.read_h5ad(path.name)


def get_tiles_dtype(h5):
    """
    Return the dtype shared by all tile images in an h5path file.

    Args:
        h5: open h5path file (or any mapping with the same layout) in which
            ``h5["tiles"][<tile>]["array"]`` exposes a ``dtype`` attribute.

    Returns:
        np.dtype or None if no tiles in file

    Raises:
        AssertionError: if any tile's dtype differs from the first tile's dtype.
    """
    tiles = h5["tiles"]
    dtype = None
    for tile in tiles:
        tile_dtype = tiles[tile]["array"].dtype
        if dtype is None:
            # The first tile defines the reference dtype for the whole file.
            dtype = tile_dtype
        elif tile_dtype != dtype:
            # Raised explicitly instead of via ``assert`` so the consistency
            # check still runs when Python is started with ``-O``.
            raise AssertionError(
                f"all tiles must have the same dtype. Tile {tile} has dtype {tile_dtype} instead of {dtype}"
            )
    return dtype
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def create_HE_tile():
im_np_rgb = cv2.cvtColor(im_np, cv2.COLOR_RGBA2RGB)
# make mask object
masks = np.random.randint(
low=1, high=255, size=(im_np_rgb.shape[0], im_np_rgb.shape[1]), dtype=np.uint8
low=0, high=255, size=(im_np_rgb.shape[0], im_np_rgb.shape[1]), dtype=np.uint8
)
masks = {"testmask": masks}
# labels dict
Expand Down Expand Up @@ -74,7 +74,7 @@ def tileVectra():

# make mask object
masks = np.random.randint(
low=1, high=255, size=(region.shape[0], region.shape[1]), dtype=np.uint8
low=0, high=255, size=(region.shape[0], region.shape[1]), dtype=np.uint8
)
masks = {"testmask": masks}

Expand Down
17 changes: 13 additions & 4 deletions tests/core_tests/test_h5managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,26 @@ def test_h5manager2(tileHE):


def test_tile_dtype_HE(tileHE):
    """Test that tiles have default float16 dtype and masks are bool"""
    slidedata = HESlide("tests/testdata/small_HE.svs")
    slidedata.tiles.add(tileHE)
    tile_retrieved = slidedata.tiles[tileHE.coords]
    # No dtype is passed to the slide, so the image is expected in the default float16.
    assert tile_retrieved.image.dtype == np.float16
    # Only one mask dtype expectation can hold; the mask is checked as bool,
    # not against the image dtype.
    assert tile_retrieved.masks["testmask"].dtype == bool

def test_tile_dtype_HE_uint8(tileHE):
    """Test that tiles have modified uint8 dtype and masks are bool"""
    slidedata = HESlide("tests/testdata/small_HE.svs", dtype=np.dtype('uint8'))
    slidedata.tiles.add(tileHE)
    tile_retrieved = slidedata.tiles[tileHE.coords]
    # No debugger breakpoint here: a pdb.set_trace() call would block
    # non-interactive test runs waiting for input.
    assert tile_retrieved.image.dtype == np.uint8
    assert tile_retrieved.masks["testmask"].dtype == bool


def test_tile_dtype_IF(tileVectra, vectra_slide):
    """Test that tiles have float16 dtype and masks have bool dtype."""
    vectra_slide.tiles.add(tileVectra)
    tile_retrieved = vectra_slide.tiles[tileVectra.coords]
    assert tile_retrieved.image.dtype == np.float16
    # Only one mask dtype expectation can hold; the mask is checked as bool,
    # not against the image dtype.
    assert tile_retrieved.masks["testmask"].dtype == bool