Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datasets] Improve size estimation of image folder data source #27219

Merged
merged 6 commits into from
Aug 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 120 additions & 4 deletions python/ray/data/datasource/image_folder_datasource.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,42 @@
import io
import logging
import pathlib
from typing import TYPE_CHECKING, Tuple, Optional
import time
from typing import TYPE_CHECKING, List, Optional, Tuple, Union

import numpy as np

from ray.data._internal.util import _check_import
from ray.data.block import Block, BlockMetadata
from ray.data.datasource.binary_datasource import BinaryDatasource
from ray.data.datasource.datasource import Reader
from ray.data.datasource.file_based_datasource import (
_FileBasedDatasourceReader,
FileBasedDatasource,
_resolve_paths_and_filesystem,
FileExtensionFilter,
)
from ray.data.datasource.file_meta_provider import DefaultFileMetadataProvider
from ray.data.datasource.partitioning import PathPartitionFilter
from ray.util.annotations import DeveloperAPI

if TYPE_CHECKING:
import pyarrow
from ray.data.block import T


logger = logging.getLogger(__name__)

IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"]

# The default size multiplier for reading image data source.
# This essentially is using image on-disk file size to estimate
# in-memory data size.
IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT = 1

# The lower bound value to estimate image encoding ratio.
IMAGE_ENCODING_RATIO_ESTIMATE_LOWER_BOUND = 0.5


@DeveloperAPI
class ImageFolderDatasource(BinaryDatasource):
Expand Down Expand Up @@ -119,10 +137,11 @@ def create_reader(
paths, filesystem = _resolve_paths_and_filesystem([root])
root = paths[0]

return super().create_reader(
return _ImageFolderDatasourceReader(
delegate=self,
paths=paths,
partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
filesystem=filesystem,
partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
root=root,
size=size,
mode=mode,
Expand All @@ -135,7 +154,7 @@ def _read_file(
root: str,
size: Optional[Tuple[int, int]],
mode: Optional[str],
):
) -> Block:
import pandas as pd
from PIL import Image

Expand All @@ -160,6 +179,103 @@ def _read_file(
)


class _ImageFileMetadataProvider(DefaultFileMetadataProvider):
    """Metadata provider that scales on-disk file sizes by an encoding ratio.

    Image files are compressed on disk, so the raw file size underestimates
    the decoded in-memory size. The owning reader computes an encoding ratio
    (in-memory size / on-disk size) and injects it via ``_set_encoding_ratio``
    before block metadata is produced.
    """

    def _set_encoding_ratio(self, encoding_ratio: float):
        """Set image file encoding ratio, to provide accurate size in bytes metadata."""
        # The ratio is a float (e.g. lower-bounded at 0.5 by the reader),
        # so it is annotated as `float`, not `int`.
        self._encoding_ratio = encoding_ratio

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        rows_per_file: Optional[int],
        file_sizes: List[Optional[int]],
    ) -> BlockMetadata:
        # Delegate to the default provider, then scale the on-disk size
        # estimate into an in-memory estimate.
        metadata = super()._get_block_metadata(
            paths, schema, rows_per_file=rows_per_file, file_sizes=file_sizes
        )
        if metadata.size_bytes is not None:
            metadata.size_bytes = int(metadata.size_bytes * self._encoding_ratio)
        return metadata


class _ImageFolderDatasourceReader(_FileBasedDatasourceReader):
    """Reader for ``ImageFolderDatasource`` with improved size estimation.

    Computes an encoding ratio (decoded in-memory size over on-disk size) for
    the input image files, uses it for ``estimate_inmemory_data_size``, and
    propagates it to the metadata provider so block metadata is scaled too.
    """

    def __init__(
        self,
        delegate: FileBasedDatasource,
        paths: List[str],
        filesystem: "pyarrow.fs.FileSystem",
        partition_filter: PathPartitionFilter,
        meta_provider: Optional[_ImageFileMetadataProvider] = None,
        **reader_args,
    ):
        # Create a fresh provider per reader rather than using a shared
        # default instance: `_set_encoding_ratio` mutates the provider, so a
        # default created at `def` time would have its ratio clobbered by
        # every subsequently-constructed reader (mutable-default pitfall).
        if meta_provider is None:
            meta_provider = _ImageFileMetadataProvider()
        super().__init__(
            delegate=delegate,
            paths=paths,
            filesystem=filesystem,
            schema=None,
            open_stream_args=None,
            meta_provider=meta_provider,
            partition_filter=partition_filter,
            **reader_args,
        )
        self._encoding_ratio = self._estimate_files_encoding_ratio()
        meta_provider._set_encoding_ratio(self._encoding_ratio)

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return the estimated decoded size in bytes of all input files."""
        # Truncate to int to match the declared return type (the ratio is a
        # float, so the product would otherwise be a float).
        return int(sum(self._file_sizes) * self._encoding_ratio)

    def _estimate_files_encoding_ratio(self) -> float:
        """Return an estimate of the image files encoding ratio.

        When both ``size`` and ``mode`` reader args are given, all images are
        resized to a homogeneous shape, so the decoded size per image can be
        computed exactly; otherwise fall back to the default ratio (i.e.
        assume in-memory size equals on-disk size).
        """
        start_time = time.perf_counter()
        # Filter out empty files to avoid noise.
        non_empty_path_and_size = list(
            filter(lambda p: p[1] > 0, zip(self._paths, self._file_sizes))
        )
        num_files = len(non_empty_path_and_size)
        if num_files == 0:
            # `Logger.warning`, not the deprecated `Logger.warn` alias.
            logger.warning(
                "All input image files are empty. "
                "Use on-disk file size to estimate images in-memory size."
            )
            return IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT

        size = self._reader_args.get("size")
        mode = self._reader_args.get("mode")
        if size is not None and mode is not None:
            # Use image size and mode to calculate data size for all images,
            # because all images are homogeneous with same size after
            # resizing. Resizing is enforced when reading every image in
            # ImageFolderDatasource when the `size` argument is provided.
            # `dimension` is the number of bytes per pixel for the PIL mode.
            if mode in ["1", "L", "P"]:
                dimension = 1
            elif mode in ["RGB", "YCbCr", "LAB", "HSV"]:
                dimension = 3
            elif mode in ["RGBA", "CMYK", "I", "F"]:
                dimension = 4
            else:
                logger.warning(f"Found unknown image mode: {mode}.")
                return IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT
            height, width = size
            single_image_size = height * width * dimension
            total_estimated_size = single_image_size * num_files
            total_file_size = sum(p[1] for p in non_empty_path_and_size)
            ratio = total_estimated_size / total_file_size
        else:
            # TODO(chengsu): sample images to estimate data size
            ratio = IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT

        sampling_duration = time.perf_counter() - start_time
        if sampling_duration > 5:
            logger.warning(
                "Image input size estimation took "
                f"{round(sampling_duration, 2)} seconds."
            )
        logger.debug(f"Estimated image encoding ratio from sampling is {ratio}.")
        # Clamp below so a pathological estimate never collapses to ~0.
        return max(ratio, IMAGE_ENCODING_RATIO_ESTIMATE_LOWER_BOUND)


def _get_class_from_path(path: str, root: str) -> str:
# The class is the name of the first directory after the root. For example, if
# the root is "/data/imagenet/train" and the path is
Expand Down
52 changes: 51 additions & 1 deletion python/ray/data/tests/test_dataset_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
import pyarrow.parquet as pq
import pytest
from ray.data.datasource.file_meta_provider import _handle_read_os_error
from ray.data.datasource.image_folder_datasource import (
IMAGE_EXTENSIONS,
_ImageFolderDatasourceReader,
)
import requests
import snappy
from fsspec.implementations.local import LocalFileSystem
Expand All @@ -36,7 +40,10 @@
SimpleTorchDatasource,
WriteResult,
)
from ray.data.datasource.file_based_datasource import _unwrap_protocol
from ray.data.datasource.file_based_datasource import (
FileExtensionFilter,
_unwrap_protocol,
)
from ray.data.datasource.parquet_datasource import (
PARALLELIZE_META_FETCH_THRESHOLD,
_ParquetDatasourceReader,
Expand Down Expand Up @@ -2952,6 +2959,49 @@ def preprocess(df):
predictor.predict(dataset, feature_columns=["image"])


@pytest.mark.parametrize(
    "image_size,image_mode,expected_size,expected_ratio",
    [(64, "RGB", 30000, 4), (32, "L", 3500, 0.5), (256, "RGBA", 750000, 85)],
)
def test_image_folder_reader_estimate_data_size(
    ray_start_regular_shared, image_size, image_mode, expected_size, expected_ratio
):
    """Verify the Dataset-level and reader-level size estimates both fall
    within [expected, 1.5 * expected] for several image size/mode combos,
    and that the estimated encoding ratio is in the expected band."""
    # Ray's bundled example-data protocol; the scraped text had a mangled
    # "example:https://" scheme here — "example://" is the real protocol.
    root = "example://image-folders/different-sizes"
    ds = ray.data.read_datasource(
        ImageFolderDatasource(),
        root=root,
        size=(image_size, image_size),
        mode=image_mode,
    )

    # Estimate before execution (metadata-only), then after full execution.
    data_size = ds.size_bytes()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "estimated data size is out of expected bound"
    data_size = ds.fully_executed().size_bytes()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "actual data size is out of expected bound"

    # Exercise the reader directly to check the ratio and its estimate.
    reader = _ImageFolderDatasourceReader(
        delegate=ImageFolderDatasource(),
        paths=[root],
        filesystem=LocalFileSystem(),
        partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
        root=root,
        size=(image_size, image_size),
        mode=image_mode,
    )
    assert (
        reader._encoding_ratio >= expected_ratio
        and reader._encoding_ratio <= expected_ratio * 1.5
    ), "encoding ratio is out of expected bound"
    data_size = reader.estimate_inmemory_data_size()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "estimated data size is out of expected bound"

# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasource_ray_remote_args test, which uses a cluster_utils cluster, will
Expand Down