Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datasets] Improve size estimation of image folder data source #27219

Merged
merged 6 commits into from
Aug 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 120 additions & 4 deletions python/ray/data/datasource/image_folder_datasource.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,42 @@
import io
import logging
import pathlib
from typing import TYPE_CHECKING, Tuple, Optional
import time
from typing import TYPE_CHECKING, List, Optional, Tuple, Union

import numpy as np

from ray.data._internal.util import _check_import
from ray.data.block import Block, BlockMetadata
from ray.data.datasource.binary_datasource import BinaryDatasource
from ray.data.datasource.datasource import Reader
from ray.data.datasource.file_based_datasource import (
_FileBasedDatasourceReader,
FileBasedDatasource,
_resolve_paths_and_filesystem,
FileExtensionFilter,
)
from ray.data.datasource.file_meta_provider import DefaultFileMetadataProvider
from ray.data.datasource.partitioning import PathPartitionFilter
from ray.util.annotations import DeveloperAPI

if TYPE_CHECKING:
import pyarrow
from ray.data.block import T


logger = logging.getLogger(__name__)

IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"]

# The default size multiplier for reading image data source.
# This essentially is using image on-disk file size to estimate
# in-memory data size.
IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT = 1

# The lower bound value to estimate image encoding ratio.
IMAGE_ENCODING_RATIO_ESTIMATE_LOWER_BOUND = 0.5


@DeveloperAPI
class ImageFolderDatasource(BinaryDatasource):
Expand Down Expand Up @@ -119,10 +137,11 @@ def create_reader(
paths, filesystem = _resolve_paths_and_filesystem([root])
root = paths[0]

return super().create_reader(
return _ImageFolderDatasourceReader(
delegate=self,
paths=paths,
partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
filesystem=filesystem,
partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
root=root,
size=size,
mode=mode,
Expand All @@ -135,7 +154,7 @@ def _read_file(
root: str,
size: Optional[Tuple[int, int]],
mode: Optional[str],
):
) -> Block:
import pandas as pd
from PIL import Image

Expand All @@ -160,6 +179,103 @@ def _read_file(
)


class _ImageFileMetadataProvider(DefaultFileMetadataProvider):
    """Metadata provider that scales on-disk file sizes by an encoding ratio.

    Image files are compressed on disk, so the raw file size underestimates
    the decoded in-memory size. The owning reader computes an encoding ratio
    (in-memory size / on-disk size) and injects it via ``_set_encoding_ratio``
    before block metadata is produced.
    """

    def _set_encoding_ratio(self, encoding_ratio: float):
        """Set image file encoding ratio, to provide accurate size in bytes metadata."""
        # The ratio is a float (e.g. lower-bounded at 0.5 by the reader),
        # so it is annotated as `float`, not `int`.
        self._encoding_ratio = encoding_ratio

    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        rows_per_file: Optional[int],
        file_sizes: List[Optional[int]],
    ) -> BlockMetadata:
        # Delegate to the default provider, then scale the on-disk size
        # estimate into an in-memory estimate.
        metadata = super()._get_block_metadata(
            paths, schema, rows_per_file=rows_per_file, file_sizes=file_sizes
        )
        if metadata.size_bytes is not None:
            metadata.size_bytes = int(metadata.size_bytes * self._encoding_ratio)
        return metadata


class _ImageFolderDatasourceReader(_FileBasedDatasourceReader):
    """Reader for ``ImageFolderDatasource`` with improved size estimation.

    Computes an encoding ratio (decoded in-memory size over on-disk size) for
    the input image files, uses it for ``estimate_inmemory_data_size``, and
    propagates it to the metadata provider so block metadata is scaled too.
    """

    def __init__(
        self,
        delegate: FileBasedDatasource,
        paths: List[str],
        filesystem: "pyarrow.fs.FileSystem",
        partition_filter: PathPartitionFilter,
        meta_provider: Optional[_ImageFileMetadataProvider] = None,
        **reader_args,
    ):
        # Create a fresh provider per reader rather than using a shared
        # default instance: `_set_encoding_ratio` mutates the provider, so a
        # default created at `def` time would have its ratio clobbered by
        # every subsequently-constructed reader (mutable-default pitfall).
        if meta_provider is None:
            meta_provider = _ImageFileMetadataProvider()
        super().__init__(
            delegate=delegate,
            paths=paths,
            filesystem=filesystem,
            schema=None,
            open_stream_args=None,
            meta_provider=meta_provider,
            partition_filter=partition_filter,
            **reader_args,
        )
        self._encoding_ratio = self._estimate_files_encoding_ratio()
        meta_provider._set_encoding_ratio(self._encoding_ratio)

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return the estimated decoded size in bytes of all input files."""
        # Truncate to int to match the declared return type (the ratio is a
        # float, so the product would otherwise be a float).
        return int(sum(self._file_sizes) * self._encoding_ratio)

    def _estimate_files_encoding_ratio(self) -> float:
        """Return an estimate of the image files encoding ratio.

        When both ``size`` and ``mode`` reader args are given, all images are
        resized to a homogeneous shape, so the decoded size per image can be
        computed exactly; otherwise fall back to the default ratio (i.e.
        assume in-memory size equals on-disk size).
        """
        start_time = time.perf_counter()
        # Filter out empty files to avoid noise.
        non_empty_path_and_size = list(
            filter(lambda p: p[1] > 0, zip(self._paths, self._file_sizes))
        )
        num_files = len(non_empty_path_and_size)
        if num_files == 0:
            # `Logger.warning`, not the deprecated `Logger.warn` alias.
            logger.warning(
                "All input image files are empty. "
                "Use on-disk file size to estimate images in-memory size."
            )
            return IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT

        size = self._reader_args.get("size")
        mode = self._reader_args.get("mode")
        if size is not None and mode is not None:
            # Use image size and mode to calculate data size for all images,
            # because all images are homogeneous with same size after
            # resizing. Resizing is enforced when reading every image in
            # ImageFolderDatasource when the `size` argument is provided.
            # `dimension` is the number of bytes per pixel for the PIL mode.
            if mode in ["1", "L", "P"]:
                dimension = 1
            elif mode in ["RGB", "YCbCr", "LAB", "HSV"]:
                dimension = 3
            elif mode in ["RGBA", "CMYK", "I", "F"]:
                dimension = 4
            else:
                logger.warning(f"Found unknown image mode: {mode}.")
                return IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT
            height, width = size
            single_image_size = height * width * dimension
            total_estimated_size = single_image_size * num_files
            total_file_size = sum(p[1] for p in non_empty_path_and_size)
            ratio = total_estimated_size / total_file_size
        else:
            # TODO(chengsu): sample images to estimate data size
            ratio = IMAGE_ENCODING_RATIO_ESTIMATE_DEFAULT

        sampling_duration = time.perf_counter() - start_time
        if sampling_duration > 5:
            logger.warning(
                "Image input size estimation took "
                f"{round(sampling_duration, 2)} seconds."
            )
        logger.debug(f"Estimated image encoding ratio from sampling is {ratio}.")
        # Clamp below so a pathological estimate never collapses to ~0.
        return max(ratio, IMAGE_ENCODING_RATIO_ESTIMATE_LOWER_BOUND)


def _get_class_from_path(path: str, root: str) -> str:
# The class is the name of the first directory after the root. For example, if
# the root is "/data/imagenet/train" and the path is
Expand Down
52 changes: 51 additions & 1 deletion python/ray/data/tests/test_dataset_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
import pyarrow.parquet as pq
import pytest
from ray.data.datasource.file_meta_provider import _handle_read_os_error
from ray.data.datasource.image_folder_datasource import (
IMAGE_EXTENSIONS,
_ImageFolderDatasourceReader,
)
import requests
import snappy
from fsspec.implementations.local import LocalFileSystem
Expand All @@ -36,7 +40,10 @@
SimpleTorchDatasource,
WriteResult,
)
from ray.data.datasource.file_based_datasource import _unwrap_protocol
from ray.data.datasource.file_based_datasource import (
FileExtensionFilter,
_unwrap_protocol,
)
from ray.data.datasource.parquet_datasource import (
PARALLELIZE_META_FETCH_THRESHOLD,
_ParquetDatasourceReader,
Expand Down Expand Up @@ -2952,6 +2959,49 @@ def preprocess(df):
predictor.predict(dataset, feature_columns=["image"])


@pytest.mark.parametrize(
    "image_size,image_mode,expected_size,expected_ratio",
    [(64, "RGB", 30000, 4), (32, "L", 3500, 0.5), (256, "RGBA", 750000, 85)],
)
def test_image_folder_reader_estimate_data_size(
    ray_start_regular_shared, image_size, image_mode, expected_size, expected_ratio
):
    """Verify the Dataset-level and reader-level size estimates both fall
    within [expected, 1.5 * expected] for several image size/mode combos,
    and that the estimated encoding ratio is in the expected band."""
    # Ray's bundled example-data protocol; the scraped text had a mangled
    # "example:https://" scheme here — "example://" is the real protocol.
    root = "example://image-folders/different-sizes"
    ds = ray.data.read_datasource(
        ImageFolderDatasource(),
        root=root,
        size=(image_size, image_size),
        mode=image_mode,
    )

    # Estimate before execution (metadata-only), then after full execution.
    data_size = ds.size_bytes()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "estimated data size is out of expected bound"
    data_size = ds.fully_executed().size_bytes()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "actual data size is out of expected bound"

    # Exercise the reader directly to check the ratio and its estimate.
    reader = _ImageFolderDatasourceReader(
        delegate=ImageFolderDatasource(),
        paths=[root],
        filesystem=LocalFileSystem(),
        partition_filter=FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS),
        root=root,
        size=(image_size, image_size),
        mode=image_mode,
    )
    assert (
        reader._encoding_ratio >= expected_ratio
        and reader._encoding_ratio <= expected_ratio * 1.5
    ), "encoding ratio is out of expected bound"
    data_size = reader.estimate_inmemory_data_size()
    assert (
        data_size >= expected_size and data_size <= expected_size * 1.5
    ), "estimated data size is out of expected bound"

# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasource_ray_remote_args test, which uses a cluster_utils cluster, will
Expand Down