This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Data V2 #3700

Merged: 59 commits, Feb 26, 2020

Commits (59)
0c42cb9
example for feedback
DeNeutoy Jan 30, 2020
5ffedfc
Merge branch 'master' into data-v2
DeNeutoy Feb 19, 2020
80049f8
remove all existing multiprocessing
DeNeutoy Feb 19, 2020
6f58c2a
sneak torch datasets inside DatasetReader
DeNeutoy Feb 19, 2020
1b3ad9a
lint
DeNeutoy Feb 19, 2020
effc445
trainer_v2, We Love To See It
DeNeutoy Feb 19, 2020
9d44ad6
datasets have index_with now, not iterators
DeNeutoy Feb 19, 2020
7e89ea6
use iter, custom collate function in allennlp wrapper
DeNeutoy Feb 19, 2020
883b6d7
we don't even need the data in the trainer anymore
DeNeutoy Feb 19, 2020
56d022a
all trainer tests passing
DeNeutoy Feb 20, 2020
01e12f5
black
DeNeutoy Feb 20, 2020
5aea291
make find learning rate work
DeNeutoy Feb 20, 2020
f026946
update test fixtures to new config
DeNeutoy Feb 20, 2020
5973b50
get train command tests mostly working
DeNeutoy Feb 20, 2020
a23f47a
lazily construct samplers, index lazy datasets
DeNeutoy Feb 20, 2020
a76ea0a
Merge branch 'master' into data-v2
DeNeutoy Feb 20, 2020
ebf3854
update some fixtures
DeNeutoy Feb 20, 2020
57a67e5
evaluate tests passing
DeNeutoy Feb 20, 2020
7d21ed8
all command tests passing
DeNeutoy Feb 20, 2020
24a500c
lint
DeNeutoy Feb 20, 2020
fb13769
update model test case, common and module tests passing
DeNeutoy Feb 20, 2020
ef5187f
fix test interdependence introduced by #3762
DeNeutoy Feb 21, 2020
b1ea845
more test interdependence
DeNeutoy Feb 21, 2020
0231616
tests tests tests
DeNeutoy Feb 21, 2020
01d76bb
remove unnecessary brackets
DeNeutoy Feb 21, 2020
12b6efb
Merge branch 'master' into data-v2
DeNeutoy Feb 21, 2020
859d3ca
update a chunk of the configs
DeNeutoy Feb 21, 2020
c22dee3
fix archival test, couple more configs
DeNeutoy Feb 21, 2020
fe5b470
rm pointless gan test
DeNeutoy Feb 21, 2020
7533c91
more tests passing
DeNeutoy Feb 21, 2020
ad45659
add current state of from params changes
DeNeutoy Feb 21, 2020
f944840
Revert "add current state of from params changes"
DeNeutoy Feb 21, 2020
3b12a2f
Merge branch 'master' into data-v2
DeNeutoy Feb 21, 2020
be1f58c
updated understanding of Lazy
DeNeutoy Feb 21, 2020
ebdabe0
add discussion of None comparison to Lazy
DeNeutoy Feb 21, 2020
8693739
lint
DeNeutoy Feb 21, 2020
b9b0650
it's a hard doc life
DeNeutoy Feb 21, 2020
88314c7
pull samplers into separate file
DeNeutoy Feb 21, 2020
14296a1
more docs updates
DeNeutoy Feb 22, 2020
8a08899
fold in #3812
DeNeutoy Feb 22, 2020
3520280
remove torch dataset
DeNeutoy Feb 22, 2020
0f1d8a4
add example to lazy
DeNeutoy Feb 22, 2020
93e1e89
rename to collate
DeNeutoy Feb 22, 2020
40dd695
no kwargs
DeNeutoy Feb 23, 2020
da3b1b4
Revert "fold in #3812"
DeNeutoy Feb 23, 2020
801a8f5
don't break up dataset
DeNeutoy Feb 23, 2020
007fd0c
add comment to iterable dataset len
DeNeutoy Feb 23, 2020
d00e1a9
Merge branch 'master' into data-v2
DeNeutoy Feb 23, 2020
c066804
improve docstrings, build dataloader using partial_objects
DeNeutoy Feb 23, 2020
61c7b14
flake
DeNeutoy Feb 23, 2020
2b56b14
give dataloader a default implementation
DeNeutoy Feb 24, 2020
354010a
safer default for DataLoader init
DeNeutoy Feb 24, 2020
568291d
more coherent dir structure
DeNeutoy Feb 24, 2020
a016103
update imports
DeNeutoy Feb 24, 2020
47db16a
Merge branch 'master' into data-v2
DeNeutoy Feb 24, 2020
04fdb70
add a test for the BucketBatchSampler
DeNeutoy Feb 24, 2020
d1d5c4a
split bucket sampler into own file, tests
DeNeutoy Feb 24, 2020
5f0c8db
PR comments
DeNeutoy Feb 26, 2020
6f63a53
Merge branch 'master' into data-v2
DeNeutoy Feb 26, 2020
lazily construct samplers, index lazy datasets
DeNeutoy committed Feb 20, 2020
commit a23f47a04588a5f49a97ac4628ad49164b3279f7
16 changes: 14 additions & 2 deletions allennlp/data/dataset_readers/dataset_reader.py
@@ -59,13 +59,19 @@ def __iter__(self) -> Iterator[Instance]:
if self.cache_file is not None and os.path.exists(self.cache_file):
with open(self.cache_file) as data_file:
for line in data_file:
yield self.deserialize(line)
instance = self.deserialize(line)
if self.vocab is not None:
instance.index_fields(self.vocab)
yield instance

# Case 2: Need to cache instances
elif self.cache_file is not None:
with open(self.cache_file, "w") as data_file:
for instance in self.instance_generator():
data_file.write(self.serialize(instance))
data_file.write("\n")
if self.vocab is not None:
instance.index_fields(self.vocab)
yield instance
# Case 3: No cache
else:
@@ -74,11 +80,17 @@ def __iter__(self) -> Iterator[Instance]:
raise ConfigurationError(
"For a lazy dataset reader, _read() must return a generator"
)
yield from instances
for instance in instances:
if self.vocab is not None:
instance.index_fields(self.vocab)
yield instance

def index_with(self, vocab: Vocabulary):
self.vocab = vocab

def __len__(self):
return 1
Contributor:
Why does it return 1?

Contributor Author:
Yeah.... This is just moving previous behaviour from the iterator onto the dataset instead:

https://github.com/allenai/allennlp/blob/master/allennlp/data/iterators/data_iterator.py#L305

In a couple of places we rely on calling len on the iterator (or now, the dataloader) not raising an error. If you have an IterableDataset and call len, the pytorch dataloader actually spits out a warning - but we need the call itself not to crash.

Contributor:
I see. Maybe you can add a comment there?
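
For concreteness, a minimal sketch of what such a comment might look like (the wording here is illustrative; a later commit in this PR, "add comment to iterable dataset len", is where a comment was actually added):

```python
def __len__(self):
    # Carried over from the old DataIterator behaviour: a few callers only
    # need len() not to raise. For a lazy reader the true size is unknown,
    # and pytorch's DataLoader only emits a warning when len() is called on
    # an IterableDataset, so return a harmless placeholder.
    return 1
```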



class DatasetReader(Registrable):
"""
34 changes: 22 additions & 12 deletions allennlp/data/samplers/__init__.py
@@ -5,6 +5,7 @@
from allennlp.common.registrable import Registrable

from allennlp.common.util import add_noise_to_dict_values, lazy_groups_of
from allennlp.common.lazy import Lazy
from allennlp.data.batch import Batch as AllennlpBatch
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary
@@ -30,7 +31,7 @@ def __iter__(self) -> Iterable[List[int]]:

@Sampler.register("sequential")
class SequentialSampler(Sampler, data.SequentialSampler):
def __init__(self, data_source: data.Dataset):
def __init__(self, data_source: data.Dataset, **kwargs):
super().__init__(data_source)


@@ -47,7 +48,7 @@ class RandomSampler(Sampler, data.RandomSampler):
"""

def __init__(
self, data_source: data.Dataset, replacement: bool = False, num_samples: int = None
self, data_source: data.Dataset, replacement: bool = False, num_samples: int = None, **kwargs
):
super().__init__(data_source, replacement, num_samples)

@@ -60,7 +61,7 @@ class SubsetRandomSampler(Sampler, data.SubsetRandomSampler):
indices (sequence): a sequence of indices
"""

def __init__(self, indices: List[int]):
def __init__(self, indices: List[int], **kwargs):
super().__init__(indices)


@@ -82,7 +83,7 @@ class WeightedRandomSampler(Sampler, data.WeightedRandomSampler):
[0, 1, 4, 3, 2]
"""

def __init__(self, weights: List[float], num_samples: int, replacement: bool = True):
def __init__(self, weights: List[float], num_samples: int, replacement: bool = True, **kwargs):
super().__init__(weights, num_samples, replacement)


@@ -103,15 +104,15 @@ class BasicBatchSampler(BatchSampler, data.BatchSampler):
[[0, 1, 2], [3, 4, 5], [6, 7, 8]]
"""

def __init__(self, sampler: Sampler, batch_size: int, drop_last: bool):
def __init__(self, sampler: Sampler, batch_size: int, drop_last: bool, **kwargs):
super().__init__(sampler, batch_size, drop_last)


@BatchSampler.register("bucket")
class BatchInstanceSampler(BatchSampler):
def __init__(
self,
data: data.Dataset,
data_source: data.Dataset,
batch_size: int,
sorting_keys: List[Tuple[str, str]] = None,
padding_noise: float = 0.1,
@@ -121,7 +122,7 @@ def __init__(
self._sorting_keys = sorting_keys
self._padding_noise = padding_noise
self._batch_size = batch_size
self.data = data
self.data_source = data_source

def _argsort_by_padding(self, instances: List[Instance]) -> List[int]:
"""
@@ -159,7 +160,7 @@ def _argsort_by_padding(self, instances: List[Instance]) -> List[int]:

def __iter__(self) -> Iterable[List[int]]:

indices = self._argsort_by_padding(self.data)
indices = self._argsort_by_padding(self.data_source)
for group in lazy_groups_of(indices, self._batch_size):
yield list(group)

@@ -195,8 +196,8 @@ def __init__(
dataset: data.Dataset,
batch_size: int = 1,
shuffle: bool = False,
sampler: Sampler = None,
batch_sampler: BatchSampler = None,
sampler: Lazy[Sampler] = None,
Contributor:
Having this annotation here is not good. It makes it really hard for someone to just instantiate this themselves, because instead of just constructing the sampler and passing it to DataLoader(), they have to wrap it in a Lazy with some kind of lambda function. It's pretty obnoxious. See

This is only recommended for use when you have registered a `@classmethod` as the constructor
for your class, instead of using `__init__`. Having a `Lazy[]` type annotation on an argument
to an `__init__` method makes your class completely dependent on being constructed using the
`FromParams` pipeline, which is not a good idea.

Instead, keep this constructor, but remove the Lazy annotations and everything but the super().__init__() call. Then add a separate method, called from_partial_objects (or something else, if you can think of a better name for it), which is exactly this method, but with cls instead of self, and calling cls() instead of super().__init__(). You can put a docstring on that method that's something like this:

This method exists so that we can have a documented method to construct this class using
`FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
method.
The reason we can't just use `__init__` with `FromParams` here is because there are
sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.
If you're not using `FromParams`, you can just construct these arguments in the right order
yourself in your code and call the constructor directly.
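
For concreteness, a rough sketch of the shape being suggested, assuming this module's existing imports (`data`, `Lazy`, `Sampler`, `BatchSampler`, `allennlp_collocate`) are available; `from_partial_objects` is the provisional name floated above, and the registration plumbing is omitted:

```python
class DataLoader(data.DataLoader):
    def __init__(
        self,
        dataset: data.Dataset,
        batch_size: int = 1,
        shuffle: bool = False,
        sampler: Sampler = None,
        batch_sampler: BatchSampler = None,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        # Plain constructor: no Lazy[] annotations, so a sampler can be
        # built by hand and passed straight in without FromParams.
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=allennlp_collocate,
            pin_memory=pin_memory,
        )

    @classmethod
    def from_partial_objects(
        cls,
        dataset: data.Dataset,
        batch_size: int = 1,
        shuffle: bool = False,
        sampler: Lazy[Sampler] = None,
        batch_sampler: Lazy[BatchSampler] = None,
        num_workers: int = 0,
        pin_memory: bool = False,
    ) -> "DataLoader":
        # Exists purely so FromParams has a documented entry point: the
        # samplers need the dataset before they can be constructed, which is
        # why the Lazy[] annotations live here rather than on __init__.
        sampler_ = sampler.construct(dataset=dataset) if sampler is not None else None
        batch_sampler_ = (
            batch_sampler.construct(dataset=dataset) if batch_sampler is not None else None
        )
        return cls(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            sampler=sampler_,
            batch_sampler=batch_sampler_,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )
```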

Contributor Author:
I see what you mean - we should probably even make this class private, because you should never use it - you should just use the pytorch dataloader.

Contributor:
Making it private means changing type annotations in other code to a private object, which is also unfortunate. Probably better to just be really clear in the class docstring why this object exists (purely to get type annotations that will let us construct it using our FromParams pipeline), and that you can safely ignore it and use pytorch's DataLoader directly if you don't need the FromParams stuff.

batch_sampler: Lazy[BatchSampler] = None,
num_workers: int = 0,
collate_fn=None,
pin_memory: bool = False,
Expand All @@ -207,12 +208,21 @@ def __init__(
):

collate_fn = allennlp_collocate
if batch_sampler is not None:
batch_sampler_ = batch_sampler.construct(dataset=dataset)
else:
batch_sampler_ = None
if sampler is not None:
sampler_ = sampler.construct(dataset=dataset)
else:
sampler_ = None

super().__init__(
dataset=dataset,
batch_size=batch_size,
shuffle=shuffle,
sampler=sampler,
batch_sampler=batch_sampler,
sampler=sampler_,
batch_sampler=batch_sampler_,
num_workers=num_workers,
collate_fn=collate_fn,
pin_memory=pin_memory,
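
For illustration, direct (non-FromParams) usage of the pieces in this file might look roughly like the sketch below; `reader`, `vocab`, and the file path are hypothetical stand-ins, and the constructor arguments mirror the signatures shown in this diff:

```python
# Hypothetical direct usage, bypassing the FromParams pipeline entirely.
dataset = reader.read("path/to/train.jsonl")
dataset.index_with(vocab)  # let the dataset index instances with this vocab

batch_sampler = BatchInstanceSampler(
    data_source=dataset,
    batch_size=32,
    sorting_keys=[("tokens", "num_tokens")],
)

loader = data.DataLoader(
    dataset,
    batch_sampler=batch_sampler,
    collate_fn=allennlp_collocate,
)

for batch in loader:
    ...  # each batch is a padded tensor dict produced by allennlp_collocate
```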