This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Data V2 #3700

Merged: 59 commits, Feb 26, 2020

Changes from 1 commit

Commits (59)
0c42cb9
example for feedback
DeNeutoy Jan 30, 2020
5ffedfc
Merge branch 'master' into data-v2
DeNeutoy Feb 19, 2020
80049f8
remove all existing multiprocessing
DeNeutoy Feb 19, 2020
6f58c2a
sneak torch datasets inside DatasetReader
DeNeutoy Feb 19, 2020
1b3ad9a
lint
DeNeutoy Feb 19, 2020
effc445
trainer_v2, We Love To See It
DeNeutoy Feb 19, 2020
9d44ad6
datasets have index_with now, not iterators
DeNeutoy Feb 19, 2020
7e89ea6
use iter, custom collate function in allennlp wrapper
DeNeutoy Feb 19, 2020
883b6d7
we don't even need the data in the trainer anymore
DeNeutoy Feb 19, 2020
56d022a
all trainer tests passing
DeNeutoy Feb 20, 2020
01e12f5
black
DeNeutoy Feb 20, 2020
5aea291
make find learning rate work
DeNeutoy Feb 20, 2020
f026946
update test fixtures to new config
DeNeutoy Feb 20, 2020
5973b50
get train command tests mostly working
DeNeutoy Feb 20, 2020
a23f47a
lazily construct samplers, index lazy datasets
DeNeutoy Feb 20, 2020
a76ea0a
Merge branch 'master' into data-v2
DeNeutoy Feb 20, 2020
ebf3854
update some fixtures
DeNeutoy Feb 20, 2020
57a67e5
evaluate tests passing
DeNeutoy Feb 20, 2020
7d21ed8
all command tests passing
DeNeutoy Feb 20, 2020
24a500c
lint
DeNeutoy Feb 20, 2020
fb13769
update model test case, common and module tests passing
DeNeutoy Feb 20, 2020
ef5187f
fix test interdependence introduced by #3762
DeNeutoy Feb 21, 2020
b1ea845
more test interdependence
DeNeutoy Feb 21, 2020
0231616
tests tests tests
DeNeutoy Feb 21, 2020
01d76bb
remove unnecessary brackets
DeNeutoy Feb 21, 2020
12b6efb
Merge branch 'master' into data-v2
DeNeutoy Feb 21, 2020
859d3ca
update a chunk of the configs
DeNeutoy Feb 21, 2020
c22dee3
fix archival test, couple more configs
DeNeutoy Feb 21, 2020
fe5b470
rm pointless gan test
DeNeutoy Feb 21, 2020
7533c91
more tests passing
DeNeutoy Feb 21, 2020
ad45659
add current state of from params changes
DeNeutoy Feb 21, 2020
f944840
Revert "add current state of from params changes"
DeNeutoy Feb 21, 2020
3b12a2f
Merge branch 'master' into data-v2
DeNeutoy Feb 21, 2020
be1f58c
updated understanding of Lazy
DeNeutoy Feb 21, 2020
ebdabe0
add discussion of None comparison to Lazy
DeNeutoy Feb 21, 2020
8693739
lint
DeNeutoy Feb 21, 2020
b9b0650
it's a hard doc life
DeNeutoy Feb 21, 2020
88314c7
pull samplers into separate file
DeNeutoy Feb 21, 2020
14296a1
more docs updates
DeNeutoy Feb 22, 2020
8a08899
fold in #3812
DeNeutoy Feb 22, 2020
3520280
remove torch dataset
DeNeutoy Feb 22, 2020
0f1d8a4
add example to lazy
DeNeutoy Feb 22, 2020
93e1e89
rename to collate
DeNeutoy Feb 22, 2020
40dd695
no kwargs
DeNeutoy Feb 23, 2020
da3b1b4
Revert "fold in #3812"
DeNeutoy Feb 23, 2020
801a8f5
don't break up dataset
DeNeutoy Feb 23, 2020
007fd0c
add comment to iterable dataset len
DeNeutoy Feb 23, 2020
d00e1a9
Merge branch 'master' into data-v2
DeNeutoy Feb 23, 2020
c066804
improve docstrings, build dataloader using partial_objects
DeNeutoy Feb 23, 2020
61c7b14
flake
DeNeutoy Feb 23, 2020
2b56b14
give dataloader a default implementation
DeNeutoy Feb 24, 2020
354010a
safer default for DataLoader init
DeNeutoy Feb 24, 2020
568291d
more coherent dir structure
DeNeutoy Feb 24, 2020
a016103
update imports
DeNeutoy Feb 24, 2020
47db16a
Merge branch 'master' into data-v2
DeNeutoy Feb 24, 2020
04fdb70
add a test for the BucketBatchSampler
DeNeutoy Feb 24, 2020
d1d5c4a
split bucket sampler into own file, tests
DeNeutoy Feb 24, 2020
5f0c8db
PR comments
DeNeutoy Feb 26, 2020
6f63a53
Merge branch 'master' into data-v2
DeNeutoy Feb 26, 2020
split bucket sampler into own file, tests
DeNeutoy committed Feb 24, 2020
commit d1d5c4ace8dd85b4489d621e8e112a345fb723bf
2 changes: 1 addition & 1 deletion allennlp/data/samplers/__init__.py
@@ -6,5 +6,5 @@
WeightedRandomSampler,
RandomSampler,
BasicBatchSampler,
BucketBatchSampler,
)
from allennlp.data.samplers.bucket_batch_sampler import BucketBatchSampler
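Since `__init__.py` now re-exports the class from its new module, the package-level import path keeps working after the split. A small sketch of what that means for downstream code (the `_SameClass` alias is purely illustrative):

# The package-level name and the new module-level name refer to the same class.
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.samplers.bucket_batch_sampler import BucketBatchSampler as _SameClass

assert BucketBatchSampler is _SameClass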
146 changes: 146 additions & 0 deletions allennlp/data/samplers/bucket_batch_sampler.py
@@ -0,0 +1,146 @@
from typing import List, Iterable, Tuple, Dict, cast
import logging
from torch.utils import data

from allennlp.common.util import add_noise_to_dict_values, lazy_groups_of
from allennlp.data.instance import Instance
from allennlp.data.samplers import BatchSampler

logger = logging.getLogger(__name__)


@BatchSampler.register("bucket")
class BucketBatchSampler(BatchSampler):
"""
A sampler which, by default, argsorts batches with respect to the maximum input lengths `per
batch`. You can provide a list of field names and padding keys (or pass none, in which case they
will be inferred) which the dataset will be sorted by before batching, so that inputs with
similar lengths are batched together, making computation more efficient (less time is wasted on
padded elements of the batch).

# Parameters

data_source: `data.Dataset`, required,
The pytorch `Dataset` of allennlp Instances to bucket.
sorting_keys : List[Tuple[str, str]], optional
To bucket inputs into batches, we want to group the instances by padding length, so that we
minimize the amount of padding necessary per batch. In order to do this, we need to know
which fields need what type of padding, and in what order.

Specifying the right keys for this is a bit cryptic, so if this is not given we try to
auto-detect the right keys by iterating once through the data up front, reading all of the
[Review comment, Contributor]: "auto-detect the right keys by iterating through a few instances up front" ?

padding keys and seeing which one has the longest length. We use that one for padding.
This should give reasonable results in most cases.
[Review comment, Contributor]: Is it worth giving some example cases where this isn't a reasonable default? "Some cases where it might not be the right thing to do are when you have a ListField[TextField], or when you have a really long, constant length ArrayField."

[Review comment, Contributor Author]: I can add that in if you say it's true, but I haven't thought about this deeply 😄


When you need to specify this yourself, you can create an instance from your dataset and
call `Instance.get_padding_lengths()` to see a list of all keys used in your data. You
should give one or more of those as the sorting keys here.
batch_size : int, required.
[Review comment, Contributor]: Move this up one, so it's in order?

The size of each batch of instances yielded when calling the dataloader.
padding_noise : float, optional (default=.1)
When sorting by padding length, we add a bit of noise to the lengths, so that the sorting
isn't deterministic. This parameter determines how much noise we add, as a percentage of
the actual padding value for each instance.
drop_last : `bool`
[Review comment, Contributor]: Give the default here.

If `True`, the sampler will drop the last batch if
its size would be less than `batch_size`.
"""

def __init__(
self,
data_source: data.Dataset,
batch_size: int,
sorting_keys: List[Tuple[str, str]] = None,
padding_noise: float = 0.1,
drop_last: bool = False,
):

self.vocab = data_source.vocab
self.sorting_keys = sorting_keys
self.padding_noise = padding_noise
self.batch_size = batch_size
self.data_source = data_source
self.drop_last = drop_last

def _argsort_by_padding(self, instances: Iterable[Instance]) -> List[int]:
"""
Argsorts the instances by their padding lengths, using the keys in
`sorting_keys` (in the order in which they are provided). `sorting_keys`
is a list of `(field_name, padding_key)` tuples.
"""
if not self.sorting_keys:
logger.info("No sorting keys given; trying to guess a good one")
self._guess_sorting_keys(instances)
logger.info(f"Using {self.sorting_keys} as the sorting keys")
instances_with_lengths = []
for instance in instances:
# Make sure instance is indexed before calling .get_padding
instance.index_fields(self.vocab)
padding_lengths = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
if self.padding_noise > 0.0:
noisy_lengths = {}
for field_name, field_lengths in padding_lengths.items():
noisy_lengths[field_name] = add_noise_to_dict_values(
field_lengths, self.padding_noise
)
padding_lengths = noisy_lengths
instance_with_lengths = (
[
padding_lengths[field_name][padding_key]
for (field_name, padding_key) in self.sorting_keys
],
instance,
)
instances_with_lengths.append(instance_with_lengths)
with_indices = [(x, i) for i, x in enumerate(instances_with_lengths)]
with_indices.sort(key=lambda x: x[0][0])
return [instance_with_index[-1] for instance_with_index in with_indices]

def __iter__(self) -> Iterable[List[int]]:

indices = self._argsort_by_padding(self.data_source)
for group in lazy_groups_of(indices, self.batch_size):
batch_indices = list(group)
if self.drop_last and len(batch_indices) < self.batch_size:
continue
yield batch_indices

def _guess_sorting_keys(self, instances: Iterable[Instance], num_instances: int = 10) -> None:
"""
Use `num_instances` instances from the dataset to infer the keys used
for sorting the dataset for bucketing.

# Parameters

instances : `Iterable[Instance]`, required.
The dataset to guess sorting keys for.
num_instances : `int`, optional (default = 10)
The number of instances to use to guess sorting keys. Typically
the default value is completely sufficient, but if your instances
are not homogeneous, you might need more.
"""
max_length = 0.0
longest_padding_key: Tuple[str, str] = None
for i, instance in enumerate(instances):
instance.index_fields(self.vocab)
padding_lengths = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
for field_name, field_padding in padding_lengths.items():
for padding_key, length in field_padding.items():
if length > max_length:
max_length = length
longest_padding_key = (field_name, padding_key)
if i > num_instances:
# Only use num_instances instances to guess the sorting keys.
break

if not longest_padding_key:
# This shouldn't ever happen (you basically have to have an empty instance list), but
# just in case...
raise AssertionError(
"Found no field that needed padding; we are surprised you got this error, please "
"open an issue on github"
)
self.sorting_keys = [longest_padding_key]

def __len__(self):
return len(self.data_source) // self.batch_size
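To make the behaviour above concrete, here is a minimal, hypothetical usage sketch. The `_ToyDataset` class is illustrative only: it simply exposes the `vocab` attribute and the indexing that `BucketBatchSampler` relies on, and it is not an AllenNLP class; the actual dataset/reader wiring in this PR may differ.

from typing import List

from torch.utils import data

from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary


class _ToyDataset(data.Dataset):
    # Bare-bones dataset exposing the `vocab` attribute the sampler reads.
    def __init__(self, instances: List[Instance], vocab: Vocabulary):
        self.instances = instances
        self.vocab = vocab

    def __getitem__(self, index: int) -> Instance:
        return self.instances[index]

    def __len__(self) -> int:
        return len(self.instances)


indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"text": TextField([Token(w) for w in sentence.split()], indexers)})
    for sentence in ["a b", "a b c d e", "a", "a b c", "a b c d"]
]
vocab = Vocabulary.from_instances(instances)
dataset = _ToyDataset(instances, vocab)

# No sorting_keys are given, so the sampler guesses them from the first few instances.
# padding_noise=0.0 makes the grouping deterministic for this toy example.
sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0.0)
for batch_indices in sampler:
    print(batch_indices)  # indices of similar-length instances: [2, 0], [3, 4], [1]

In practice, a batch sampler like this is typically passed to a `torch.utils.data.DataLoader` via its `batch_sampler` argument, together with a collate function that converts the grouped `Instance`s into tensors.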
143 changes: 1 addition & 142 deletions allennlp/data/samplers/samplers.py
@@ -1,14 +1,8 @@
from typing import List, Iterable, Tuple, Dict, cast
import logging
from typing import List, Iterable
[Review comment, Contributor]: Might be worth somewhere in here saying that you can just use the pytorch classes directly without issue if you aren't using FromParams.

from torch.utils import data

from allennlp.common.registrable import Registrable

from allennlp.common.util import add_noise_to_dict_values, lazy_groups_of
from allennlp.data.instance import Instance

logger = logging.getLogger(__name__)


class Sampler(Registrable):
"""
@@ -141,138 +135,3 @@ class BasicBatchSampler(BatchSampler, data.BatchSampler):

def __init__(self, sampler: Sampler, batch_size: int, drop_last: bool):
super().__init__(sampler, batch_size, drop_last)


[The remainder of this hunk deletes the BucketBatchSampler class from samplers.py; the deleted code is identical to the class added in allennlp/data/samplers/bucket_batch_sampler.py above, except that the new copy also documents the `data_source` parameter.]
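The review comment above, about using the pytorch classes directly, is easy to illustrate. A minimal sketch, assuming you are not building objects from configuration files via `FromParams`: the plain `torch.utils.data` samplers can be used as-is, since the `Registrable` wrappers in this file exist mainly so the same objects can be referenced by name in config files.

from torch.utils import data

# Any sized sequence works as a data source for the plain pytorch samplers.
dataset = ["first", "second", "third", "fourth", "fifth"]

batch_sampler = data.BatchSampler(
    data.SequentialSampler(dataset), batch_size=2, drop_last=False
)
print(list(batch_sampler))  # [[0, 1], [2, 3], [4]]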