Load different random subset of dataset on each epoch (IntelLabs#149)
* For CIFAR-10 / ImageNet only
* Refactor data_loaders.py, reduce code duplication
* Implemented custom sampler
* Integrated in image classification sample
* Since we now shuffle the test set, had to update expected results
  in 2 full_flow_tests that do evaluation
guyjacob committed Feb 10, 2019
1 parent bb12616 commit 4b1d0c8
Showing 16 changed files with 147 additions and 113 deletions.
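As a quick orientation, here is a hypothetical call into the new loader API, sketched from the load_data signature in this diff (the paths and size values are illustrative; apputils.load_data is the entry point used by compress_classifier.py further down):

    import apputils

    # Train on a fresh random 30% of the training split each epoch, and evaluate on a
    # fresh random 50% of the test set; the subsets are re-drawn every time the loaders
    # are iterated.
    train_loader, val_loader, test_loader, input_shape = apputils.load_data(
        'cifar10', './data.cifar10', batch_size=256, workers=4,
        validation_split=0.1, effective_train_size=0.3,
        effective_valid_size=1.0, effective_test_size=0.5)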
195 changes: 104 additions & 91 deletions apputils/data_loaders.py
@@ -23,47 +23,37 @@
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.sampler import Sampler
import numpy as np

DATASETS_NAMES = ['imagenet', 'cifar10']


def load_data(dataset, data_dir, batch_size, workers, valid_size=0.1, deterministic=False):
def load_data(dataset, data_dir, batch_size, workers, validation_split=0.1, deterministic=False,
effective_train_size=1., effective_valid_size=1., effective_test_size=1.):
"""Load a dataset.
Args:
dataset: a string with the name of the dataset to load (cifar10/imagenet)
        data_dir: the directory where the dataset resides
batch_size: the batch size
workers: the number of worker threads to use for loading the data
valid_size: portion of training dataset to set aside for validation
validation_split: portion of training dataset to set aside for validation
deterministic: set to True if you want the data loading process to be deterministic.
Note that deterministic data loading suffers from poor performance.
effective_train/valid/test_size: portion of the datasets to load on each epoch.
The subset is chosen randomly each time. For the training and validation sets, this is applied AFTER
the split to those sets according to the validation_split parameter
"""
assert dataset in DATASETS_NAMES
if dataset == 'cifar10':
return cifar10_load_data(data_dir, batch_size, workers, valid_size=valid_size, deterministic=deterministic)
if dataset == 'imagenet':
return imagenet_load_data(data_dir, batch_size, workers, valid_size=valid_size, deterministic=deterministic)
print("FATAL ERROR: load_data does not support dataset %s" % dataset)
exit(1)
if dataset not in DATASETS_NAMES:
        raise ValueError("load_data does not support dataset %s" % dataset)
datasets_fn = cifar10_get_datasets if dataset == 'cifar10' else imagenet_get_datasets
return get_data_loaders(datasets_fn, data_dir, batch_size, workers, validation_split=validation_split,
deterministic=deterministic, effective_train_size=effective_train_size,
effective_valid_size=effective_valid_size, effective_test_size=effective_test_size)
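
# Worked example of how the split parameters compose (illustrative numbers; CIFAR-10 has
# 50,000 training images): validation_split=0.1 sets aside the first 5,000 shuffled indices
# for validation and leaves 45,000 for training; effective_train_size=0.2 then samples
# int(np.floor(45000 * 0.2)) = 9,000 of those training images, re-drawn at every epoch.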


def __image_size(dataset):
    # unsqueeze is used here to add the missing batch dimension (size 1)
return dataset[0][0].unsqueeze(0).size()


def __deterministic_worker_init_fn(worker_id, seed=0):
import random
import numpy
random.seed(seed)
numpy.random.seed(seed)
torch.manual_seed(seed)


def cifar10_load_data(data_dir, batch_size, num_workers, valid_size=0.1, deterministic=False):
def cifar10_get_datasets(data_dir):
"""Load the CIFAR10 dataset.
The original training dataset is split into training and validation sets (code is
@@ -81,116 +71,139 @@ def cifar10_load_data(data_dir, batch_size, num_workers, valid_size=0.1, deterministic=False):
[1] C.-Y. Lee, S. Xie, P. Gallagher, Z. Zhang, and Z. Tu. Deeply Supervised Nets.
arXiv:1409.5185, 2014
"""
transform = transforms.Compose([
train_transform = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
train_dataset = datasets.CIFAR10(root=data_dir, train=True,
download=True, transform=train_transform)

test_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root=data_dir, train=True,
download=True, transform=transform)
test_dataset = datasets.CIFAR10(root=data_dir, train=False,
download=True, transform=test_transform)

num_train = len(train_dataset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))
return train_dataset, test_dataset

np.random.shuffle(indices)

train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
def imagenet_get_datasets(data_dir):
"""
Load the ImageNet dataset.
"""
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])

worker_init_fn = __deterministic_worker_init_fn if deterministic else None
train_transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
])

train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size, sampler=train_sampler,
num_workers=num_workers, pin_memory=True,
worker_init_fn=worker_init_fn)
train_dataset = datasets.ImageFolder(train_dir, train_transform)

valid_loader = None
if split > 0:
valid_sampler = SubsetRandomSampler(valid_idx)
valid_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size, sampler=valid_sampler,
num_workers=num_workers, pin_memory=True,
worker_init_fn=worker_init_fn)
test_transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])

testset = datasets.CIFAR10(root=data_dir, train=False,
download=True, transform=transform_test)
test_dataset = datasets.ImageFolder(test_dir, test_transform)

test_loader = torch.utils.data.DataLoader(
testset, batch_size=batch_size, shuffle=False,
num_workers=num_workers, pin_memory=True)
return train_dataset, test_dataset

input_shape = __image_size(train_dataset)

# If validation split was 0 we use the test set as the validation set
return train_loader, valid_loader or test_loader, test_loader, input_shape
def __image_size(dataset):
    # unsqueeze is used here to add the missing batch dimension (size 1)
return dataset[0][0].unsqueeze(0).size()


def __deterministic_worker_init_fn(worker_id, seed=0):
import random
import numpy
random.seed(seed)
numpy.random.seed(seed)
torch.manual_seed(seed)


def __split_list(l, ratio):
split_idx = int(np.floor(ratio * len(l)))
return l[:split_idx], l[split_idx:]
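    # For example (illustrative): __split_list(list(range(10)), 0.3) returns
    # ([0, 1, 2], [3, 4, 5, 6, 7, 8, 9]).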


def imagenet_load_data(data_dir, batch_size, num_workers, valid_size=0.1, deterministic=False):
"""Load the ImageNet dataset.
class SwitchingSubsetRandomSampler(Sampler):
"""Samples a random subset of elements from a data source, without replacement.
Somewhat unconventionally, we use the ImageNet validation dataset as our test dataset,
and split the training dataset for training and validation (90/10 by default).
The subset of elements is re-chosen randomly each time the sampler is enumerated
Args:
data_source (Dataset): dataset to sample from
        subset_size (float): value in (0..1], representing the portion of the dataset to sample at each enumeration.
"""
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
def __init__(self, data_source, subset_size):
if subset_size <= 0 or subset_size > 1:
raise ValueError('subset_size must be in (0..1]')
self.data_source = data_source
self.subset_length = int(np.floor(len(self.data_source) * subset_size))

def __iter__(self):
# Randomizing in the same way as in torch.utils.data.sampler.SubsetRandomSampler to maintain
# reproducibility with the previous data loaders implementation
indices = torch.randperm(len(self.data_source))
subset_indices = indices[:self.subset_length]
return (self.data_source[i] for i in subset_indices)

def __len__(self):
return self.subset_length
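
# Usage sketch for the sampler (illustrative, not part of this commit): given a list of
# dataset indices, it yields a freshly-drawn random quarter of them on every enumeration.
#
#   indices = list(range(len(train_dataset)))
#   sampler = SwitchingSubsetRandomSampler(indices, subset_size=0.25)
#   loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, sampler=sampler)
#   for epoch in range(num_epochs):
#       for inputs, targets in loader:  # ~25% of the data, a different subset each epoch
#           ...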


train_dataset = datasets.ImageFolder(
train_dir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
def get_data_loaders(datasets_fn, data_dir, batch_size, num_workers, validation_split=0.1, deterministic=False,
effective_train_size=1., effective_valid_size=1., effective_test_size=1.):
train_dataset, test_dataset = datasets_fn(data_dir)

worker_init_fn = __deterministic_worker_init_fn if deterministic else None

num_train = len(train_dataset)
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))

# Note! We must shuffle the imagenet data because the files are ordered
# by class. If we don't shuffle, the train and validation datasets will
    # be mutually-exclusive
np.random.shuffle(indices)
    # TODO: Switch to torch.utils.data.random_split()

train_idx, valid_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)

input_shape = __image_size(train_dataset)
    # We shuffle indices here in case the data is arranged by class, in which case we'd get mutually
    # exclusive datasets if we didn't shuffle
np.random.shuffle(indices)

worker_init_fn = __deterministic_worker_init_fn if deterministic else None
valid_indices, train_indices = __split_list(indices, validation_split)

train_sampler = SwitchingSubsetRandomSampler(train_indices, effective_train_size)
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size, sampler=train_sampler,
num_workers=num_workers, pin_memory=True,
worker_init_fn=worker_init_fn)

valid_loader = None
if split > 0:
valid_sampler = SubsetRandomSampler(valid_idx)
if valid_indices:
valid_sampler = SwitchingSubsetRandomSampler(valid_indices, effective_valid_size)
valid_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size, sampler=valid_sampler,
num_workers=num_workers, pin_memory=True,
worker_init_fn=worker_init_fn)

test_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(test_dir, transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])),
batch_size=batch_size, shuffle=False,
num_workers=num_workers, pin_memory=True)
test_indices = list(range(len(test_dataset)))
test_sampler = SwitchingSubsetRandomSampler(test_indices, effective_test_size)
test_loader = torch.utils.data.DataLoader(test_dataset,
batch_size=batch_size, sampler=test_sampler,
num_workers=num_workers, pin_memory=True)

input_shape = __image_size(train_dataset)

# If validation split was 0 we use the test set as the validation set
return train_loader, valid_loader or test_loader, test_loader, input_shape
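
# Hypothetical direct call (compress_classifier.py goes through apputils.load_data instead):
# with validation_split=0, the second returned loader falls back to the test loader.
#
#   train_loader, val_loader, test_loader, input_shape = get_data_loaders(
#       cifar10_get_datasets, './data.cifar10', batch_size=256, num_workers=4,
#       validation_split=0, effective_train_size=0.5)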
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet20_filters.schedule_agp.yaml
@@ -14,7 +14,7 @@
# Total sparsity: 41.10
# # of parameters: 120,000 (=55.7% of the baseline parameters)
#
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.3 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-size=0
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.3 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-split=0
#
# Parameters:
# +----+-------------------------------------+----------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet20_filters.schedule_agp_3.yaml
@@ -14,7 +14,7 @@
# Total sparsity: 56.41%
# # of parameters: 95922 (=35.4% of the baseline parameters ==> 64.6% sparsity)
#
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.4 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp_3.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-size=0
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.4 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp_3.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-split=0
#
# Parameters:
# +----+-------------------------------------+----------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet20_filters.schedule_agp_4.yaml
@@ -14,7 +14,7 @@
# Total sparsity: 39.66
# # of parameters: 78,776 (=29.1% of the baseline parameters)
#
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.3 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp_4.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-size=0
# time python3 compress_classifier.py --arch resnet20_cifar ../../../data.cifar10 -p=50 --lr=0.3 --epochs=180 --compress=../agp-pruning/resnet20_filters.schedule_agp_4.yaml -j=1 --deterministic --resume=../ssl/checkpoints/checkpoint_trained_dense.pth.tar --validation-split=0
#
# Parameters:
# +----+-------------------------------------+----------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
examples/agp-pruning/resnet50.schedule_agp.1x1x8-blocks.yaml
@@ -4,7 +4,7 @@
#
# Best Top1: 76.358 (epoch 72) vs. 76.15 baseline (+0.2%)
#
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=../agp-pruning/resnet50.schedule_agp.1x1x8-blocks.yaml --validation-size=0 --num-best-scores=10
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=../agp-pruning/resnet50.schedule_agp.1x1x8-blocks.yaml --validation-split=0 --num-best-scores=10
#
# Parameters:
# +----+-------------------------------------+--------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet50.schedule_agp.filters.yaml
@@ -5,7 +5,7 @@
# No. of Parameters: 12,335,296 (of 25,502,912) = 43.37% dense (56.63% sparse)
# Total MACs: 1,822,031,872 (of 4,089,184,256) = 44.56% compute = 2.24x
#
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters.yaml --validation-size=0 --num-best-scores=10 --name="resnet50_filters_v3.2"
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters.yaml --validation-split=0 --num-best-scores=10 --name="resnet50_filters_v3.2"
#
# Parameters:
# +----+-------------------------------------+--------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet50.schedule_agp.filters_2.yaml
@@ -5,7 +5,7 @@
# No. of Parameters: 12,671,168 (of 25,502,912) = 49.69% dense (50.31% sparse)
# Total MACs: 2,037,186,560 (of 4,089,184,256) = 49.82% compute = 2.01x
#
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters_2.yaml --validation-size=0 --num-best-scores=10
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters_2.yaml --validation-split=0 --num-best-scores=10
#
# Parameters:
# +----+-------------------------------------+--------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
2 changes: 1 addition & 1 deletion examples/agp-pruning/resnet50.schedule_agp.filters_3.yaml
@@ -5,7 +5,7 @@
# No. of Parameters: 17,329,344 (of 25,502,912) = 67.95% dense (32.05% sparse)
# Total MACs: 2,753,298,432 (of 4,089,184,256) = 67.33% compute = 1.49x
#
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters_3.yaml --validation-size=0 --num-best-scores=10
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ../../../data.imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters_3.yaml --validation-split=0 --num-best-scores=10
#
# Parameters:
# +----+-------------------------------------+--------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
@@ -6,7 +6,7 @@
# Best Top1: 74.564 (epoch 84) vs. 76.15 baseline (-1.6%)
# No. of Parameters: 10,901,696 (of 25,502,912) = 42.74% dense (57.26% sparse)
# Total MACs: 1,822,031,872 (of 4,089,184,256) = 44.56% compute = 2.24x
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ~/datasets/imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters.yaml --validation-size=0 --num-best-scores=10
# time python3 compress_classifier.py -a=resnet50 --pretrained -p=50 ~/datasets/imagenet/ -j=22 --epochs=100 --lr=0.0005 --compress=resnet50.schedule_agp.filters.yaml --validation-split=0 --num-best-scores=10
#
# Parameters:
# +----+-------------------------------------+--------------------+---------------+----------------+------------+------------+----------+----------+----------+------------+---------+----------+------------+
6 changes: 4 additions & 2 deletions examples/classifier_compression/compress_classifier.py
@@ -194,7 +194,8 @@ def main():
# substring "_cifar", then cifar10 is used.
train_loader, val_loader, test_loader, _ = apputils.load_data(
args.dataset, os.path.expanduser(args.data), args.batch_size,
args.workers, args.validation_size, args.deterministic)
args.workers, args.validation_split, args.deterministic,
args.effective_train_size, args.effective_valid_size, args.effective_test_size)
msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))
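# Note: len(<loader>.sampler) reports the sampler's effective subset length (see
# SwitchingSubsetRandomSampler.__len__ above), so the sizes logged here reflect the
# effective_*_size settings.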

@@ -644,7 +645,8 @@ def automated_deep_compression(model, criterion, optimizer, loggers, args):

train_loader, val_loader, test_loader, _ = apputils.load_data(
args.dataset, os.path.expanduser(args.data), args.batch_size,
args.workers, args.validation_size, args.deterministic)
args.workers, args.validation_split, args.deterministic,
args.effective_train_size, args.effective_valid_size, args.effective_test_size)

args.display_confusion = True
validate_fn = partial(validate, val_loader=test_loader, criterion=criterion,