import numbers import numpy as np import theano.tensor as T import warnings from theano.tensor.nnet import crossentropy_categorical_1hot from lasagne.layers import InputLayer, DropoutLayer, EmbeddingLayer, NonlinearityLayer, NINLayer from lasagne.layers import ConcatLayer, ReshapeLayer, DenseLayer, get_output, dimshuffle from lasagne.layers.recurrent import Gate from lasagne.init import Constant from lasagne.nonlinearities import softmax from lasagne.objectives import categorical_crossentropy from lasagne.updates import rmsprop from stanza.monitoring import progress from stanza.research import config, iterators, instance from stanza.research.rng import get_rng import color_instances from neural import NeuralLearner, SimpleLasagneModel from neural import NONLINEARITIES, OPTIMIZERS, CELLS, sample from vectorizers import SequenceVectorizer, SymbolVectorizer, strip_invalid_tokens, COLOR_REPRS from vectorizers import BucketsVectorizer parser = config.get_options_parser() parser.add_argument('--speaker_cell_size', type=int, default=20, help='The number of dimensions of all hidden layers and cells in ' 'the speaker model. If 0 and using the AtomicSpeakerLearner, ' 'remove all hidden layers and only train a linear classifier.') parser.add_argument('--speaker_forget_bias', type=float, default=5.0, help='The initial value of the forget gate bias in LSTM cells in ' 'the speaker model. A positive initial forget gate bias ' 'encourages the model to remember everything by default. ' 'If speaker_cell is not LSTM, this value is ignored.') parser.add_argument('--speaker_nonlinearity', choices=NONLINEARITIES.keys(), default='tanh', help='The nonlinearity/activation function to use for dense and ' 'recurrent layers in the speaker model.') parser.add_argument('--speaker_cell', choices=CELLS.keys(), default='LSTM', help='The recurrent cell to use for the speaker model.') parser.add_argument('--speaker_dropout', type=float, default=0.2, help='The dropout rate (probability of setting a value to zero). ' 'Dropout will be disabled if nonpositive.') parser.add_argument('--speaker_color_resolution', type=int, nargs='+', default=[4], help='The number of buckets along each dimension of color space ' 'for the input of the speaker model.') parser.add_argument('--speaker_no_mask', type=config.boolean, default=False, help='If `True`, disable masking of sequence inputs in training.') parser.add_argument('--speaker_hidden_color_layers', type=int, default=0, help='The number of dense layers after the color representation.') parser.add_argument('--speaker_recurrent_layers', type=int, default=2, help='The number of recurrent layers to pass the input through.') parser.add_argument('--speaker_hidden_out_layers', type=int, default=0, help='The number of dense layers to pass activations through ' 'before the output.') parser.add_argument('--speaker_hsv', type=config.boolean, default=False, help='If `True`, input color buckets are in HSV space; otherwise, ' 'color buckets will be in RGB. Input instances should be in HSV ' 'regardless; this sets the internal representation for training ' 'and prediction.') parser.add_argument('--speaker_eval_batch_size', type=int, default=16384, help='The number of examples per batch for evaluating the speaker ' 'model. Higher means faster but more memory usage. This should ' 'not affect modeling accuracy.') parser.add_argument('--speaker_beam_size', type=int, default=1, help='The number of choices to keep in memory at each time step ' 'during prediction. Only used for recurrent speakers.') parser.add_argument('--speaker_optimizer', choices=OPTIMIZERS.keys(), default='rmsprop', help='The optimization (update) algorithm to use for speaker training.') parser.add_argument('--speaker_learning_rate', type=float, default=0.1, help='The learning rate to use for speaker training.') parser.add_argument('--speaker_grad_clipping', type=float, default=0.0, help='The maximum absolute value of the gradient messages for the' 'cell component of the speaker model.') parser.add_argument('--speaker_color_repr', choices=COLOR_REPRS.keys(), default='buckets', help='The representation of the color to use in the speaker model: a regular ' 'grid of `buckets` or the `raw` RGB/HSV values.') rng = get_rng() class UniformPrior(object): '''A uniform color prior in RGB space.''' def __init__(self, recurrent=False): self.sampler = BucketsVectorizer([1], hsv=False) self.recurrent = recurrent def train(self, training_instances, listener_data='ignored'): pass def apply(self, input_vars): c = input_vars[0] if self.recurrent: if c.ndim == 2: ones = T.ones_like(c[:, 0]) elif c.ndim == 3: ones = T.ones_like(c[:, 0, 0]) else: assert False, 'need handling for higher rank color vectors (recurrent): %d' % c.ndim else: if c.ndim == 1: ones = T.ones_like(c) elif c.ndim == 2: ones = T.ones_like(c[:, 0]) else: assert False, 'need handling for higher rank color vectors (atomic): %d' % c.ndim return -3.0 * np.log(256.0) * ones def sample(self, num_samples): ''' :return: a list of `num_samples` colors sampled uniformly in RGB space, but expressed as HSV triples. ''' colors = self.sampler.unvectorize_all(np.zeros(num_samples, dtype=np.int32), random=True, hsv=True) return [instance.Instance(c) for c in colors] class UniformContextPrior(UniformPrior): def __init__(self, recurrent=False): super(UniformContextPrior, self).__init__(recurrent=recurrent) def apply(self, input_vars): options = config.options() context_len = options.num_distractors return (super(UniformContextPrior, self).apply(input_vars) - 3.0 * np.log(256.0) * context_len) def sample(self, num_samples=1): colors = super(UniformContextPrior, self).sample(num_samples) insts = [instance.Instance(c.input) for c in colors] return color_instances.reference_game(insts, color_instances.uniform, listener=False) PRIORS = { 'Uniform': UniformPrior, 'UniformContext': UniformContextPrior, } parser.add_argument('--speaker_prior', choices=PRIORS.keys(), default='Uniform', help='The prior model for the speaker (prior over colors). ' 'Only used in RSA learner.') class SpeakerLearner(NeuralLearner): ''' An speaker with a feedforward neural net color input passed into an RNN to generate a description. ''' def __init__(self, id=None, context_len=1): super(SpeakerLearner, self).__init__(id=id) self.seq_vec = SequenceVectorizer() color_repr = COLOR_REPRS[self.options.speaker_color_repr] self.color_vec = color_repr(self.options.speaker_color_resolution, hsv=self.options.speaker_hsv) self.context_len = context_len def predict(self, eval_instances, random=False, verbosity=0): result = [] batches = iterators.iter_batches(eval_instances, self.options.speaker_eval_batch_size) num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1 eos_index = self.seq_vec.vectorize([''])[0] if self.options.verbosity + verbosity >= 2: print('Predicting') if self.options.verbosity + verbosity >= 1: progress.start_task('Predict batch', num_batches) for batch_num, batch in enumerate(batches): if self.options.verbosity + verbosity >= 1: progress.progress(batch_num) batch = list(batch) (c, _p, mask), (_y,) = self._data_to_arrays(batch, test=True) assert mask.all() # We shouldn't be masking anything in prediction beam_size = 1 if random else self.options.speaker_beam_size done = np.zeros((len(batch), beam_size), dtype=np.bool) beam = np.zeros((len(batch), beam_size, self.seq_vec.max_len), dtype=np.int32) beam[:, :, 0] = self.seq_vec.vectorize([''])[0] beam_scores = np.log(np.zeros((len(batch), beam_size))) beam_scores[:, 0] = 0.0 c = np.repeat(c, beam_size, axis=0) mask = np.repeat(mask, beam_size, axis=0) for length in range(1, self.seq_vec.max_len): if done.all(): break p = beam.reshape((beam.shape[0] * beam.shape[1], beam.shape[2]))[:, :-1] probs = self.model.predict([c, p, mask]) if random: indices = sample(probs[:, length - 1, :]) beam[:, 0, length] = indices done = np.logical_or(done, indices == eos_index) else: assert probs.shape[1] == p.shape[1], (probs.shape[1], p.shape[1]) assert probs.shape[2] == len(self.seq_vec.tokens), (probs.shape[2], len(self.seq_vec.tokens)) scores = np.log(probs)[:, length - 1, :].reshape((beam.shape[0], beam.shape[1], probs.shape[2])) beam_search_step(scores, length, beam, beam_scores, done, eos_index) outputs = self.seq_vec.unvectorize_all(beam[:, 0, :]) result.extend([' '.join(strip_invalid_tokens(o)) for o in outputs]) if self.options.verbosity + verbosity >= 1: progress.end_task() return result def score(self, eval_instances, verbosity=0): result = [] batches = iterators.iter_batches(eval_instances, self.options.speaker_eval_batch_size) num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1 if self.options.verbosity + verbosity >= 2: print('Scoring') if self.options.verbosity + verbosity >= 1: progress.start_task('Score batch', num_batches) for batch_num, batch in enumerate(batches): if self.options.verbosity + verbosity >= 1: progress.progress(batch_num) batch = list(batch) xs, (n,) = self._data_to_arrays(batch, test=False) _, _, mask = xs probs = self.model.predict(xs) token_probs = probs[np.arange(probs.shape[0])[:, np.newaxis], np.arange(probs.shape[1]), n] scores_arr = np.sum(np.log(token_probs) * mask, axis=1) scores = scores_arr.tolist() result.extend(scores) if self.options.verbosity + verbosity >= 1: progress.end_task() return result def _data_to_arrays(self, training_instances, init_vectorizer=False, test=False, inverted=False): context_len = self.context_len if hasattr(self, 'context_len') else 1 use_context = context_len > 1 def get_multi(val): if isinstance(val, tuple): assert len(val) == 1 return val[0] else: return val get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output) get_color, get_desc_simple = (get_o, get_i) if inverted else (get_i, get_o) get_desc = lambda inst: get_multi(get_desc_simple(inst)) get_i_ind, get_o_ind = ((lambda inst: inst.alt_inputs[get_multi(inst.input)]), (lambda inst: inst.alt_outputs[get_multi(inst.output)])) get_color_indexed = get_o_ind if inverted else get_i_ind get_alt_i, get_alt_o = (lambda inst: inst.alt_inputs), (lambda inst: inst.alt_outputs) get_alt_colors = get_alt_o if inverted else get_alt_i if init_vectorizer: self.seq_vec.add_all([''] + get_desc(inst).split() + [''] for inst in training_instances) colors = [] previous = [] next_tokens = [] if self.options.verbosity >= 9: print('%s _data_to_arrays:' % self.id) for i, inst in enumerate(training_instances): desc, color = get_desc(inst), get_color(inst) if isinstance(color, numbers.Number): color = get_color_indexed(inst) if test: full = [''] + [''] * (self.seq_vec.max_len - 1) else: desc = desc.split() full = ([''] + desc + [''] + [''] * (self.seq_vec.max_len - 1 - len(desc))) prev = full[:-1] next = full[1:] if self.options.verbosity >= 9: print('%s, %s -> %s' % (repr(color), repr(prev), repr(next))) colors.append(color) if use_context: new_context = get_alt_colors(inst) index = get_color(inst) if isinstance(index, tuple): assert len(index) == 1 index = index[0] assert len(new_context) == context_len, \ 'Inconsistent context lengths: %s' % ((context_len, len(new_context)),) colors.extend([c for j, c in enumerate(new_context) if j != index]) previous.append(prev) next_tokens.append(next) P = np.zeros((len(previous), self.seq_vec.max_len - 1), dtype=np.int32) mask = np.zeros((len(previous), self.seq_vec.max_len - 1), dtype=np.int32) N = np.zeros((len(next_tokens), self.seq_vec.max_len - 1), dtype=np.int32) c = self.color_vec.vectorize_all(colors, hsv=True) if len(c.shape) == 1: c = c.reshape((len(colors) / context_len, context_len)) else: c = c.reshape((len(colors) / context_len, context_len * c.shape[1]) + c.shape[2:]) for i, (color, prev, next) in enumerate(zip(colors, previous, next_tokens)): if len(prev) > P.shape[1]: prev = prev[:P.shape[1]] if len(next) > N.shape[1]: next = next[:N.shape[1]] P[i, :len(prev)] = self.seq_vec.vectorize(prev) N[i, :len(next)] = self.seq_vec.vectorize(next) for t, token in enumerate(next): mask[i, t] = (token != '') c = np.tile(c[:, np.newaxis, ...], [1, self.seq_vec.max_len - 1] + [1] * (c.ndim - 1)) if self.options.verbosity >= 9: print('c: %s' % (repr(c),)) print('P: %s' % (repr(P),)) print('mask: %s' % (repr(mask),)) print('N: %s' % (repr(N),)) return [c, P, mask], [N] def _build_model(self, model_class=SimpleLasagneModel): id_tag = (self.id + '/') if self.id else '' input_vars = self.color_vec.get_input_vars(self.id, recurrent=True) + [ T.imatrix(id_tag + 'previous'), T.imatrix(id_tag + 'mask') ] target_var = T.imatrix(id_tag + 'targets') self.l_out, self.input_layers = self._get_l_out(input_vars) self.model = model_class(input_vars, [target_var], self.l_out, id=self.id, loss=self.masked_loss(input_vars), optimizer=OPTIMIZERS[self.options.speaker_optimizer], learning_rate=self.options.speaker_learning_rate) def train_priors(self, training_instances, listener_data=False): prior_class = PRIORS[self.options.speaker_prior] self.prior_emp = prior_class(recurrent=True) self.prior_smooth = prior_class(recurrent=True) self.prior_emp.train(training_instances, listener_data=listener_data) self.prior_smooth.train(training_instances, listener_data=listener_data) def _get_l_out(self, input_vars): check_options(self.options) id_tag = (self.id + '/') if self.id else '' prev_output_var, mask_var = input_vars[-2:] color_input_vars = input_vars[:-2] context_len = self.context_len if hasattr(self, 'context_len') else 1 l_color_repr, color_inputs = self.color_vec.get_input_layer( color_input_vars, recurrent_length=self.seq_vec.max_len - 1, cell_size=self.options.speaker_cell_size, context_len=context_len, id=self.id ) l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1)) for i in range(1, self.options.speaker_hidden_color_layers + 1): l_hidden_color = NINLayer( l_hidden_color, num_units=self.options.speaker_cell_size, nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity], name=id_tag + 'hidden_color%d' % i) l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1)) l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1), input_var=prev_output_var, name=id_tag + 'prev_input') l_prev_embed = EmbeddingLayer(l_prev_out, input_size=len(self.seq_vec.tokens), output_size=self.options.speaker_cell_size, name=id_tag + 'prev_embed') l_in = ConcatLayer([l_hidden_color, l_prev_embed], axis=2, name=id_tag + 'color_prev') l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1), input_var=mask_var, name=id_tag + 'mask_input') l_rec_drop = l_in cell = CELLS[self.options.speaker_cell] cell_kwargs = { 'mask_input': (None if self.options.speaker_no_mask else l_mask_in), 'grad_clipping': self.options.speaker_grad_clipping, 'num_units': self.options.speaker_cell_size, } if self.options.speaker_cell == 'LSTM': cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.speaker_forget_bias)) if self.options.speaker_cell != 'GRU': cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.speaker_nonlinearity] for i in range(1, self.options.speaker_recurrent_layers): l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs) if self.options.speaker_dropout > 0.0: l_rec_drop = DropoutLayer(l_rec, p=self.options.speaker_dropout, name=id_tag + 'rec%d_drop' % i) else: l_rec_drop = l_rec l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers, **cell_kwargs) l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size), name=id_tag + 'reshape') l_hidden_out = l_shape for i in range(1, self.options.speaker_hidden_out_layers + 1): l_hidden_out = DenseLayer( l_hidden_out, num_units=self.options.speaker_cell_size, nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity], name=id_tag + 'hidden_out%d' % i) l_softmax = DenseLayer(l_hidden_out, num_units=len(self.seq_vec.tokens), nonlinearity=softmax, name=id_tag + 'softmax') l_out = ReshapeLayer(l_softmax, (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)), name=id_tag + 'out') return l_out, color_inputs + [l_prev_out, l_mask_in] def loss_out(self, input_vars=None, target_var=None): if input_vars is None: input_vars = self.model.input_vars if target_var is None: target_var = self.model.target_var pred = get_output(self.l_out, dict(zip(self.input_layers, input_vars))) loss = self.masked_loss(input_vars) return loss(pred, target_var) def masked_loss(self, input_vars): return masked_seq_crossentropy(input_vars[-1]) def sample_prior_smooth(self, num_samples): return self.prior_smooth.sample(num_samples) class ContextSpeakerLearner(SpeakerLearner): def __init__(self, *args, **kwargs): self.get_options() context = self.options.num_distractors + 1 return super(ContextSpeakerLearner, self).__init__(*args, context_len=context, **kwargs) def check_options(options): if options.speaker_grad_clipping: warnings.warn('Per-dimension gradient clipping (--speaker_grad_clipping) is enabled. ' 'This feature is unlikely to correctly constrain gradients and avoid ' 'NaNs; use --true_grad_clipping instead.') if options.speaker_recurrent_layers and not options.true_grad_clipping: warnings.warn('Norm-constraint gradient clipping is disabled for a recurrent model. ' 'This will likely lead to exploding gradients.') if options.speaker_recurrent_layers and options.true_grad_clipping > 6.0: warnings.warn('Gradient clipping norm is unusually high (%s). ' 'This could lead to exploding gradients.' % options.true_grad_clipping) if options.speaker_nonlinearity == 'rectify': warnings.warn('Using ReLU as the output nonlinearity for a recurrent unit. This may ' 'be a source of NaNs in the gradient.') def crossentropy_categorical_1hot_nd(coding_dist, true_idx): ''' A n-dimensional generalization of `theano.tensor.nnet.crossentropy_categorical`. :param coding_dist: a float tensor with the last dimension equal to the number of categories :param true_idx: an integer tensor with one fewer dimension than `coding_dist`, giving the indices of the true targets ''' if coding_dist.ndim != true_idx.ndim + 1: raise ValueError('`coding_dist` must have one more dimension that `true_idx` ' '(got %s and %s)' % (coding_dist.type, true_idx.type)) coding_flattened = T.reshape(coding_dist, (-1, T.shape(coding_dist)[-1])) scores_flattened = crossentropy_categorical_1hot(coding_flattened, true_idx.flatten()) return T.reshape(scores_flattened, true_idx.shape) def masked_seq_crossentropy(mask): ''' Return a loss function for sequence models. :param mask: a 2-D int tensor (num_examples x max_length) with 1 in valid token locations and 0 in locations that should be masked out The returned function will have the following parameters and return type: :param coding_dist: a 3-D float tensor (num_examples x max_length x num_token_types) of log probabilities assigned to each token :param true_idx: a 2-D int tensor (num_examples x max_length) of true token indices :return: a 1-D float tensor of per-example cross-entropy values ''' def msxe_loss(coding_dist, true_idx): mask_float = T.cast(mask, 'float32') return (crossentropy_categorical_1hot_nd(coding_dist, true_idx) * mask_float).sum(axis=1) return msxe_loss def beam_search_step(scores, length, beam, beam_scores, done, eos_index): ''' Perform one step of beam search, given the matrix of probabilities for each possible following token. Modifies `beam`, `beam_scores`, and `done` *in place*. :param scores: Scores (log probabilities, up to a constant) assigned by the model to each token for each sequence on the various beams. :type scores: float ndarray, shape `(batch_size, beam_size, vocab_size)` :param int length: Current length of already predicted sequences. Should equal the axis-1 index in `beam` where the next predicted tokens will be populated. :param beam: Token indices for the top-k sequences predicted for each example. :type beam: int ndarray, shape `(batch_size, beam_size, max_seq_len)` :param beam_scores: log probabilities assigned to current candidate sequences :type beam_scores: float ndarray, shape `(batch_size, beam_size)` :param done: Mask of beam entries that have reached the </s> token :type done: boolean ndarray, shape `(batch_size, beam_size)` As an example, suppose the distribution represented by the model is: 'a cat': 0.375, 'cat': 0.25, 'cat a': 0.125, 'cat cat': 0.125, 'a': 0.0625, '': 0.03125, 'a a': 0.03125, >>> a_cat, cat, cat_a, cat_cat, a, null, a_a = \\ ... [0.375, 0.25, 0.125, 0.125, 0.0625, 0.03125, 0.03125] >>> vec = SequenceVectorizer(); vec.add(['', 'a', 'cat', '']) >>> vec.vectorize(['', 'a', 'cat', '']) array([1, 2, 3, 4], dtype=int32) >>> eos_index = vec.vectorize([''])[0] Initialize the beam. Note that -inf should be the initial score for all but one item on each beam; if all scores start at 0, the beam will be saturated with duplicates of the greedy choice. >>> batch_size = 1; beam_size = 2; max_seq_len = 3 >>> beam = np.zeros((batch_size, beam_size, max_seq_len), dtype=np.int) >>> beam_scores = np.log(np.zeros((batch_size, beam_size))) >>> beam_scores[:, 0] = 0.0 >>> done = np.zeros((batch_size, beam_size), dtype=np.bool) >>> next_scores = np.log([[[0.0, 0.0, ... a_cat + a + a_a, ... cat + cat_cat + cat_a, ... null]] * 2]) >>> beam_search_step(next_scores, 0, beam, beam_scores, done, eos_index) >>> beam array([[[3, 0, 0], [2, 0, 0]]]) >>> np.exp(beam_scores).round(5) array([[ 0.5 , 0.46875]]) >>> done array([[False, False]], dtype=bool) Note that 'cat' is the greedy first choice, but 'a cat' will end up with a higher score. >>> next_scores = np.log([[[0.0, 0.0, cat_a / 0.5, cat_cat / 0.5, cat / 0.5], ... [0.0, 0.0, a_a / 0.46875, a_cat / 0.46875, a / 0.46875]]]) >>> beam_search_step(next_scores, 1, beam, beam_scores, done, eos_index) >>> beam array([[[2, 3, 0], [3, 4, 0]]]) >>> np.exp(beam_scores).round(3) array([[ 0.375, 0.25 ]]) >>> done array([[False, True]], dtype=bool) The best sequences have been identified; the score for 'cat' stays constant after it reaches the end-of-sentence token, and the beam is padded with end-of-sentence tokens regardless of the returned scores. >>> next_scores = np.log([[[0.0, 0.0, 0.0, 0.0, 1.0], ... [0.0, 0.0, 0.25, 0.5, 0.25]]]) >>> beam_search_step(next_scores, 2, beam, beam_scores, done, eos_index) >>> beam array([[[2, 3, 4], [3, 4, 4]]]) >>> np.exp(beam_scores).round(3) array([[ 0.375, 0.25 ]]) >>> done array([[ True, True]], dtype=bool) ''' assert len(scores.shape) == 3, scores.shape batch_size, beam_size, vocab_size = scores.shape assert len(beam.shape) == 3, beam.shape assert beam.shape[:2] == (batch_size, beam_size), \ '%s != (%s, %s, *)' % (beam.shape, batch_size, beam_size) max_seq_len = beam.shape[2] assert beam_scores.shape == (batch_size, beam_size), \ '%s != %s' % (beam_scores.shape, (batch_size, beam_size)) assert done.shape == (batch_size, beam_size), \ '%s != %s' % (done.shape, (batch_size, beam_size)) # Compute updated scores new_scores = (scores * ~done[:, :, np.newaxis] + beam_scores[:, :, np.newaxis]).reshape((batch_size, beam_size * vocab_size)) # Get indices of top k scores topk = np.argsort(-new_scores)[:, :beam_size] # Transform into previous beam indices and new token indices rows, new_indices = np.unravel_index(topk, (beam_size, vocab_size)) assert rows.shape == (batch_size, beam_size), \ '%s != %s' % (rows.shape, (batch_size, beam_size)) assert new_indices.shape == (batch_size, beam_size), \ '%s != %s' % (new_indices.shape, (batch_size, beam_size)) # Extract best pre-existing rows beam[:, :, :] = beam[np.arange(batch_size)[:, np.newaxis], rows, :] assert beam.shape == (batch_size, beam_size, max_seq_len), \ '%s != %s' % (beam.shape, (batch_size, beam_size, max_seq_len)) # Append new token indices beam[:, :, length] = new_indices # Update beam scores beam_scores[:, :] = new_scores[np.arange(batch_size)[:, np.newaxis], topk] # Get previous done status and update it with # which rows have newly reached done[:, :] = done[np.arange(batch_size)[:, np.newaxis], rows] | (new_indices == eos_index) # Pad already-finished sequences with beam[done, length] = eos_index class AtomicSpeakerLearner(NeuralLearner): ''' A speaker that learns to produce descriptions as indivisible symbols (as opposed to word-by-word sequences) given colors. ''' def __init__(self, id=None): super(AtomicSpeakerLearner, self).__init__(id=id) self.seq_vec = SymbolVectorizer() color_repr = COLOR_REPRS[self.options.speaker_color_repr] self.color_vec = color_repr(self.options.speaker_color_resolution, hsv=self.options.speaker_hsv) def predict_and_score(self, eval_instances, random=False, verbosity=0): predictions = [] scores = [] batches = iterators.iter_batches(eval_instances, self.options.speaker_eval_batch_size) num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1 if self.options.verbosity + verbosity >= 2: print('Testing') if self.options.verbosity + verbosity >= 1: progress.start_task('Eval batch', num_batches) for batch_num, batch in enumerate(batches): if self.options.verbosity + verbosity >= 1: progress.progress(batch_num) batch = list(batch) xs, (y,) = self._data_to_arrays(batch, test=True) probs = self.model.predict(xs) if random: indices = sample(probs) else: indices = probs.argmax(axis=1) predictions.extend(self.seq_vec.unvectorize_all(indices)) scores_arr = np.log(probs[np.arange(len(batch)), y]) scores.extend(scores_arr.tolist()) if self.options.verbosity + verbosity >= 1: progress.end_task() if self.options.verbosity >= 9: print('%s %ss:') % (self.id, 'sample' if random else 'prediction') for inst, prediction in zip(eval_instances, predictions): print('%s -> %s' % (repr(inst.input), repr(prediction))) return predictions, scores def _data_to_arrays(self, training_instances, init_vectorizer=False, test=False, inverted=False): get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output) get_color, get_desc = (get_o, get_i) if inverted else (get_i, get_o) if init_vectorizer: self.seq_vec.add_all(get_desc(inst) for inst in training_instances) sentences = [] colors = [] if self.options.verbosity >= 9: print('%s _data_to_arrays:' % self.id) for i, inst in enumerate(training_instances): desc = get_desc(inst) if desc is None: assert test desc = '' color = get_color(inst) assert color if self.options.verbosity >= 9: print('%s -> %s' % (repr(desc), repr(color))) sentences.append(desc) colors.append(color) x = self.color_vec.vectorize_all(colors, hsv=True) if len(x.shape) == 1: x = x[:, np.newaxis] y = self.seq_vec.vectorize_all(sentences) if self.options.verbosity >= 9: print('%s x: %s' % (self.id, x)) print('%s y: %s' % (self.id, y)) return [x], [y] def _build_model(self, model_class=SimpleLasagneModel): id_tag = (self.id + '/') if self.id else '' input_vars = self.color_vec.get_input_vars(self.id) target_var = T.ivector(id_tag + 'targets') self.l_out, self.input_layers = self._get_l_out(input_vars) self.loss = categorical_crossentropy self.model = model_class(input_vars, [target_var], self.l_out, loss=self.loss, optimizer=rmsprop, id=self.id) def train_priors(self, training_instances, listener_data=False): prior_class = PRIORS[self.options.speaker_prior] self.prior_emp = prior_class() self.prior_smooth = prior_class() self.prior_emp.train(training_instances, listener_data=listener_data) self.prior_smooth.train(training_instances, listener_data=listener_data) def _get_l_out(self, input_vars): id_tag = (self.id + '/') if self.id else '' cell_size = self.options.speaker_cell_size or self.seq_vec.num_types l_color_repr, color_inputs = self.color_vec.get_input_layer( input_vars, recurrent_length=0, cell_size=cell_size, id=self.id ) l_hidden_color = l_color_repr for i in range(1, self.options.speaker_hidden_color_layers + 1): l_hidden_color = NINLayer( l_hidden_color, num_units=cell_size, nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity], name=id_tag + 'hidden_color%d' % i) l_hidden_color = l_hidden_color if self.options.speaker_cell_size == 0: l_scores = l_color_repr # BiasLayer(l_color_repr, name=id_tag + 'bias') else: if self.options.speaker_dropout > 0.0: l_color_drop = DropoutLayer(l_hidden_color, p=self.options.speaker_dropout, name=id_tag + 'color_drop') else: l_color_drop = l_hidden_color l_hidden = DenseLayer(l_color_drop, num_units=self.options.speaker_cell_size, nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity], name=id_tag + 'hidden') if self.options.speaker_dropout > 0.0: l_hidden_drop = DropoutLayer(l_hidden, p=self.options.speaker_dropout, name=id_tag + 'hidden_drop') else: l_hidden_drop = l_hidden l_scores = DenseLayer(l_hidden_drop, num_units=self.seq_vec.num_types, nonlinearity=None, name=id_tag + 'scores') l_out = NonlinearityLayer(l_scores, nonlinearity=softmax, name=id_tag + 'softmax') return l_out, color_inputs def sample_prior_smooth(self, num_samples): return self.prior_smooth.sample(num_samples) SPEAKERS = { 'Speaker': SpeakerLearner, 'ContextSpeaker': ContextSpeakerLearner, 'AtomicSpeaker': AtomicSpeakerLearner, }