
💫 Fix loading of multiple vector models #2158

Merged · 10 commits · Mar 28, 2018
21 changes: 13 additions & 8 deletions spacy/_ml.py
@@ -242,6 +242,11 @@ def predict(ids, tokvecs):

def link_vectors_to_models(vocab):
vectors = vocab.vectors
if vectors.name is None:
vectors.name = VECTORS_KEY
print(
"Warning: Unnamed vectors -- this won't allow multiple vectors "
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
ops = Model.ops
for word in vocab:
if word.orth in vectors.key2row:
@@ -251,11 +256,11 @@ def link_vectors_to_models(vocab):
data = ops.asarray(vectors.data)
# Set an entry here, so that vectors are accessed by StaticVectors
# (unideal, I know)
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
thinc.extra.load_nlp.VECTORS[(ops.device, vectors.name)] = data


def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
pretrained_vectors = kwargs.get('pretrained_vectors', None)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
@@ -268,16 +273,16 @@ def Tok2Vec(width, embed_size, **kwargs):
name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
if pretrained_vectors is not None:
glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width*5, pieces=3)), column=5)
>> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
else:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
>> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))

convolution = Residual(
ExtractWindow(nW=1)
@@ -433,13 +438,13 @@ def build_tagger_model(nr_class, **cfg):
token_vector_width = cfg['token_vector_width']
else:
token_vector_width = util.env_opt('token_vector_width', 128)
pretrained_dims = cfg.get('pretrained_dims', 0)
pretrained_vectors = cfg.get('pretrained_vectors')
with Model.define_operators({'>>': chain, '+': add}):
if 'tok2vec' in cfg:
tok2vec = cfg['tok2vec']
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
pretrained_vectors=pretrained_vectors)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = (
tok2vec
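The core of the fix is in this file: `link_vectors_to_models()` now stores the vectors table in thinc's shared store under the vectors' own name instead of the single global `VECTORS_KEY` (with a warning for unnamed vectors, since those can't be told apart), and `Tok2Vec()` is passed the table's name via `pretrained_vectors` rather than just a dimension count. A minimal sketch of why keying by name matters (illustrative values only, not spaCy internals):

```python
# Illustrative sketch, not spaCy code: with a single shared key, the second
# model loaded overwrites the first; keying by the vectors' name keeps both.
VECTORS = {}

# Old behaviour: every model wrote to the same key.
VECTORS[('cpu', 'spacy_pretrained_vectors')] = 'EN_DATA'
VECTORS[('cpu', 'spacy_pretrained_vectors')] = 'ES_DATA'   # clobbers EN_DATA

# New behaviour: each model writes to (device, vectors.name).
VECTORS[('cpu', 'en_core_web_md.vectors')] = 'EN_DATA'
VECTORS[('cpu', 'es_core_news_md.vectors')] = 'ES_DATA'
assert VECTORS[('cpu', 'en_core_web_md.vectors')] == 'EN_DATA'  # both survive
```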
1 change: 1 addition & 0 deletions spacy/cli/train.py
@@ -93,6 +93,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta['pipeline'] = pipeline
nlp.meta.update(meta)
if vectors:
print("Load vectors model", vectors)
util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab:
values = {}
28 changes: 27 additions & 1 deletion spacy/language.py
@@ -133,6 +133,8 @@ def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
if vocab.vectors.name is None:
vocab.vectors.name = meta.get('vectors', {}).get('name')
self.vocab = vocab
if make_doc is True:
factory = self.Defaults.create_tokenizer
@@ -158,7 +160,8 @@ def meta(self):
self._meta.setdefault('license', '')
self._meta['vectors'] = {'width': self.vocab.vectors_length,
'vectors': len(self.vocab.vectors),
'keys': self.vocab.vectors.n_keys}
'keys': self.vocab.vectors.n_keys,
'name': self.vocab.vectors.name}
self._meta['pipeline'] = self.pipe_names
return self._meta

@@ -457,6 +460,8 @@ def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
else:
device = None
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg['pretrained_vectors'] = self.vocab.vectors.name
if sgd is None:
sgd = create_default_optimizer(Model.ops)
self._optimizer = sgd
@@ -629,6 +634,7 @@ def from_disk(self, path, disable=tuple()):
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
('meta.json', lambda p: self.meta.update(util.read_json(p)))
))
_fix_pretrained_vectors_name(self)
for name, proc in self.pipeline:
if name in disable:
continue
@@ -674,6 +680,7 @@ def from_bytes(self, bytes_data, disable=[]):
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
_fix_pretrained_vectors_name(self)
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue
@@ -683,6 +690,25 @@ def from_bytes(self, bytes_data, disable=[]):
msg = util.from_bytes(bytes_data, deserializers, {})
return self

def _fix_pretrained_vectors_name(nlp):
# TODO: Replace this once we handle vectors consistently as static
# data
if 'vectors' in nlp.meta and nlp.meta['vectors'].get('name'):
nlp.vocab.vectors.name = nlp.meta['vectors']['name']
elif not nlp.vocab.vectors.size:
nlp.vocab.vectors.name = None
elif 'name' in nlp.meta and 'lang' in nlp.meta:
vectors_name = '%s_%s.vectors' % (nlp.meta['lang'], nlp.meta['name'])
nlp.vocab.vectors.name = vectors_name
else:
raise ValueError("Unnamed vectors")
for name, proc in nlp.pipeline:
if not hasattr(proc, 'cfg'):
continue
if proc.cfg.get('pretrained_dims'):
assert nlp.vocab.vectors.name
proc.cfg['pretrained_vectors'] = nlp.vocab.vectors.name


class DisabledPipes(list):
"""Manager for temporary pipeline disabling."""
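Two changes in `language.py` work together: `begin_training()` now records the vectors' name in each component's config as `pretrained_vectors`, and the new `_fix_pretrained_vectors_name()` helper (called from `from_disk()` and `from_bytes()`) backfills a name for models saved before vectors were named. A small illustration of the fallback naming (meta values are examples; the real helper first checks `meta['vectors']['name']` and empty tables):

```python
# Illustration of the fallback in _fix_pretrained_vectors_name() for an older
# model that has vectors but no recorded name (meta values are examples).
meta = {'lang': 'en', 'name': 'core_web_md'}
vectors_name = '%s_%s.vectors' % (meta['lang'], meta['name'])
assert vectors_name == 'en_core_web_md.vectors'
```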
36 changes: 25 additions & 11 deletions spacy/pipeline.pyx
@@ -202,8 +202,10 @@ class Pipe(object):
def from_bytes(self, bytes_data, **exclude):
"""Load the pipe from a bytestring."""
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
if self.model is True:
self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)

@@ -227,8 +229,10 @@ class Pipe(object):
def from_disk(self, path, **exclude):
"""Load the pipe from disk."""
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
if self.model is True:
self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read())

@@ -286,7 +290,6 @@ class Tensorizer(Pipe):
self.model = model
self.input_models = []
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)

def __call__(self, doc):
@@ -403,8 +406,6 @@ class Tagger(Pipe):
self.model = model
self.cfg = OrderedDict(sorted(cfg.items()))
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims',
self.vocab.vectors.data.shape[1])

@property
def labels(self):
@@ -515,8 +516,8 @@ class Tagger(Pipe):
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab)
if sgd is None:
@@ -525,6 +526,13 @@

@classmethod
def Model(cls, n_tags, **cfg):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
raise ValueError(
"Bad configuration of Tagger --- this is probably a bug "
"within spaCy. We changed the name of an internal attribute "
"for loading pre-trained vectors, and the class has been "
"passed the old name (pretrained_dims) but not the new name "
"(pretrained_vectors)")
return build_tagger_model(n_tags, **cfg)

def add_label(self, label, values=None):
@@ -572,6 +580,10 @@ class Tagger(Pipe):

def from_bytes(self, bytes_data, **exclude):
def load_model(b):
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name

if self.model is True:
token_vector_width = util.env_opt(
'token_vector_width',
@@ -597,7 +609,6 @@
return self

def to_disk(self, path, **exclude):
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
@@ -610,6 +621,9 @@

def from_disk(self, path, **exclude):
def load_model(p):
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
with p.open('rb') as file_:
@@ -659,8 +673,6 @@ class MultitaskObjective(Tagger):
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims',
self.vocab.vectors.data.shape[1])

@property
def labels(self):
@@ -898,13 +910,15 @@ class TextCategorizer(Pipe):
self.labels.append(label)
return 1

def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
**kwargs):
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64

if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
link_vectors_to_models(self.vocab)
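The repeated `load_model()` edits in this file are a backward-compatibility shim: pipes serialized by earlier spaCy versions only carry `pretrained_dims` in their `cfg`, so on load the config is patched with `pretrained_vectors` (the vocab's now-named table) before `self.Model(**self.cfg)` is built. The same shim appears in `nn_parser.pyx` below. Roughly (a sketch; `cfg` and `vectors_name` stand in for the pipe's attributes):

```python
# Sketch of the compatibility shim applied in from_disk()/from_bytes()
# before the model is constructed (names here are illustrative).
def patch_cfg(cfg, vectors_name):
    if 'pretrained_dims' in cfg and 'pretrained_vectors' not in cfg:
        cfg['pretrained_vectors'] = vectors_name
    return cfg

cfg = patch_cfg({'pretrained_dims': 300}, 'en_core_web_md.vectors')
assert cfg['pretrained_vectors'] == 'en_core_web_md.vectors'
```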
22 changes: 12 additions & 10 deletions spacy/syntax/nn_parser.pyx
@@ -255,8 +255,9 @@ cdef class Parser:
raise ValueError("Currently history size is hard-coded to 0")
if hist_width != 0:
raise ValueError("Currently history width is hard-coded to 0")
pretrained_vectors = cfg.get('pretrained_vectors', None)
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=cfg.get('pretrained_dims', 0))
pretrained_vectors=pretrained_vectors)
tok2vec = chain(tok2vec, flatten)
lower = PrecomputableAffine(hidden_width,
nF=cls.nr_feature, nI=token_vector_width,
@@ -275,6 +276,7 @@
'token_vector_width': token_vector_width,
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces,
'pretrained_vectors': pretrained_vectors,
'hist_size': hist_size,
'hist_width': hist_width
}
@@ -294,9 +296,9 @@
unless True (default), in which case a new instance is created with
`Parser.Moves()`.
model (object): Defines how the parse-state is created, updated and
evaluated. The value is set to the .model attribute unless True
(default), in which case a new instance is created with
`Parser.Model()`.
evaluated. The value is set to the .model attribute. If set to True
(default), a new instance will be created with `Parser.Model()`
in parser.begin_training(), parser.from_disk() or parser.from_bytes().
**cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
"""
self.vocab = vocab
@@ -308,8 +310,6 @@
cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg
if 'actions' in self.cfg:
@@ -832,7 +832,6 @@ cdef class Parser:
self.moves.add_action(action, label)
cfg.setdefault('token_vector_width', 128)
if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
if sgd is None:
sgd = self.create_optimizer()
@@ -896,9 +895,11 @@ cdef class Parser:
}
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
path = util.ensure_path(path)
if self.model is True:
self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
@@ -941,12 +942,13 @@
))
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
# TODO: Remove this once we don't have to handle previous models
if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
if self.model is True:
self.model, cfg = self.Model(**self.cfg)
cfg['pretrained_dims'] = self.vocab.vectors_length
else:
cfg = {}
cfg['pretrained_dims'] = self.vocab.vectors_length
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:
7 changes: 6 additions & 1 deletion spacy/tests/conftest.py
@@ -19,7 +19,9 @@
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_core_news_sm'],
'xx': ['xx_ent_web_md']}
'xx': ['xx_ent_web_md'],
'en_core_web_md': ['en_core_web_md'],
'es_core_news_md': ['es_core_news_md']}


# only used for tests that require loading the models
@@ -183,6 +185,9 @@ def pytest_addoption(parser):

for lang in _languages + ['all']:
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
for model in _models:
if model not in _languages:
parser.addoption("--%s" % model, action="store_true", help="Use %s model" % model)


def pytest_runtest_setup(item):
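With these fixtures, the new model-specific tests follow the existing opt-in pattern: a test marked `@pytest.mark.models('en_core_web_md')` only runs when pytest is invoked with the matching flag, e.g. `pytest spacy/tests --en_core_web_md --es_core_news_md` (assuming both model packages are installed); otherwise it is skipped like the other model tests.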
12 changes: 12 additions & 0 deletions spacy/tests/regression/test_issue1660.py
@@ -0,0 +1,12 @@
from __future__ import unicode_literals
import pytest
from ...util import load_model


@pytest.mark.models("en_core_web_md")
@pytest.mark.models("es_core_news_md")
def test_models_with_different_vectors():
    nlp = load_model('en_core_web_md')
    doc = nlp(u'hello world')
    nlp2 = load_model('es_core_news_md')
    doc2 = nlp2(u'hola')
    doc = nlp(u'hello world')
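The regression test (issue #1660) loads `en_core_web_md`, then `es_core_news_md`, and runs the English pipeline again; before this fix, loading the second model could silently replace the first model's vectors in the shared thinc store. A quick way to observe the new behaviour (a sketch, assuming both packages are installed):

```python
# Sketch: after this PR each loaded pipeline keeps its own named vectors
# table instead of sharing one global key (requires both model packages).
import spacy

nlp_en = spacy.load('en_core_web_md')
nlp_es = spacy.load('es_core_news_md')
assert nlp_en.vocab.vectors.name != nlp_es.vocab.vectors.name
```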
1 change: 1 addition & 0 deletions spacy/tests/serialize/test_serialize_language.py
@@ -17,6 +17,7 @@ def meta_data():
'email': 'email-in-fixture',
'url': 'url-in-fixture',
'license': 'license-in-fixture',
'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
}

