From 95a96152213a0c3e4827c00ab396357d3abf02d2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 16:02:59 +0200
Subject: [PATCH 01/10] Fix loading of multiple pre-trained vectors

This patch addresses #1660, which was caused by keying all pre-trained
vectors with the same ID when telling Thinc how to refer to them. This
meant that if multiple models were loaded that had pre-trained vectors,
errors or incorrect behaviour resulted.

The vectors class now includes a .name attribute, which defaults to:
{nlp.meta['lang']_nlp.meta['name']}.vectors

The vectors name is set in the cfg of the pipeline components under the
key pretrained_vectors. This replaces the previous cfg key
pretrained_dims.

In order to make existing models compatible with this change, we check
for the pretrained_dims key when loading models in from_disk and
from_bytes, and add the cfg key pretrained_vectors if we find it.
---
 spacy/_ml.py               | 20 ++++++++++++--------
 spacy/language.py          | 27 ++++++++++++++++++++++++++-
 spacy/pipeline.pyx         | 31 +++++++++++++++++++++----------
 spacy/syntax/nn_parser.pyx | 22 ++++++++++++----------
 spacy/tests/conftest.py    |  7 ++++++-
 spacy/vectors.pyx          | 20 +++++++++++++++++++-
 spacy/vocab.pyx            |  5 ++++-
 7 files changed, 100 insertions(+), 32 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index e5d1cfc636a..d4f0e8bef43 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -242,6 +242,10 @@ def predict(ids, tokvecs):

 def link_vectors_to_models(vocab):
     vectors = vocab.vectors
+    if vectors.name is None:
+        raise ValueError(
+            "Unnamed vectors -- this won't allow multiple vectors "
+            "models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
     ops = Model.ops
     for word in vocab:
         if word.orth in vectors.key2row:
@@ -251,11 +255,11 @@ def link_vectors_to_models(vocab):
     data = ops.asarray(vectors.data)
     # Set an entry here, so that vectors are accessed by StaticVectors
     # (unideal, I know)
-    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+    thinc.extra.load_nlp.VECTORS[(ops.device, vectors.name)] = data


 def Tok2Vec(width, embed_size, **kwargs):
-    pretrained_dims = kwargs.get('pretrained_dims', 0)
+    pretrained_vectors = kwargs.get('pretrained_vectors', None)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
@@ -268,16 +272,16 @@ def Tok2Vec(width, embed_size, **kwargs):
                               name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
                           name='embed_shape')
-        if pretrained_dims is not None and pretrained_dims >= 1:
-            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+        if pretrained_vectors is not None:
+            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

             embed = uniqued(
                 (glove | norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+                >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
         else:
             embed = uniqued(
                 (norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+                >> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))

         convolution = Residual(
             ExtractWindow(nW=1)
@@ -433,13 +437,13 @@ def build_tagger_model(nr_class, **cfg):
         token_vector_width = cfg['token_vector_width']
     else:
         token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_dims = cfg.get('pretrained_dims', 0)
+    pretrained_vectors = cfg['pretrained_vectors']
     with Model.define_operators({'>>': chain, '+': add}):
         if 'tok2vec' in cfg:
             tok2vec = cfg['tok2vec']
         else:
             tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              pretrained_dims=pretrained_dims)
+                              pretrained_vectors=pretrained_vectors)
     softmax = with_flatten(Softmax(nr_class, token_vector_width))
     model = (
         tok2vec
diff --git a/spacy/language.py b/spacy/language.py
index f04da7d3023..fe3c574a19c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -133,6 +133,8 @@ def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
         if vocab is True:
             factory = self.Defaults.create_vocab
             vocab = factory(self, **meta.get('vocab', {}))
+            if vocab.vectors.name is None:
+                vocab.vectors.name = meta.get('vectors', {}).get('name')
         self.vocab = vocab
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
@@ -158,7 +160,8 @@ def meta(self):
         self._meta.setdefault('license', '')
         self._meta['vectors'] = {'width': self.vocab.vectors_length,
                                  'vectors': len(self.vocab.vectors),
-                                 'keys': self.vocab.vectors.n_keys}
+                                 'keys': self.vocab.vectors.n_keys,
+                                 'name': self.vocab.vectors.name}
         self._meta['pipeline'] = self.pipe_names
         return self._meta

@@ -457,6 +460,8 @@ def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
         else:
             device = None
         link_vectors_to_models(self.vocab)
+        if self.vocab.vectors.data.shape[1]:
+            cfg['pretrained_vectors'] = self.vocab.vectors.name
         if sgd is None:
             sgd = create_default_optimizer(Model.ops)
         self._optimizer = sgd
@@ -629,6 +634,7 @@ def from_disk(self, path, disable=tuple()):
             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
             ('meta.json', lambda p: self.meta.update(util.read_json(p)))
         ))
+        _fix_pretrained_vectors_name(self)
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -674,6 +680,7 @@ def from_bytes(self, bytes_data, disable=[]):
             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
             ('meta', lambda b: self.meta.update(ujson.loads(b)))
         ))
+        _fix_pretrained_vectors_name(self)
         for i, (name, proc) in enumerate(self.pipeline):
             if name in disable:
                 continue
@@ -683,6 +690,24 @@ def from_bytes(self, bytes_data, disable=[]):
         msg = util.from_bytes(bytes_data, deserializers, {})
         return self

+def _fix_pretrained_vectors_name(nlp):
+    # TODO: Replace this once we handle vectors consistently as static
+    # data
+    if 'vectors' in nlp.meta and nlp.meta['vectors'].get('name'):
+        nlp.vocab.vectors.name = nlp.meta['vectors']['name']
+    elif 'name' in nlp.meta and 'lang' in nlp.meta:
+        vectors_name = '%s_%s.vectors' % (nlp.meta['lang'], nlp.meta['name'])
+        nlp.vocab.vectors.name = vectors_name
+    else:
+        raise ValueError("Unnamed vectors")
+    for name, proc in nlp.pipeline:
+        if not hasattr(proc, 'cfg'):
+            continue
+        if proc.cfg.get('pretrained_dims'):
+            assert nlp.vocab.vectors.name
+            proc.cfg['pretrained_vectors'] = nlp.vocab.vectors.name
+        print(proc.cfg)
+

 class DisabledPipes(list):
     """Manager for temporary pipeline disabling."""
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 743f6ac85ff..c1190ca0c00 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -202,8 +202,10 @@ class Pipe(object):
     def from_bytes(self, bytes_data, **exclude):
         """Load the pipe from a bytestring."""
         def load_model(b):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model = self.Model(**self.cfg)
             self.model.from_bytes(b)

@@ -227,8 +229,10 @@ class Pipe(object):
     def from_disk(self, path, **exclude):
         """Load the pipe from disk."""
         def load_model(p):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model = self.Model(**self.cfg)
             self.model.from_bytes(p.open('rb').read())

@@ -286,7 +290,6 @@ class Tensorizer(Pipe):
         self.model = model
         self.input_models = []
         self.cfg = dict(cfg)
-        self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         self.cfg.setdefault('cnn_maxout_pieces', 3)

     def __call__(self, doc):
@@ -403,8 +406,6 @@ class Tagger(Pipe):
         self.model = model
         self.cfg = OrderedDict(sorted(cfg.items()))
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims',
-                            self.vocab.vectors.data.shape[1])

     @property
     def labels(self):
@@ -516,7 +517,6 @@ class Tagger(Pipe):
                                       vocab.morphology.lemmatizer,
                                       exc=vocab.morphology.exc)
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
         link_vectors_to_models(self.vocab)
         if sgd is None:
@@ -525,6 +525,14 @@ class Tagger(Pipe):

     @classmethod
     def Model(cls, n_tags, **cfg):
+        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
+            raise ValueError(
+                "Bad configuration of Tagger --- this is probably a bug "
+                "within spaCy. We changed the name of an internal attribute "
+                "for loading pre-trained vectors, and the class has been "
+                "passed the old name (pretrained_dims) but not the new name "
+                "(pretrained_vectors)")
+        print(cfg)
         return build_tagger_model(n_tags, **cfg)

     def add_label(self, label, values=None):
@@ -572,6 +580,10 @@ class Tagger(Pipe):

     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+
             if self.model is True:
                 token_vector_width = util.env_opt(
                     'token_vector_width',
@@ -597,7 +609,6 @@ class Tagger(Pipe):
         return self

     def to_disk(self, path, **exclude):
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
         tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
         serialize = OrderedDict((
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -610,6 +621,9 @@ class Tagger(Pipe):

     def from_disk(self, path, **exclude):
         def load_model(p):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             with p.open('rb') as file_:
@@ -659,8 +673,6 @@ class MultitaskObjective(Tagger):
                 "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims',
-                            self.vocab.vectors.data.shape[1])

     @property
     def labels(self):
@@ -904,7 +916,6 @@ class TextCategorizer(Pipe):
         else:
             token_vector_width = 64
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
         link_vectors_to_models(self.vocab)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index b4b8d477938..c12b733e773 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -256,7 +256,7 @@ cdef class Parser:
         if hist_width != 0:
             raise ValueError("Currently history width is hard-coded to 0")
         tok2vec = Tok2Vec(token_vector_width, embed_size,
-                          pretrained_dims=cfg.get('pretrained_dims', 0))
+                          pretrained_vectors=cfg.get('pretrained_vectors', None))
         tok2vec = chain(tok2vec, flatten)
         lower = PrecomputableAffine(hidden_width,
                                     nF=cls.nr_feature, nI=token_vector_width,
@@ -294,9 +294,9 @@ cdef class Parser:
            unless True (default), in which case a new instance is created with
            `Parser.Moves()`.
         model (object): Defines how the parse-state is created, updated and
-            evaluated. The value is set to the .model attribute unless True
-            (default), in which case a new instance is created with
-            `Parser.Model()`.
+            evaluated. The value is set to the .model attribute. If set to True
+            (default), a new instance will be created with `Parser.Model()`
+            in parser.begin_training(), parser.from_disk() or parser.from_bytes().
         **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
         """
         self.vocab = vocab
@@ -308,8 +308,6 @@ cdef class Parser:
             cfg['beam_width'] = util.env_opt('beam_width', 1)
         if 'beam_density' not in cfg:
             cfg['beam_density'] = util.env_opt('beam_density', 0.0)
-        if 'pretrained_dims' not in cfg:
-            cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         if 'actions' in self.cfg:
@@ -832,7 +830,6 @@ cdef class Parser:
                 self.moves.add_action(action, label)
         cfg.setdefault('token_vector_width', 128)
         if self.model is True:
-            cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
         if sgd is None:
             sgd = self.create_optimizer()
@@ -896,9 +893,12 @@ cdef class Parser:
         }
         util.from_disk(path, deserializers, exclude)
         if 'model' not in exclude:
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+            print("Create parser model", self.cfg)
             path = util.ensure_path(path)
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model, cfg = self.Model(**self.cfg)
             else:
                 cfg = {}
@@ -941,12 +941,14 @@ cdef class Parser:
         ))
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+            print("Create parser model", self.cfg)
             if self.model is True:
                 self.model, cfg = self.Model(**self.cfg)
-                cfg['pretrained_dims'] = self.vocab.vectors_length
             else:
                 cfg = {}
-                cfg['pretrained_dims'] = self.vocab.vectors_length
             if 'tok2vec_model' in msg:
                 self.model[0].from_bytes(msg['tok2vec_model'])
             if 'lower_model' in msg:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 1200ebe8c11..3530ca6e2b2 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -19,7 +19,9 @@
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
-           'xx': ['xx_ent_web_md']}
+           'xx': ['xx_ent_web_md'],
+           'en_core_web_md': ['en_core_web_md'],
+           'es_core_news_md': ['es_core_news_md']}


 # only used for tests that require loading the models
@@ -183,6 +185,9 @@
 def pytest_addoption(parser):
     for lang in _languages + ['all']:
         parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
+    for model in _models:
+        if model not in _languages:
+            parser.addoption("--%s" % model, action="store_true", help="Use %s model" % model)


 def pytest_runtest_setup(item):
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 1b265e189ba..f4880b31c74 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+import functools
 import numpy
 from collections import OrderedDict
 import msgpack
@@ -19,6 +20,20 @@ def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)


+class GlobalRegistry(object):
+    '''Global store of vectors, to avoid repeatedly loading the data.'''
+    data = {}
+
+    @classmethod
+    def register(cls, name, data):
+        cls.data[name] = data
+        return functools.partial(cls.get, name)
+
+    @classmethod
+    def get(cls, name):
+        return cls.data[name]
+
+
 cdef class Vectors:
     """Store, save and load word vectors.

@@ -31,18 +46,21 @@ cdef class Vectors:
     the table need to be assigned --- so len(list(vectors.keys()))
     may be greater or smaller than vectors.shape[0].
     """
+    cdef public object name
     cdef public object data
     cdef public object key2row
     cdef public object _unset

-    def __init__(self, *, shape=None, data=None, keys=None):
+    def __init__(self, *, shape=None, data=None, keys=None, name=None):
         """Create a new vector store.

         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
+        name (string): A name to identify the vectors table.
         RETURNS (Vectors): The newly created object.
         """
+        self.name = name
         if data is None:
             if shape is None:
                 shape = (0,0)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0a675253ba5..95d97bbf036 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -381,7 +381,8 @@ cdef class Vocab:
                 self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
-        link_vectors_to_models(self)
+        if self.vectors.name is not None:
+            link_vectors_to_models(self)
         return self

     def to_bytes(self, **exclude):
@@ -421,6 +422,8 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
+        if self.vectors.name is not None:
+            link_vectors_to_models(self)
         return self

     def lexemes_to_bytes(self):

From 9bf6e93b3e61211468347fcfe6f3241c9d8d2e3b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 16:32:41 +0200
Subject: [PATCH 02/10] Set pretrained_vectors in begin_training

---
 spacy/pipeline.pyx         | 6 +++++-
 spacy/syntax/nn_parser.pyx | 2 --
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index c1190ca0c00..20f6ac3d5a7 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -516,6 +516,7 @@ class Tagger(Pipe):
         vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                       vocab.morphology.lemmatizer,
                                       exc=vocab.morphology.exc)
+        self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
         if self.model is True:
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
         link_vectors_to_models(self.vocab)
@@ -910,12 +911,15 @@ class TextCategorizer(Pipe):
         self.labels.append(label)
         return 1

-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+                       **kwargs):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
         else:
             token_vector_width = 64
         if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
         link_vectors_to_models(self.vocab)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index c12b733e773..3e3ecbf3105 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -896,7 +896,6 @@ cdef class Parser:
             # TODO: Remove this once we don't have to handle previous models
             if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                 self.cfg['pretrained_vectors'] = self.vocab.vectors.name
-            print("Create parser model", self.cfg)
             path = util.ensure_path(path)
             if self.model is True:
                 self.model, cfg = self.Model(**self.cfg)
@@ -944,7 +943,6 @@ cdef class Parser:
             # TODO: Remove this once we don't have to handle previous models
             if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                 self.cfg['pretrained_vectors'] = self.vocab.vectors.name
-            print("Create parser model", self.cfg)
             if self.model is True:
                 self.model, cfg = self.Model(**self.cfg)
             else:

From 17c3e7efa2e39bc53b6faec1750a810f9a9c0ea0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 16:33:43 +0200
Subject: [PATCH 03/10] Add message noting vectors

---
 spacy/cli/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index be5be0f0bb2..344d8c0b695 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -93,6 +93,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     meta['pipeline'] = pipeline
     nlp.meta.update(meta)
     if vectors:
+        print("Load vectors model", vectors)
         util.load_model(vectors, vocab=nlp.vocab)
         for lex in nlp.vocab:
             values = {}

From 79dc241caa538524693da4af35c1875aaa8a1bf9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 17:35:07 +0200
Subject: [PATCH 04/10] Set pretrained_vectors in parser cfg

---
 spacy/syntax/nn_parser.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 3e3ecbf3105..458bf4d2247 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -255,8 +255,9 @@ cdef class Parser:
             raise ValueError("Currently history size is hard-coded to 0")
         if hist_width != 0:
             raise ValueError("Currently history width is hard-coded to 0")
+        pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
-                          pretrained_vectors=cfg.get('pretrained_vectors', None))
+                          pretrained_vectors=pretrained_vectors)
         tok2vec = chain(tok2vec, flatten)
         lower = PrecomputableAffine(hidden_width,
                                     nF=cls.nr_feature, nI=token_vector_width,
@@ -275,6 +276,7 @@ cdef class Parser:
             'token_vector_width': token_vector_width,
             'hidden_width': hidden_width,
             'maxout_pieces': parser_maxout_pieces,
+            'pretrained_vectors': pretrained_vectors,
             'hist_size': hist_size,
             'hist_width': hist_width
         }

From bc4afa988164bb1c50a3f1626c397fdb43de6966 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 17:48:37 +0200
Subject: [PATCH 05/10] Remove print statement

---
 spacy/pipeline.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 20f6ac3d5a7..83535924f84 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -533,7 +533,6 @@ class Tagger(Pipe):
                 "for loading pre-trained vectors, and the class has been "
                 "passed the old name (pretrained_dims) but not the new name "
                 "(pretrained_vectors)")
-        print(cfg)
         return build_tagger_model(n_tags, **cfg)

     def add_label(self, label, values=None):

From fd9e259414cef76855002af5a6bc22bb3fea9a37 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 18:22:51 +0200
Subject: [PATCH 06/10] Add test for #1660

---
 spacy/tests/regression/test_issue1660.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1660.py

diff --git a/spacy/tests/regression/test_issue1660.py b/spacy/tests/regression/test_issue1660.py
new file mode 100644
index 00000000000..d46de046563
--- /dev/null
+++ b/spacy/tests/regression/test_issue1660.py
@@ -0,0 +1,12 @@
+from __future__ import unicode_literals
+import pytest
+from ...util import load_model
+
+@pytest.mark.models("en_core_web_md")
+@pytest.mark.models("es_core_news_md")
+def test_models_with_different_vectors():
+    nlp = load_model('en_core_web_md')
+    doc = nlp(u'hello world')
+    nlp2 = load_model('es_core_news_md')
+    doc2 = nlp2(u'hola')
+    doc = nlp(u'hello world')

From f8dd905a24b964aca33a36bb150150374d857177 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 18:24:53 +0200
Subject: [PATCH 07/10] Warn and fallback if vectors have no name

---
 spacy/_ml.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index d4f0e8bef43..2d366af74ae 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -243,8 +243,9 @@ def predict(ids, tokvecs):
 def link_vectors_to_models(vocab):
     vectors = vocab.vectors
     if vectors.name is None:
-        raise ValueError(
-            "Unnamed vectors -- this won't allow multiple vectors "
+        vectors.name = VECTORS_KEY
+        print(
+            "Warning: Unnamed vectors -- this won't allow multiple vectors "
             "models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
     ops = Model.ops
     for word in vocab:

From 4555e3e251ed1923c29cd1c08a429397ca6fa6dd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 20:12:45 +0200
Subject: [PATCH 08/10] Dont assume pretrained_vectors cfg set in build_tagger

---
 spacy/_ml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 2d366af74ae..7a76405fbe5 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -438,7 +438,7 @@ def build_tagger_model(nr_class, **cfg):
         token_vector_width = cfg['token_vector_width']
     else:
         token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_vectors = cfg['pretrained_vectors']
+    pretrained_vectors = cfg.get('pretrained_vectors')
     with Model.define_operators({'>>': chain, '+': add}):
         if 'tok2vec' in cfg:
             tok2vec = cfg['tok2vec']

From cf5fcf0546b751cbee56bf88b4c47faed60133b4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 20:12:53 +0200
Subject: [PATCH 09/10] Update serialization test

---
 spacy/tests/serialize/test_serialize_language.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 1fcf8ef18a5..9b6a011c98d 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -17,6 +17,7 @@ def meta_data():
         'email': 'email-in-fixture',
         'url': 'url-in-fixture',
         'license': 'license-in-fixture',
+        'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
     }


From a7c5ae2bebe7053736650e4722eb2ad5db68fb31 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Mar 2018 21:08:58 +0200
Subject: [PATCH 10/10] Avoid forcing a name on empty vectors, and remove print statement

---
 spacy/language.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index fe3c574a19c..cb37b62fcfd 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -695,6 +695,8 @@ def _fix_pretrained_vectors_name(nlp):
     # data
     if 'vectors' in nlp.meta and nlp.meta['vectors'].get('name'):
         nlp.vocab.vectors.name = nlp.meta['vectors']['name']
+    elif not nlp.vocab.vectors.size:
+        nlp.vocab.vectors.name = None
     elif 'name' in nlp.meta and 'lang' in nlp.meta:
         vectors_name = '%s_%s.vectors' % (nlp.meta['lang'], nlp.meta['name'])
         nlp.vocab.vectors.name = vectors_name
@@ -706,7 +708,6 @@ def _fix_pretrained_vectors_name(nlp):
         if proc.cfg.get('pretrained_dims'):
             assert nlp.vocab.vectors.name
             proc.cfg['pretrained_vectors'] = nlp.vocab.vectors.name
-        print(proc.cfg)


 class DisabledPipes(list):
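
Usage note (editor's addition, not part of the patch series): a minimal sketch of the
behaviour these commits aim to enable, assuming the en_core_web_md and es_core_news_md
packages are installed, as in the regression test added in PATCH 06. The vectors names
mentioned in the comments follow the {lang}_{name}.vectors default described in PATCH 01
and are illustrative only.

    import spacy

    # Load two packages that each ship their own pre-trained vectors. Before this
    # patch series, both tables were registered under the same key in Thinc's
    # static-vectors store, so loading a second vectors model could produce errors
    # or incorrect behaviour (issue #1660).
    nlp_en = spacy.load('en_core_web_md')
    nlp_es = spacy.load('es_core_news_md')

    # Each vectors table now carries its own name (e.g. 'en_core_web_md.vectors'
    # and 'es_core_news_md.vectors'), so the two pipelines can coexist.
    assert nlp_en.vocab.vectors.name != nlp_es.vocab.vectors.name

    doc_en = nlp_en(u'hello world')
    doc_es = nlp_es(u'hola mundo')

    # Components whose models were built against the pre-trained vectors record the
    # table they expect under the cfg key 'pretrained_vectors' (replacing the old
    # 'pretrained_dims' key).
    for name, proc in nlp_en.pipeline:
        if hasattr(proc, 'cfg'):
            print(name, proc.cfg.get('pretrained_vectors'))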