Merge branch 'release-0.12.1rc1'
piskvorky committed Jul 20, 2015
2 parents 451d94f + 93f6385 commit b9121d0
Showing 17 changed files with 703 additions and 533 deletions.
4 changes: 1 addition & 3 deletions .travis.yml
@@ -1,3 +1,4 @@
sudo: false
language: python
python:
- "2.6"
@@ -10,9 +11,6 @@ before_install:
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda/bin:$PATH
- conda update --yes conda
# The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda
- sudo rm -rf /dev/shm
- sudo ln -s /run/shm /dev/shm
install:
- conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy
- source activate gensim-test
16 changes: 13 additions & 3 deletions CHANGELOG.txt
@@ -1,14 +1,23 @@
Changes
=======

0.11.2, 05/07/2015
0.12.1, 20/07/2015

* improvements to testing, switch to Travis CI containers
* support for loading old word2vec models (<=0.11.1) in 0.12+ (Gordon Mohr, #405)
* various bug fixes to word2vec, doc2vec (Gordon Mohr, #393, #386, #404)
* TextSummarization support for very short texts (Federico Barrios, #390)
* support for word2vec[['word1', 'word2'...]] convenience API calls (Satish Palaniappan, #395); see the sketch below
* MatrixSimilarity supports indexing generator corpora (single pass)
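
A minimal sketch of the list-indexing convenience and old-model loading noted above (the corpus and filename here are hypothetical):

    from gensim.models import Word2Vec

    sentences = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system']]
    model = Word2Vec(sentences, min_count=1)

    vectors = model[['human', 'computer']]  # new list indexing: returns a 2D array, one row per word
    # old_model = Word2Vec.load('word2vec_0.11.model')  # models saved with gensim <= 0.11.1 still load in 0.12+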

0.12.0, 06/07/2015

* complete API, performance, memory overhaul of doc2vec (Gordon Mohr, #356, #373, #380, #384)
- fast infer_vector(); optional memory-mapped doc vectors; memory savings with int doc IDs
- 'dbow_words' for combined DBOW & word skip-gram training; new 'dm_concat' mode
- multithreading & negative-sampling optimizations (also benefitting word2vec)
- API NOTE: doc vectors must now be accessed/compared through model's 'docvecs' field
(eg: "model.docvecs['my_ID']" or "model.docvecs.most_similar('my_ID')")
- https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
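
A minimal sketch of the 'docvecs' access pattern and infer_vector() described above (documents and tags are hypothetical):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    docs = [TaggedDocument(words=['human', 'interface', 'computer'], tags=['SENT_0']),
            TaggedDocument(words=['survey', 'user', 'computer', 'system'], tags=['SENT_1'])]
    model = Doc2Vec(docs, min_count=1)

    vec = model.docvecs['SENT_0']                    # trained doc vector, looked up by string tag
    sims = model.docvecs.most_similar('SENT_0')      # most similar training docs
    new_vec = model.infer_vector(['computer', 'survey', 'interface'])  # vector for an unseen document
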
* new "text summarization" module (PR #324: Federico Lopez, Federico Barrios)
- https://github.com/summanlp/docs/raw/master/articulo/articulo-en.pdf
@@ -18,6 +27,7 @@ Changes
- https://arxiv.org/abs/1504.07295
- https://nbviewer.ipython.org/github/taddylab/deepir/blob/master/w2v-inversion.ipynb
* word2vec supports "encoding" parameter when loading from C format, for non-utf8 models
* more memory-efficient word2vec training (#385)
* fixes to Python3 compatibility (Pavel Kalaidin #330, S-Eugene #369)
* enhancements to save/load format (Liang Bo Wang #363, Gordon Mohr #356)
- pickle defaults to protocol=2 for better py3 compatibility
20 changes: 10 additions & 10 deletions docs/notebooks/doc2vec-IMDB.ipynb
@@ -393,7 +393,7 @@
" best_indicator = '*' \n",
" print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, err, epoch + 1, name, duration, eval_duration))\n",
"\n",
" if (epoch % 5) == 0:\n",
" if ((epoch + 1) % 5) == 0 or epoch == 0:\n",
" eval_duration = ''\n",
" with elapsed_timer() as eval_elapsed:\n",
" infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)\n",
@@ -1538,10 +1538,10 @@
"doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc, re-run cell for more examples\n",
"model = random.choice(simple_models) # and a random model\n",
"sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count) # get *all* similar documents\n",
"print('TARGET (%d): \u00ab%s\u00bb\\n' % (doc_id, ' '.join(alldocs[doc_id].words)))\n",
"print('SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
"print(u'TARGET (%d): \u00ab%s\u00bb\\n' % (doc_id, ' '.join(alldocs[doc_id].words)))\n",
"print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
"for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
" print('%s %s: \u00ab%s\u00bb\\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))\n"
" print(u'%s %s: \u00ab%s\u00bb\\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))\n"
],
"language": "python",
"metadata": {},
@@ -1602,8 +1602,8 @@
" word = random.choice(word_models[0].index2word)\n",
" if word_models[0].vocab[word].count > 10:\n",
" break\n",
"# or just pick a word from the relevant domain:\n",
"word = 'comedy/drama'\n",
"# or uncomment below line, to just pick a word from the relevant domain:\n",
"#word = 'comedy/drama'\n",
"similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\\n') for model in word_models]\n",
"similar_table = (\"<table><tr><th>\" +\n",
" \"</th><th>\".join([str(model) for model in word_models]) + \n",
@@ -1721,7 +1721,7 @@
"# note: this takes many minutes\n",
"for model in word_models:\n",
" sections = model.accuracy('questions-words.txt')\n",
" correct, incorrect = (len(sum((s['correct'] for s in sections), [])), len(sum((s['incorrect'] for s in sections),[])))\n",
" correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])\n",
" print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct*100)/(correct+incorrect), correct, correct+incorrect))"
],
"language": "python",
@@ -1731,16 +1731,16 @@
"output_type": "stream",
"stream": "stdout",
"text": [
"Doc2Vec(dm/c,d100,n5,w5,mc2,t8): 28.70% correct (5746 of 20024)\n",
"Doc2Vec(dbow,d100,n5,mc2,t8): 0.01% correct (2 of 20024)"
"Doc2Vec(dm/c,d100,n5,w5,mc2,t8): 28.70% correct (2873 of 10012)\n",
"Doc2Vec(dbow,d100,n5,mc2,t8): 0.01% correct (1 of 10012)"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Doc2Vec(dm/m,d100,n5,w10,mc2,t8): 27.24% correct (5454 of 20024)"
"Doc2Vec(dm/m,d100,n5,w10,mc2,t8): 27.24% correct (2727 of 10012)"
]
},
{
4 changes: 2 additions & 2 deletions docs/src/conf.py
@@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
version = '0.12.0'
version = '0.12.1'
# The full version, including alpha/beta/rc tags.
release = '0.12.0'
release = '0.12.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
6 changes: 2 additions & 4 deletions gensim/corpora/dictionary.py
@@ -155,16 +155,14 @@ def doc2bow(self, document, allow_update=False, return_missing=False):

token2id = self.token2id
if allow_update or return_missing:
missing = dict((w, freq) for w, freq in iteritems(counter)
if w not in token2id)
missing = dict((w, freq) for w, freq in iteritems(counter) if w not in token2id)
if allow_update:
for w in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)

result = dict((token2id[w], freq) for w, freq in iteritems(counter)
if w in token2id)
result = dict((token2id[w], freq) for w, freq in iteritems(counter) if w in token2id)

if allow_update:
self.num_docs += 1
89 changes: 57 additions & 32 deletions gensim/models/doc2vec.py
@@ -48,7 +48,7 @@

from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \
repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \
sqrt, newaxis, ndarray, dot, vstack
sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide

from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.models.word2vec import Word2Vec, Vocab, train_cbow_pair, train_sg_pair, train_sentence_sg
@@ -112,8 +112,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
method implements the DM model with a projection (input) layer that is
either the sum or mean of the context vectors, depending on the model's
`dm_mean` configuration field. See `train_dm_concat()` for the DM model
with a concatenated input layer.
`dm_mean` configuration field. See `train_document_dm_concat()` for the DM
model with a concatenated input layer.
The document is provided as `doc_words`, a list of word tokens which are looked up
in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
@@ -137,28 +137,27 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
doctag_locks = model.docvecs.doctag_syn0_lockf

word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
model.vocab[w].sample_int > model.random.randint(2**32)]
doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
doctag_len = len(doctag_indexes)
model.vocab[w].sample_int > model.random.rand() * 2**32]

for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
start = max(0, pos - model.window + reduced_window)
window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum # 1 x layer1_size
if word2_indexes and model.cbow_mean:
l1 /= (len(word2_indexes) + doctag_len)
word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
count = len(word2_indexes) + len(doctag_indexes)
if model.cbow_mean and count > 1:
l1 /= count
neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
learn_vectors=False, learn_hidden=learn_hidden)
if word2_indexes and not model.cbow_mean:
neu1e /= (len(word2_indexes) + doctag_len)
if not model.cbow_mean and count > 1:
neu1e /= count
if learn_doctags:
doctag_vectors[doctag_indexes] += neu1e * \
np_repeat(doctag_locks[doctag_indexes], model.vector_size).reshape(-1, model.vector_size)
for i in doctag_indexes:
doctag_vectors[i] += neu1e * doctag_locks[i]
if learn_words:
word_vectors[word2_indexes] += neu1e * \
np_repeat(word_locks[word2_indexes], model.vector_size).reshape(-1, model.vector_size)
for i in word2_indexes:
word_vectors[i] += neu1e * word_locks[i]

return len(word_vocabs)

@@ -193,7 +192,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
doctag_locks = model.docvecs.doctag_syn0_lockf

word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
model.vocab[w].sample_int > model.random.randint(2**32)]
model.vocab[w].sample_int > model.random.rand() * 2**32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)
@@ -262,8 +261,8 @@ class DocvecsArray(utils.SaveLoad):
>>> docvec = d2v_model.docvecs[99]
>>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training
>>> sims = d2v_model.docvecs.most_similar(99)
>>> sims = d2v_model.docvecs.most_similar('SENT_99'))
>>> sims = d2v_model.docvecs.most_similar(docvec))
>>> sims = d2v_model.docvecs.most_similar('SENT_99')
>>> sims = d2v_model.docvecs.most_similar(docvec)
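>>> vecs = d2v_model.docvecs[['SENT_0', 'SENT_99']]  # a list of tags returns a 2D array, one row per tag (list indexing added in this release)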
If only plain int tags are presented during training, the dict (of
string tag -> index) and list (of index -> string tag) stay empty,
@@ -320,7 +319,19 @@ def _key_index(self, i_index, missing=None):
return i_index

def __getitem__(self, index):
return self.doctag_syn0[self._int_index(index)]
"""
Accept a single key (int or string tag) or list of keys as input.
If a single string or int, return designated tag's vector
representation, as a 1D numpy array.
If a list, return designated tags' vector representations as a
2D numpy array: #tags x #vector_size.
"""
if isinstance(index, string_types + (int,)):
return self.doctag_syn0[self._int_index(index)]

return vstack([self[i] for i in index])

def __len__(self):
return self.count
@@ -378,10 +389,14 @@ def init_sims(self, replace=False):
self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1))
self.doctag_syn0norm = self.doctag_syn0
else:
self.doctag_syn0norm = (self.doctag_syn0 /
sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
if self.mapfile_path:
self.doctag_syn0norm = np_memmap(self.mapfile_path+'.doctag_syn0norm', dtype=REAL,
mode='w+', shape=self.doctag_syn0.shape)
else:
self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL)
np_divide(self.doctag_syn0, sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm)

def most_similar(self, positive=[], negative=[], topn=10):
def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None):
"""
Find the top-N most similar docvecs known from training. Positive docs contribute
positively towards the similarity, negative docs negatively.
@@ -390,8 +405,13 @@ def most_similar(self, positive=[], negative=[], topn=10):
weight vectors of the given docs. Docs may be specified as vectors, integer indexes
of trained docvecs, or if the documents were originally presented with string tags,
by the corresponding tags.
The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous
range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
there was chosen to be significant, such as more popular tag IDs in lower indexes.)
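
Example (tag names hypothetical)::
sims = d2v_model.docvecs.most_similar('SENT_0', clip_start=0, clip_end=1000)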
"""
self.init_sims()
clip_end = clip_end or len(self.doctag_syn0norm)

if isinstance(positive, string_types + integer_types) and not negative:
# allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
Expand All @@ -417,7 +437,7 @@ def most_similar(self, positive=[], negative=[], topn=10):
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

dists = dot(self.doctag_syn0norm, mean)
dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
@@ -589,7 +609,7 @@ def scan_vocab(self, documents, progress_per=10000):
if document_no % progress_per == 0:
interval_rate = (total_words - interval_count) / (default_timer() - interval_start)
logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags",
document_no, sum(itervalues(vocab)) + total_words, interval_rate, len(vocab), len(self.docvecs))
document_no, total_words, interval_rate, len(vocab), len(self.docvecs))
interval_start = default_timer()
interval_count = total_words
document_length = len(document.words)
@@ -599,12 +619,12 @@

for word in document.words:
vocab[word] += 1
total_words += len(document.words)

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
total_words += utils.prune_vocab(vocab, min_reduce)
utils.prune_vocab(vocab, min_reduce)
min_reduce += 1

total_words += sum(itervalues(vocab))
logger.info("collected %i word types and %i unique tags from a corpus of %i examples and %i words",
len(vocab), len(self.docvecs), document_no + 1, total_words)
self.corpus_count = document_no + 1
Expand All @@ -613,6 +633,7 @@ def scan_vocab(self, documents, progress_per=10000):
def _do_train_job(self, job, alpha, inits):
work, neu1 = inits
tally = 0
raw_tally = 0
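# tally: words effectively trained in this job; raw_tally: all words seen in the job's documents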
for doc in job:
indexed_doctags = self.docvecs.indexed_doctags(doc.tags)
doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
@@ -626,8 +647,12 @@ def _do_train_job(self, job, alpha, inits):
else:
tally += train_document_dm(self, doc.words, doctag_indexes, alpha, work, neu1,
doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
raw_tally += len(doc.words)
self.docvecs.trained_item(indexed_doctags)
return tally
return (tally, raw_tally)

def _raw_word_count(self, items):
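"""Return the number of words in a given job's documents, before any vocabulary filtering."""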
return sum(len(item.words) for item in items)

def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
"""
@@ -661,12 +686,12 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):

return doctag_vectors[0]

def estimate_memory(self, vocab_size=None):
def estimate_memory(self, vocab_size=None, report=None):
"""Estimate required memory for a model using current settings."""
report = super(Doc2Vec, self).estimate_memory(vocab_size)
report = report or {}
report['doctag_lookup'] = self.docvecs.estimated_lookup_memory()
report['doctag_syn0'] = self.docvecs.count * self.vector_size * 4
return report
report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)

def __str__(self):
"""Abbreviated name reflecting major configuration paramaters."""
12 changes: 6 additions & 6 deletions gensim/models/ldamodel.py
@@ -314,8 +314,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.update(corpus)

def __str__(self):
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s, alpha=%s)" % \
(self.num_terms, self.num_topics, self.decay, self.chunksize, self.alpha)
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.decay, self.chunksize)

def sync_state(self):
self.expElogbeta = numpy.exp(self.state.get_Elogbeta())
@@ -840,10 +840,10 @@ def save(self, fname, *args, **kwargs):
Note: If you intend to use models across Python 2/3 versions there are a few things to
keep in mind:
1. The pickled Python dictionaries will not work across Python versions
2. The `save` method does not automatically save all NumPy arrays using NumPy, only
those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
concern here is the `alpha` array if for instance using `alpha='auto'`.
1. The pickled Python dictionaries will not work across Python versions
2. The `save` method does not automatically save all NumPy arrays using NumPy, only
those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
concern here is the `alpha` array if for instance using `alpha='auto'`.
Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
for an example on how to work around these issues.
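A minimal save/load round trip (path hypothetical); the caveats above matter when the load happens under a different Python major version::
lda.save('/tmp/lda.model')
lda = LdaModel.load('/tmp/lda.model')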