
Commit

Merge pull request NTMC-Community#62 from faneshion/dev_tian
Support python3 data_preprocess for QuoraQP
faneshion committed Mar 7, 2018
2 parents 46b737f + 6df1ffd commit 99b9a6a
Showing 3 changed files with 19 additions and 12 deletions.
12 changes: 9 additions & 3 deletions data/QuoraQP/prepare_mz_data.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding: utf-8

+from __future__ import print_function

 import sys
 import random
@@ -14,19 +14,24 @@
 from preparation import Preparation
 from preprocess import Preprocess, NgramUtil


 def read_dict(infile):
     word_dict = {}
     for line in open(infile):
         r = line.strip().split()
         word_dict[r[1]] = r[0]
     return word_dict


 def read_doc(infile):
     doc = {}
     for line in open(infile):
         r = line.strip().split()
         doc[r[0]] = r[1:]
         #assert len(doc[r[0]]) == int(r[1])
     return doc


 def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
     tri_dict = {}
     tri_stats = sorted(tri_stats.items(), key=lambda d:d[1], reverse=True)
@@ -36,6 +41,7 @@ def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
         tri_dict[triinfo[0]] = len(tri_dict)
     return tri_dict

+
 if __name__ == '__main__':
     prepare = Preparation()
     srcdir = './'
@@ -82,9 +88,9 @@ def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
     triletter_dict = filter_triletter(triletter_stats, 5, 10000)
     with open(triletter_dict_output, 'w') as f:
         for tri_id, tric in triletter_dict.items():
-            print >> f, tri_id, tric
+            print(tri_id, tric, file=f)
     with open(word_triletter_output, 'w') as f:
         for wid, trics in word_triletter_map.items():
-            print >> f, wid, ' '.join([str(triletter_dict[k]) for k in trics if k in triletter_dict])
+            print(wid, ' '.join([str(triletter_dict[k]) for k in trics if k in triletter_dict]), file=f)

     print('Triletter Processing finished ...')
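The pattern in this file is the standard two-to-three print migration: importing print_function from __future__ makes print a function under Python 2 as well, so print(..., file=f) replaces the old "print >> f, ..." statement and runs unchanged on both interpreters. A minimal sketch of the same pattern (the output file name and sample dict below are hypothetical, not from the repository):

from __future__ import print_function

# stands in for the output of filter_triletter(); sample data only
triletter_dict = {'#ab': 0, 'ab#': 1}
with open('triletter_dict.txt', 'w') as f:
    for tri_id, tric in triletter_dict.items():
        # Python 2 only:  print >> f, tri_id, tric
        # Python 2 (with the import above) and Python 3:
        print(tri_id, tric, file=f)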
15 changes: 8 additions & 7 deletions data/QuoraQP/run_data.sh
@@ -13,21 +13,22 @@ wget --load-cookies=cookies.txt "https://www.kaggle.com/c/quora-question-pairs/d
 unzip test.csv.zip
 !EOF!

-#You can also download and unzip it manually on the official web, and save it to the current directory
+# You can also download and unzip it manually on the official web, and save it to the current directory

 # download the glove vectors
-wget http://nlp.stanford.edu/data/glove.840B.300d.zip
-unzip glove.840B.300d.zip
-wget http://nlp.stanford.edu/data/glove.6B.zip
-unzip glove.6B.zip
+# wget http://nlp.stanford.edu/data/glove.840B.300d.zip
+# unzip glove.840B.300d.zip
+# wget http://nlp.stanford.edu/data/glove.6B.zip
+# unzip glove.6B.zip

 # generate the mz-datasets
 python prepare_mz_data.py

 # generate word embedding
-python gen_w2v.py glove.840B.300d.txt word_dict.txt embed_glove_d300
+GLOVE='.'
+python gen_w2v.py $GLOVE/glove.840B.300d.txt word_dict.txt embed_glove_d300
 python norm_embed.py embed_glove_d300 embed_glove_d300_norm
-python gen_w2v.py glove.6B.50d.txt word_dict.txt embed_glove_d50
+python gen_w2v.py $GLOVE/glove.6B.50d.txt word_dict.txt embed_glove_d50
 python norm_embed.py embed_glove_d50 embed_glove_d50_norm

 # generate idf file
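After this change the script expects the GloVe archives to be fetched manually (the wget lines are commented out) and reads them from $GLOVE, which defaults to the current directory. For orientation, a rough sketch of what a gen_w2v-style filtering step does; this is an assumption about the step's purpose, not the repository's gen_w2v.py. It keeps only the GloVe rows whose word appears in word_dict.txt and writes "id vector" lines, assuming GloVe's plain-text "word v1 v2 ... vN" format and a "word id" column order in word_dict.txt (the column order is itself an assumption):

def filter_glove(glove_path, dict_path, out_path):
    # word -> id, assuming one 'word id' pair per line in word_dict.txt
    with open(dict_path) as f:
        word2id = dict(line.split()[:2] for line in f if line.strip())
    with open(glove_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            parts = line.rstrip('\n').split(' ')   # GloVe: word v1 v2 ... vN
            if parts[0] in word2id:
                fout.write(word2id[parts[0]] + ' ' + ' '.join(parts[1:]) + '\n')

filter_glove('./glove.6B.50d.txt', 'word_dict.txt', 'embed_glove_d50')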
4 changes: 2 additions & 2 deletions matchzoo/inputs/preparation.py
@@ -55,12 +55,12 @@ def run_with_one_corpus_for_quora(self, file_path):
         # hashid = {}
         corpus = {}
         rels = []
-        f = open(file_path, 'r')
+        f = codecs.open(file_path, 'r', encoding='utf8')
         next(f)
         for line in f:
             # print("", i)
             # print("", i)
-            line = line.decode('utf8')
+            # line = line.decode('utf8')
             line = line.strip()
             qid1, qid2, q1, q2, label = self.parse_line_for_quora(line, "\t")
             if q1 != 0:
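The two changes here are really one fix: codecs.open decodes the file at read time, so each line is already decoded text (unicode under Python 2, str under Python 3), and the old line.decode('utf8') call, which would raise AttributeError on a Python 3 str, can be dropped. A minimal sketch of the behaviour, with a hypothetical questions.tsv standing in for the Quora file:

import codecs

with codecs.open('questions.tsv', 'r', encoding='utf8') as f:
    next(f)                    # skip the header row, as run_with_one_corpus_for_quora does
    for line in f:
        line = line.strip()    # already decoded text on both Python 2 and 3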
