
Commit

Merge pull request NTMC-Community#62 from faneshion/dev_tian
Support python3 data_preprocess for QuoraQP
faneshion committed Mar 7, 2018
2 parents 46b737f + 6df1ffd commit 99b9a6a
Showing 3 changed files with 19 additions and 12 deletions.
12 changes: 9 additions & 3 deletions data/QuoraQP/prepare_mz_data.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding: utf-8

+from __future__ import print_function

 import sys
 import random
@@ -14,19 +14,24 @@
 from preparation import Preparation
 from preprocess import Preprocess, NgramUtil


 def read_dict(infile):
     word_dict = {}
     for line in open(infile):
         r = line.strip().split()
         word_dict[r[1]] = r[0]
     return word_dict


 def read_doc(infile):
     doc = {}
     for line in open(infile):
         r = line.strip().split()
         doc[r[0]] = r[1:]
         #assert len(doc[r[0]]) == int(r[1])
     return doc


 def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
     tri_dict = {}
     tri_stats = sorted(tri_stats.items(), key=lambda d:d[1], reverse=True)
@@ -36,6 +41,7 @@ def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
         tri_dict[triinfo[0]] = len(tri_dict)
     return tri_dict

+
 if __name__ == '__main__':
     prepare = Preparation()
     srcdir = './'
@@ -82,9 +88,9 @@ def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
     triletter_dict = filter_triletter(triletter_stats, 5, 10000)
     with open(triletter_dict_output, 'w') as f:
         for tri_id, tric in triletter_dict.items():
-            print >> f, tri_id, tric
+            print(tri_id, tric, file=f)
     with open(word_triletter_output, 'w') as f:
         for wid, trics in word_triletter_map.items():
-            print >> f, wid, ' '.join([str(triletter_dict[k]) for k in trics if k in triletter_dict])
+            print(wid, ' '.join([str(triletter_dict[k]) for k in trics if k in triletter_dict]), file=f)

     print('Triletter Processing finished ...')
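The pattern in this file is the standard two-to-three print migration: importing print_function from __future__ makes print a function under Python 2 as well, so print(..., file=f) replaces the old "print >> f, ..." statement and runs unchanged on both interpreters. A minimal sketch of the same pattern (the output file name and sample dict below are hypothetical, not from the repository):

from __future__ import print_function

# stands in for the output of filter_triletter(); sample data only
triletter_dict = {'#ab': 0, 'ab#': 1}
with open('triletter_dict.txt', 'w') as f:
    for tri_id, tric in triletter_dict.items():
        # Python 2 only:  print >> f, tri_id, tric
        # Python 2 (with the import above) and Python 3:
        print(tri_id, tric, file=f)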
15 changes: 8 additions & 7 deletions data/QuoraQP/run_data.sh
@@ -13,21 +13,22 @@ wget --load-cookies=cookies.txt "https://www.kaggle.com/c/quora-question-pairs/d
 unzip test.csv.zip
 !EOF!

-#You can also download and unzip it manually on the official web, and save it to the current directory
+# You can also download and unzip it manually on the official web, and save it to the current directory

 # download the glove vectors
-wget http://nlp.stanford.edu/data/glove.840B.300d.zip
-unzip glove.840B.300d.zip
-wget http://nlp.stanford.edu/data/glove.6B.zip
-unzip glove.6B.zip
+# wget http://nlp.stanford.edu/data/glove.840B.300d.zip
+# unzip glove.840B.300d.zip
+# wget http://nlp.stanford.edu/data/glove.6B.zip
+# unzip glove.6B.zip

 # generate the mz-datasets
 python prepare_mz_data.py

 # generate word embedding
-python gen_w2v.py glove.840B.300d.txt word_dict.txt embed_glove_d300
+GLOVE='.'
+python gen_w2v.py $GLOVE/glove.840B.300d.txt word_dict.txt embed_glove_d300
 python norm_embed.py embed_glove_d300 embed_glove_d300_norm
-python gen_w2v.py glove.6B.50d.txt word_dict.txt embed_glove_d50
+python gen_w2v.py $GLOVE/glove.6B.50d.txt word_dict.txt embed_glove_d50
 python norm_embed.py embed_glove_d50 embed_glove_d50_norm

 # generate idf file
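After this change the script expects the GloVe archives to be fetched manually (the wget lines are commented out) and reads them from $GLOVE, which defaults to the current directory. For orientation, a rough sketch of what a gen_w2v-style filtering step does; this is an assumption about the step's purpose, not the repository's gen_w2v.py. It keeps only the GloVe rows whose word appears in word_dict.txt and writes "id vector" lines, assuming GloVe's plain-text "word v1 v2 ... vN" format and a "word id" column order in word_dict.txt (the column order is itself an assumption):

def filter_glove(glove_path, dict_path, out_path):
    # word -> id, assuming one 'word id' pair per line in word_dict.txt
    with open(dict_path) as f:
        word2id = dict(line.split()[:2] for line in f if line.strip())
    with open(glove_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            parts = line.rstrip('\n').split(' ')   # GloVe: word v1 v2 ... vN
            if parts[0] in word2id:
                fout.write(word2id[parts[0]] + ' ' + ' '.join(parts[1:]) + '\n')

filter_glove('./glove.6B.50d.txt', 'word_dict.txt', 'embed_glove_d50')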
4 changes: 2 additions & 2 deletions matchzoo/inputs/preparation.py
@@ -55,12 +55,12 @@ def run_with_one_corpus_for_quora(self, file_path):
         # hashid = {}
         corpus = {}
         rels = []
-        f = open(file_path, 'r')
+        f = codecs.open(file_path, 'r', encoding='utf8')
         next(f)
         for line in f:
             # print("", i)
             # print("", i)
-            line = line.decode('utf8')
+            # line = line.decode('utf8')
             line = line.strip()
             qid1, qid2, q1, q2, label = self.parse_line_for_quora(line, "\t")
             if q1 != 0:
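The two changes here are really one fix: codecs.open decodes the file at read time, so each line is already decoded text (unicode under Python 2, str under Python 3), and the old line.decode('utf8') call, which would raise AttributeError on a Python 3 str, can be dropped. A minimal sketch of the behaviour, with a hypothetical questions.tsv standing in for the Quora file:

import codecs

with codecs.open('questions.tsv', 'r', encoding='utf8') as f:
    next(f)                    # skip the header row, as run_with_one_corpus_for_quora does
    for line in f:
        line = line.strip()    # already decoded text on both Python 2 and 3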
