Skip to content

Commit

Permalink
prepare corpus for free
Browse files Browse the repository at this point in the history
  • Loading branch information
koth committed Jul 28, 2017
1 parent c39a88a commit 3afa292
Show file tree
Hide file tree
Showing 8 changed files with 6,435 additions and 5,815 deletions.
11,852 changes: 6,182 additions & 5,670 deletions kcws/models/basic_vocab.txt

Large diffs are not rendered by default.

Binary file modified kcws/models/seg_model.pbtxt
Binary file not shown.
7 changes: 7 additions & 0 deletions kcws/train/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,10 @@ py_binary(
data=['//utils:w2v.so'],
imports=['../../utils']
)

# Training-data generator for pre-segmented ("free") corpora; pairs with
# generate_training.py which handles the PFR-style tagged corpus.
py_binary(
    name = "generate_train_free",
    srcs = ["generate_train_free.py"],
    # Native word2vec vocabulary bindings loaded at runtime via `import w2v`.
    data = ["//utils:w2v.so"],
    # Make utils/ importable so `import w2v` resolves to the .so above.
    imports = ["../../utils"],
)
7 changes: 5 additions & 2 deletions kcws/train/filter_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def main(argc, argv):
if not line:
continue
ss = line.split(' ')
assert (len(ss) == (2 * SENTENCE_LEN))

if len(ss) != (2 * SENTENCE_LEN):
print("len is:%d" % (len(ss)))
continue
numV = 0
for i in range(SENTENCE_LEN):
if int(ss[i]) != 0:
Expand All @@ -48,4 +51,4 @@ def main(argc, argv):


if __name__ == '__main__':
main(len(sys.argv), sys.argv)
main(len(sys.argv), sys.argv)
83 changes: 83 additions & 0 deletions kcws/train/generate_train_free.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# File: generate_train_free.py
# Project: /e/code/kcws
# Created: Thu Jul 27 2017
# Author: Koth Chen
# Copyright (c) 2017 Koth
#
# <<licensetext>>


import sys
import os
import w2v
import fire
from sentence import Sentence

totalLine = 0   # number of training rows written so far
longLine = 0    # number of sentences dropped for exceeding MAX_LEN

MAX_LEN = 80    # fixed sentence width (in characters) of each training row
totalChars = 0  # total characters across all emitted sentences


def processLine(line, vob, out):
    """Turn one pre-segmented corpus line into a fixed-width training row.

    The line is tab-separated; only the part of each field before the first
    space is treated as the token. Sentences longer than MAX_LEN characters
    are counted in longLine and dropped; otherwise a row of MAX_LEN vocab
    ids followed by MAX_LEN tag labels (zero-padded) is written to *out*.
    """
    global totalLine, longLine, totalChars

    sentence = Sentence()
    for field in line.split("\t"):
        word = field.split(" ")[0]
        sentence.addToken(unicode(word.decode('utf8')))

    if sentence.chars > MAX_LEN:
        longLine += 1
        return

    totalChars += sentence.chars
    x, y = [], []
    sentence.generate_tr_line(x, y, vob)
    assert len(x) == len(y)

    # Zero-pad both sequences out to the fixed row width.
    padding = [0] * (MAX_LEN - len(x))
    x += padding
    y += padding

    row = " ".join(str(v) for v in x[:MAX_LEN] + y[:MAX_LEN])
    out.write("%s\n" % (row))
    totalLine += 1


def doGen(inputPath, outputPath, vocabPath):
    """Generate fixed-width training rows from a pre-segmented corpus.

    Args:
        inputPath: corpus file path; one tab-separated sentence per line.
        outputPath: destination file for the training rows.
        vocabPath: word2vec vocabulary file loaded via w2v.Word2vecVocab.

    Prints the accumulated line/character counters when done.
    """
    global totalLine
    global longLine
    global totalChars
    vob = w2v.Word2vecVocab()
    vob.Load(vocabPath)
    # Stream the file instead of readlines(): avoids materializing the
    # entire corpus in memory for large inputs.
    with open(inputPath, "r") as inp, open(outputPath, "w") as out:
        for line in inp:
            line = line.strip()
            if not line:
                continue
            processLine(line, vob, out)
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))


def main():
    """CLI entry point: expose this module's functions (doGen) via python-fire."""
    fire.Fire()


if __name__ == '__main__':
    main()
242 changes: 104 additions & 138 deletions kcws/train/generate_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys
import os
import w2v
from sentence import Sentence

totalLine = 0
longLine = 0
Expand All @@ -15,151 +16,116 @@
totalChars = 0


class Setence:
    """A sentence accumulated token by token for training-row generation.

    NOTE(review): class name looks like a typo for "Sentence" but is kept
    unchanged for existing callers.
    """

    def __init__(self):
        self.tokens = []  # word tokens (unicode strings)
        self.chars = 0    # running total of characters across all tokens

    def addToken(self, t):
        """Append token *t* and account for its character count."""
        self.tokens.append(t)
        self.chars += len(t)

    def clear(self):
        """Reset to an empty sentence."""
        self.chars = 0
        self.tokens = []

    # label -1, unknown
    # 0-> 'S'
    # 1-> 'B'
    # 2-> 'M'
    # 3-> 'E'
    def generate_tr_line(self, x, y, vob):
        """Append per-character vocab ids to *x* and SBME tags to *y*.

        Single-character tokens get tag 0 (S); longer tokens get
        1 (B), 2... (M), 3 (E).
        """
        for token in self.tokens:
            size = len(token)
            for ch in token:
                x.append(vob.GetWordIndex(str(ch.encode("utf8"))))
            if size == 1:
                y.append(0)
            else:
                y.extend([1] + [2] * (size - 2) + [3])


def processToken(token, sentence, out, end, vob):
    """Consume one 'word/POS'-style token; flush the sentence when complete.

    The trailing '/POS' annotation is stripped (everything from the last '/'
    on). The bare word is added to *sentence* unless it is the full stop
    '。'. On '。' or when *end* is true, the sentence is emitted to *out* as
    a fixed-width row of MAX_LEN vocab ids plus MAX_LEN tags (zero-padded),
    or counted in longLine and dropped if it exceeds MAX_LEN characters.
    """
    global totalLine
    global longLine
    global totalChars
    global MAX_LEN
    # Scan backwards to the last '/' separating the word from its POS tag.
    nn = len(token)
    while nn > 0 and token[nn - 1] != '/':
        nn = nn - 1

    # Keep only the word part (drops the '/' itself).
    token = token[:nn - 1].strip()
    if token != '。':
        ustr = unicode(token.decode('utf8'))
        sentence.addToken(ustr)
    uline = u''  # NOTE(review): appears unused in this function
    if token == '。' or end:
        if sentence.chars > MAX_LEN:
            # Over-long sentence: count it and drop it.
            longLine += 1
        else:
            x = []
            y = []
            totalChars += sentence.chars
            sentence.generate_tr_line(x, y, vob)
            nn = len(x)
            assert (nn == len(y))
            # Zero-pad both sequences out to the fixed row width.
            for j in range(nn, MAX_LEN):
                x.append(0)
                y.append(0)
            # Row layout: MAX_LEN ids, then MAX_LEN tags, space-separated.
            line = ''
            for i in range(MAX_LEN):
                if i > 0:
                    line += " "
                line += str(x[i])
            for j in range(MAX_LEN):
                line += " " + str(y[j])
            out.write("%s\n" % (line))
            totalLine += 1
        sentence.clear()
global totalLine
global longLine
global totalChars
global MAX_LEN
nn = len(token)
while nn > 0 and token[nn - 1] != '/':
nn = nn - 1

token = token[:nn - 1].strip()
if token != '。':
ustr = unicode(token.decode('utf8'))
sentence.addToken(ustr)
uline = u''
if token == '。' or end:
if sentence.chars > MAX_LEN:
longLine += 1
else:
x = []
y = []
totalChars += sentence.chars
sentence.generate_tr_line(x, y, vob)
nn = len(x)
assert (nn == len(y))
for j in range(nn, MAX_LEN):
x.append(0)
y.append(0)
line = ''
for i in range(MAX_LEN):
if i > 0:
line += " "
line += str(x[i])
for j in range(MAX_LEN):
line += " " + str(y[j])
out.write("%s\n" % (line))
totalLine += 1
sentence.clear()


def processLine(line, out, vob):
    """Split one corpus line into tokens and feed them to processToken.

    Tokens are space-separated, except inside '[...]' compound groups where
    spaces do not end a token; when a group closes, it is unwrapped and its
    inner tokens are processed individually. The final token of the line is
    passed with end=True so the pending sentence gets flushed.
    """
    line = line.strip()
    nn = len(line)
    seeLeftB = False  # inside an unclosed '[...]' compound group
    start = 0         # start index of the token currently being scanned
    sentence = Setence()
    try:
        for i in range(nn):
            if line[i] == ' ':
                if not seeLeftB:
                    token = line[start:i]
                    if token.startswith('['):
                        # Strip the surrounding brackets, then process each
                        # inner token of the compound group.
                        tokenLen = len(token)
                        while tokenLen > 0 and token[tokenLen - 1] != ']':
                            tokenLen = tokenLen - 1
                        token = token[1:tokenLen - 1]
                        ss = token.split(' ')
                        for s in ss:
                            processToken(s, sentence, out, False, vob)
                    else:
                        processToken(token, sentence, out, False, vob)
                    start = i + 1
            elif line[i] == '[':
                seeLeftB = True
            elif line[i] == ']':
                seeLeftB = False
        # Trailing token after the last space: flush the sentence (end=True).
        if start < nn:
            token = line[start:]
            if token.startswith('['):
                tokenLen = len(token)
                while tokenLen > 0 and token[tokenLen - 1] != ']':
                    tokenLen = tokenLen - 1
                token = token[1:tokenLen - 1]
                ss = token.split(' ')
                ns = len(ss)
                for i in range(ns - 1):
                    processToken(ss[i], sentence, out, False, vob)
                processToken(ss[-1], sentence, out, True, vob)
            else:
                processToken(token, sentence, out, True, vob)
    except Exception as e:
        # NOTE(review): broad swallow — any malformed line is silently
        # dropped, hiding real bugs; consider logging the exception.
        pass
line = line.strip()
nn = len(line)
seeLeftB = False
start = 0
sentence = Sentence()
try:
for i in range(nn):
if line[i] == ' ':
if not seeLeftB:
token = line[start:i]
if token.startswith('['):
tokenLen = len(token)
while tokenLen > 0 and token[tokenLen - 1] != ']':
tokenLen = tokenLen - 1
token = token[1:tokenLen - 1]
ss = token.split(' ')
for s in ss:
processToken(s, sentence, out, False, vob)
else:
processToken(token, sentence, out, False, vob)
start = i + 1
elif line[i] == '[':
seeLeftB = True
elif line[i] == ']':
seeLeftB = False
if start < nn:
token = line[start:]
if token.startswith('['):
tokenLen = len(token)
while tokenLen > 0 and token[tokenLen - 1] != ']':
tokenLen = tokenLen - 1
token = token[1:tokenLen - 1]
ss = token.split(' ')
ns = len(ss)
for i in range(ns - 1):
processToken(ss[i], sentence, out, False, vob)
processToken(ss[-1], sentence, out, True, vob)
else:
processToken(token, sentence, out, True, vob)
except Exception as e:
pass


def main(argc, argv):
    """CLI entry: walk a corpus directory and emit training rows.

    Usage: main(len(sys.argv), sys.argv) with argv = [prog, vocab_path,
    corpus_root_dir, output_path]. Every *.txt file under the root is
    processed line by line; counters are printed at the end.
    """
    global totalLine
    global longLine
    global totalChars
    if argc < 4:
        print("Usage:%s <vob> <dir> <output>" % (argv[0]))
        sys.exit(1)
    vobPath = argv[1]
    rootDir = argv[2]
    vob = w2v.Word2vecVocab()
    vob.Load(vobPath)
    out = open(argv[3], "w")
    for dirName, subdirList, fileList in os.walk(rootDir):
        # NOTE(review): os.walk already yields absolute-ish dirName; joining
        # rootDir again looks redundant — verify on nested directories.
        curDir = os.path.join(rootDir, dirName)
        for file in fileList:
            if file.endswith(".txt"):
                curFile = os.path.join(curDir, file)
                #print("processing:%s" % (curFile))
                fp = open(curFile, "r")
                for line in fp.readlines():
                    line = line.strip()
                    processLine(line, out, vob)
                fp.close()
    out.close()
    print("total:%d, long lines:%d, chars:%d" %
          (totalLine, longLine, totalChars))
global totalLine
global longLine
global totalChars
if argc < 4:
print("Usage:%s <vob> <dir> <output>" % (argv[0]))
sys.exit(1)
vobPath = argv[1]
rootDir = argv[2]
vob = w2v.Word2vecVocab()
vob.Load(vobPath)
out = open(argv[3], "w")
for dirName, subdirList, fileList in os.walk(rootDir):
curDir = os.path.join(rootDir, dirName)
for file in fileList:
if file.endswith(".txt"):
curFile = os.path.join(curDir, file)
#print("processing:%s" % (curFile))
fp = open(curFile, "r")
for line in fp.readlines():
line = line.strip()
processLine(line, out, vob)
fp.close()
out.close()
print("total:%d, long lines:%d, chars:%d" %
(totalLine, longLine, totalChars))


if __name__ == '__main__':
main(len(sys.argv), sys.argv)
main(len(sys.argv), sys.argv)
Loading

0 comments on commit 3afa292

Please sign in to comment.