Skip to content

Commit

Permalink
Improve Chinese segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Dec 29, 2010
1 parent 0b9a47e commit 029aa22
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 7 deletions.
39 changes: 38 additions & 1 deletion loso/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
eng_term_pattern = """[a-zA-Z0-9\\-_']+"""

def iterEnglishTerms(text):
"""Iterator English terms from Chinese text
"""Iterate English terms from Chinese text
"""
terms = []
Expand All @@ -23,6 +23,43 @@ def iterEnglishTerms(text):
terms.append(term.group(0))
return terms

def iterMixTerms(text, eng_prefix='E', term_pattern=None):
    """Iterate English terms and Chinese sentence fragments in text.

    The text is split on whitespace; within each chunk, runs matching the
    English-term pattern are emitted lower-cased with *eng_prefix* prepended
    (pass an empty prefix to disable), and the remaining (Chinese) fragments
    are emitted as-is, in original order. For example

        "C1C2C3C4 E1 E2 C5C6"

    yields

        ["C1C2C3C4", "Ee1", "Ee2", "C5C6"]

    Another real example:

        u"請問一下為什麼我的ip會block"

    yields

        [u"請問一下為什麼我的", u'Eip', u"會", u'Eblock']

    NOTE: any non-English remainder of a chunk is kept, so standalone
    punctuation such as a trailing " ?" comes back as its own term.

    :param text: mixed Chinese/English text to segment
    :param eng_prefix: marker prepended to each lower-cased English term
    :param term_pattern: regex for English terms; defaults to the
        module-level ``eng_term_pattern``
    :return: list of Chinese fragments and prefixed English terms
    """
    if term_pattern is None:
        term_pattern = eng_term_pattern
    terms = []
    parts = text.split()
    for part in parts:
        # position just past the previous English match within this part
        last = 0
        for match in re.finditer(term_pattern, part):
            # Chinese text between the previous match and this one
            previous_term = part[last:match.start()]
            if previous_term:
                terms.append(previous_term)
            term = match.group(0).lower()
            terms.append(eng_prefix + term if eng_prefix else term)
            last = match.end()
        # trailing Chinese text after the last English match
        final_term = part[last:]
        if final_term:
            terms.append(final_term)
    return terms

def splitSentence(text, delimiters=None):
"""Split article into sentences by delimiters
Expand Down
24 changes: 18 additions & 6 deletions loso/service.py
Original file line number Diff line number Diff line change
def splitTerms(self, text):
    """Split text into terms.

    Each sentence is decomposed by lexicon.iterMixTerms into English
    terms (already lower-cased and 'E'-prefixed; appended unchanged)
    and Chinese fragments (further segmented against the lexicon
    database with self.ngram).
    """
    terms = []
    for sentence in lexicon.splitSentence(text):
        if sentence:
            for mixed in lexicon.iterMixTerms(sentence):
                # English term: already normalized, keep as-is
                if mixed.startswith('E'):
                    terms.append(mixed)
                # Chinese sentence fragment: segment via the database
                else:
                    terms.extend(self.db.splitTerms(mixed, self.ngram))
    return terms

def splitNgramTerms(self, text):
    """Split text into all n-gram terms for n = 1..self.ngram.

    English terms from lexicon.iterMixTerms are appended unchanged
    (they already carry the 'E' prefix); each Chinese fragment is
    expanded into every n-gram up to self.ngram.
    """
    terms = []
    for sentence in lexicon.splitSentence(text):
        if sentence:
            for mixed in lexicon.iterMixTerms(sentence):
                # English term: already normalized, keep as-is
                if mixed.startswith('E'):
                    terms.append(mixed)
                # Chinese fragment: collect 1..ngram grams
                else:
                    for n in xrange(1, self.ngram + 1):
                        terms.extend(lexicon.iterTerms(n, mixed, False))
    return terms

def splitSentence(self, text):
Expand All @@ -53,8 +65,8 @@ def splitSentence(self, text):
"""
return lexicon.splitSentence(text)

def splitMixTerms(self, text):
    """Split text into Chinese sentence fragments and English terms.

    Thin wrapper over lexicon.iterMixTerms: English terms come back
    lower-cased with the 'E' prefix, Chinese fragments unchanged.
    """
    return list(lexicon.iterMixTerms(text))

0 comments on commit 029aa22

Please sign in to comment.