Skip to content

Commit

Permalink
Add head/tail score, add English term iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Dec 15, 2010
1 parent a9252d0 commit c6d3b77
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 13 deletions.
11 changes: 0 additions & 11 deletions loso/chinese_token.py

This file was deleted.

29 changes: 27 additions & 2 deletions loso/lexicon.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf8 -*-

import re
import logging

from redis import Redis
Expand All @@ -9,6 +10,19 @@
# Default delimiters for splitSentence: ASCII whitespace and punctuation
# plus full-width/CJK punctuation marks. Calling set() on the string
# stores every character as its own single-character element.
# NOTE(review): presumably used when splitSentence is called with
# delimiters=None -- confirm against the function body.
default_delimiters = set(u"""\n\r\t ,.:"()[]{}。,、;:!「」『』─()﹝﹞…﹏_‧""")

# Regular expression matching one English term: a run of ASCII letters,
# digits, hyphens, underscores or apostrophes.
eng_term_pattern = """[a-zA-Z0-9\\-_']+"""


def iterEnglishTerms(text):
    """Extract English terms from Chinese (mixed-language) text.

    :param text: string to scan
    :return: list of the substrings matching ``eng_term_pattern``, in
        order of appearance; an empty list when nothing matches
    """
    # No whitespace pre-split is needed: the pattern cannot match
    # whitespace, so scanning the whole text yields the same terms.
    return [match.group(0) for match in re.finditer(eng_term_pattern, text)]

def splitSentence(text, delimiters=None):
"""Split article into sentences by delimiters
Expand Down Expand Up @@ -236,8 +250,19 @@ def splitTerms(self, text, ngram=4):
score = count/v
if score == 0:
score = 0.00000001
self.logger.debug('Term=%s, Count=%s, Score=%s',
term, count, score)

head_tail_score = 0
head = 0
tail = 0
head_tail = self.getHeadTail(term)
if head_tail is not None and n != 1:
head, tail = head_tail
if head > 3 and tail > 3:
score += (head + tail) / v

self.logger.debug(
'Term=%s, Count=%s, Head=%s, Tail=%s, Score=%s',
term, count, head, tail, score)
terms.append((term, score))
grams.append(terms)
terms, best_score = findBestSegment(grams)
Expand Down
6 changes: 6 additions & 0 deletions loso/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def splitSentence(self, text):
"""
return lexicon.splitSentence(text)

def extractEnglishTerms(self, text):
    """Return the English terms found in Chinese text as a list.

    Delegates to ``lexicon.iterEnglishTerms`` and copies whatever it
    yields into a new list.
    """
    found_terms = lexicon.iterEnglishTerms(text)
    return list(found_terms)

def main():
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('segment.main')
Expand Down

0 comments on commit c6d3b77

Please sign in to comment.