Skip to content

Commit

Permalink
Add info command
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Feb 8, 2011
1 parent 91e2128 commit 75388cc
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 28 deletions.
70 changes: 48 additions & 22 deletions loso/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def splitSentence(text, delimiters=None):
sentence.append(c)
yield ''.join(sentence)

def iterTerms(n, text, emmit_head_tail=True):
def iterTerms(n, text, emmit_head_tail=False):
"""Iterate n-gram terms in given text and return a generator.
All English in
Expand Down Expand Up @@ -231,7 +231,7 @@ def setMeta(self, key, value):

@property
def gram(self):
    """Return the n-gram size recorded in this category's meta-data.

    The value stored under the 'gram' meta key is coerced to ``int``.
    A missing or otherwise falsy value is reported as 0, so callers can
    use the result directly in arithmetic (e.g. ``self.gram + 1``).
    """
    # Only the int-coercing return is kept; a raw-meta return placed before
    # it made the coercion unreachable.
    return int(self.getMeta('gram') or 0)

def increaseTerm(self, term, delta=1):
"""Increase value of a term
Expand Down Expand Up @@ -282,32 +282,39 @@ def getGramSum(self, n):
"""
key = '%s-gram-sum' % n
return self.getMeta(key)
return int(self.getMeta(key) or 0)

def getGramVariety(self, n):
    """Get variety of n-gram terms

    Looks up the ``'<n>-gram-variety'`` meta key through ``getMeta`` and
    returns it as an ``int``. A missing or falsy meta value yields 0.
    """
    key = '%s-gram-variety' % n
    # Single, int-coercing return: the raw-meta return that preceded it made
    # the coercion unreachable.
    return int(self.getMeta(key) or 0)

def getStats(self):
    """Get statistics of this category

    Returns a dict containing:

    * ``'gram'`` -- the value of ``self.gram``
    * ``'<n>gram_sum'`` / ``'<n>gram_variety'`` for every n from 1 up to
      and including ``self.gram`` -- the values returned by
      ``getGramSum(n)`` and ``getGramVariety(n)``
    * ``'total_sum'`` / ``'total_variety'`` -- the sums of those values
    """
    gram = self.gram
    stats = dict(
        gram=gram,
        total_sum=0,
        total_variety=0,
    )
    # The upper bound is inclusive: the highest gram must be reported too.
    for n in range(1, gram + 1):
        gram_sum = self.getGramSum(n)
        variety = self.getGramVariety(n)
        stats['%sgram_sum' % n] = gram_sum
        stats['%sgram_variety' % n] = variety
        stats['total_sum'] += gram_sum
        stats['total_variety'] += variety
    return stats

def dump(self, file):
self.logger.info('Dumping meta-data ...')
for n in xrange(1, self.gram):
print >>file, 'gram', self.gram
for n in xrange(1, self.gram + 1):
name = '%d-gram-sum' % n
value = self.getGramSum(n)
print >>file, name, value
Expand All @@ -327,7 +334,7 @@ def dump(self, file):
values = self.getTerms(*terms)
self.logger.info('Get %d values', len(terms))
for i, (term, count) in enumerate(zip(terms, values)):
term = term[len(self.lexicon_prefix):].decode('utf8')
term = term.decode('utf8')
print >>file, count, term
if i % self.progress_interval == 0:
if i % self.progress_interval == 0:
Expand Down Expand Up @@ -382,7 +389,20 @@ def __init__(
self._category_set_key = self.prefix + 'category'

def getCategory(self, name):
    """Get category and return

    Returns ``None`` when ``name`` is not in ``getCategoryList()``; this
    method does not create categories. For a known name, the object held
    in ``self._categories_cache`` is returned; if there is none yet, a
    ``LexiconCategory`` is built, stored in the cache and returned.
    """
    # Membership is checked before the cache so that a name that is not in
    # the category list is never served from a cached object.
    if name not in self.getCategoryList():
        return None
    category = self._categories_cache.get(name)
    if category:
        return category
    category = LexiconCategory(self, name)
    self._categories_cache[name] = category
    return category

def addCategory(self, name):
"""Add a category and return
"""
category = self._categories_cache.get(name)
Expand Down Expand Up @@ -415,19 +435,18 @@ def _getTermScore(self, term, ngram, categories):
"""Get score of a term
"""
sum = 0
score = 0.00000001
for c in categories:
count = int(c.getTerm(term) or 0)
n = len(term)
if c.getGramSum(n) is None:
sum = int(c.getGramSum(n) or 0)
variety = int(c.getGramVariety(n) or 0)
if not variety:
v = 1
else:
v = float(c.getGramSum(n))/float(c.getGramVariety(n))
v = sum/float(variety)
v *= v
score = count/v
if score == 0:
score = 0.00000001
sum += score
score += count/v
return score

def splitTerms(self, text, categories=None):
Expand All @@ -436,9 +455,16 @@ def splitTerms(self, text, categories=None):
categories
"""
all_category = self.getCategoryList()
if not categories:
categories = self.getCategoryList()
c_list = [self.getCategory(name) for name in categories]
categories = all_category
c_list = []
for name in categories:
c = self.getCategory(name)
if not c:
self.logger.error('Category %s not exist', name)
continue
c_list.append(c)
grams = []
for n in xrange(1, self.ngram+1):
terms = []
Expand Down Expand Up @@ -466,7 +492,7 @@ def feed(self, category, text):
"""Feed text into lexicon database and return total terms has been fed
"""
cat = self.db.getCategory(category)
cat = self.db.addCategory(category)
total = 0
for n in xrange(1, self.ngram+1):
self.logger.debug('Processing %d-gram', n)
Expand Down
53 changes: 48 additions & 5 deletions loso/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
class InteractCommand(Command):
description = 'provide interact interface for testing splitting terms'
user_options = [
('category=', 'c', 'category name'),
('category=', 'c', 'category name, split by comma'),
]

def initialize_options(self):
self.category = None

def finalize_options(self):
pass
if self.category:
self.category = self.category.split(',')

def run(self):
logging.basicConfig(level=logging.DEBUG)
Expand Down Expand Up @@ -105,21 +106,63 @@ class DumpCommand(Command):
user_options = [
('file=', 'f', '/path/to/text'),
('encoding=', 'e', 'encoding of text file'),
('category=', 'c', 'category name'),
]

def initialize_options(self):
    """Reset every option of the command to its default value.

    ``file`` and ``category`` start out unset (``None``); ``encoding``
    defaults to ``'utf8'``.
    """
    self.file = self.category = None
    self.encoding = 'utf8'

def finalize_options(self):
    """Validate the options and open the output file.

    Raises ``DistutilsOptionError`` when ``self.file`` or
    ``self.category`` is not set. On success, ``self.file`` is opened for
    writing with ``self.encoding`` and the handle is stored as
    ``self.text_file``.
    """
    import codecs
    if not self.file:
        # This command dumps; the message must not say "feed".
        raise DistutilsOptionError('Must set text file path to dump')
    if not self.category:
        raise DistutilsOptionError('Must set category to dump')
    # codecs.open() appends 'b' to the mode whenever an encoding is given,
    # so a 't' flag here would produce the contradictory mode 'wtb'.
    self.text_file = codecs.open(self.file, 'w', encoding=self.encoding)

def run(self):
    """Dump the category named by ``self.category`` into ``self.text_file``.

    Prints a message and returns without dumping when ``getCategory``
    yields nothing for the name. ``self.text_file`` is closed on every
    path, including the early return and any exception.
    """
    logging.basicConfig(level=logging.DEBUG)
    try:
        seg_service = service.SegumentService()
        c = seg_service.db.getCategory(self.category)
        if not c:
            print('Category %s not exist' % self.category)
            return
        c.dump(self.text_file)
    finally:
        # The file is opened before run() is called; release it even when
        # nothing was dumped.
        self.text_file.close()
    print('Done.')

class InfoCommand(Command):
    """Command that prints per-category statistics of the lexicon database.

    With no ``--category`` option, every name returned by the database's
    ``getCategoryList()`` is displayed.
    """

    description = 'Display info of lexicon database'
    user_options = [
        ('category=', 'c', 'category name to display, split by comma'),
    ]

    def initialize_options(self):
        """Start with no category filter."""
        self.category = None

    def finalize_options(self):
        """Turn the comma-separated ``category`` option into a list of names.

        Whitespace around each name is removed and empty items (from a
        doubled or trailing comma) are dropped, so ``"a, b,"`` selects the
        categories ``a`` and ``b``.
        """
        if self.category:
            names = (name.strip() for name in self.category.split(','))
            self.category = [name for name in names if name]

    def run(self):
        """Print the n-gram statistics of each selected category.

        A name for which ``getCategory`` returns ``None`` is reported and
        skipped. For every other category, the gram size plus the sum and
        variety of each n-gram level from ``getStats()`` are printed.
        """
        logging.basicConfig(level=logging.DEBUG)
        seg_service = service.SegumentService()
        c_list = self.category
        if not c_list:
            c_list = seg_service.db.getCategoryList()
        # Each print call takes one pre-formatted string so the output is
        # the same whether print is a statement or a function.
        print('')
        for name in c_list:
            c = seg_service.db.getCategory(name)
            if c is None:
                print('No such category %s' % name)
                continue
            stats = c.getStats()
            print('Category ' + name)
            print('=========' + '=' * len(name))
            print('Ngram: %s' % stats['gram'])
            for n in range(1, stats['gram'] + 1):
                print('%d-gram sum: %s' % (n, stats['%sgram_sum' % n]))
                print('%d-gram variety: %s' % (n, stats['%sgram_variety' % n]))
            print('')
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
'feed': scripts.FeedCommand,
'reset': scripts.ResetCommand,
'serve': scripts.ServeCommand,
'dump': scripts.DumpCommand
'dump': scripts.DumpCommand,
'info': scripts.InfoCommand
}

setup(
Expand Down

0 comments on commit 75388cc

Please sign in to comment.