Skip to content

Commit

Permalink
Add dump command for dumping lexicon database as file
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Feb 8, 2011
1 parent 30801e5 commit 7dcc502
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 5 deletions.
39 changes: 36 additions & 3 deletions loso/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ def getCandidate(i, left, right):

class LexiconDatabase(object):

progress_interval = 10000

def __init__(
self,
lexicon_prefix='Lexicon_',
Expand Down Expand Up @@ -309,6 +311,37 @@ def splitTerms(self, text, ngram=4):
terms, best_score = findBestSegment(grams)
self.logger.debug('Best score: %s', best_score)
return terms

def dump(self, file):
    """Write the lexicon database to ``file`` as plain text.

    The output has two sections separated by one blank line:

    1. one ``<name> <value>`` line per meta-data key, sorted by key,
       with ``self.meta_prefix`` stripped from the key name;
    2. one ``<value> <term>`` line per lexicon key, with
       ``self.lexicon_prefix`` stripped and the term decoded from UTF-8.

    If there are no meta-data keys the database is treated as empty: an
    error is logged and nothing is written.

    :param file: writable file-like object. Terms are written as unicode,
        so it should be an encoding-aware writer.
    """
    self.logger.info('Dumping meta-data ...')
    meta_keys = self.redis.keys(self.meta_prefix + '*')
    if not meta_keys:
        self.logger.error('The lexicon database is empty, nothing to dump')
        return
    meta_keys = sorted(meta_keys)
    meta_values = self.redis.mget(meta_keys)
    meta_prefix_len = len(self.meta_prefix)
    for key, value in zip(meta_keys, meta_values):
        name = key[meta_prefix_len:]
        file.write('%s %s\n' % (name, value))
        self.logger.info('Meta-data %s=%s', name, value)

    # a blank line separates the meta-data section from the lexicon section
    file.write('\n')

    self.logger.info('Dumping lexicons keys ...')
    keys = self.redis.keys(self.lexicon_prefix + '*')
    whole = len(keys)
    self.logger.info('Get %d keys', whole)
    if not keys:
        # MGET needs at least one key, so there is nothing more to fetch
        # or write when the lexicon section is empty.
        return

    self.logger.info('Dumping lexicons values ...')
    values = self.redis.mget(keys)
    self.logger.info('Get %d values', len(values))
    lexicon_prefix_len = len(self.lexicon_prefix)
    for i, (key, value) in enumerate(zip(keys, values)):
        term = key[lexicon_prefix_len:].decode('utf8')
        file.write(u'%s %s\n' % (value, term))
        if i % self.progress_interval == 0:
            per = (i / float(whole)) * 100.0
            self.logger.info('Progress %d/%d (%02d%%)', i, whole, per)

class LexiconBuilder(object):

Expand Down Expand Up @@ -343,9 +376,9 @@ def feed(self, text):
result = self.db.increase(term, delta)
sum += delta
if i % self.progress_interval == 0:
n = len(terms_count)
per = (i/float(n))*100.0
self.logger.info('Progress %d/%d (%02d%%)', i, n, per)
whole = len(terms_count)
per = (i/float(whole))*100.0
self.logger.info('Progress %d/%d (%02d%%)', i, whole, per)

# add n-gram count
result = self.db.increaseNgramSum(n, sum)
Expand Down
26 changes: 25 additions & 1 deletion loso/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,28 @@ def run(self):
server = SimpleXMLRPCServer((interface, port), allow_none=True)
server.register_introspection_functions()
server.register_instance(seg_service)
server.serve_forever()
server.serve_forever()

class DumpCommand(Command):
    """Setup command that dumps the lexicon database to a text file."""

    description = 'dump lexicon database as a text file'
    user_options = [
        ('file=', 'f', '/path/to/text'),
        ('encoding=', 'e', 'encoding of text file'),
    ]

    def initialize_options(self):
        """Set defaults: UTF-8 output and no output path."""
        self.encoding = 'utf8'
        self.file = None

    def finalize_options(self):
        """Validate the options and open the output file for writing.

        Raises DistutilsOptionError when no output path was given.
        """
        import codecs
        if not self.file:
            raise DistutilsOptionError('Must set text file path to dump to')
        # codecs.open always opens the underlying file in binary mode and
        # appends 'b' itself, so plain 'w' is the correct mode here
        # ('wt' would become the invalid combination 'wtb').
        self.text_file = codecs.open(self.file, 'w', encoding=self.encoding)

    def run(self):
        """Dump the segmentation service's lexicon database to the file."""
        logging.basicConfig(level=logging.DEBUG)
        try:
            seg_service = service.SegumentService()
            seg_service.db.dump(self.text_file)
        finally:
            # Close the output file even when the dump fails part-way.
            self.text_file.close()
        print('Done.')
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
'interact': scripts.InteractCommand,
'feed': scripts.FeedCommand,
'reset': scripts.ResetCommand,
'serve': scripts.ServeCommand
'serve': scripts.ServeCommand,
'dump': scripts.DumpCommand
}

setup(
Expand Down

0 comments on commit 7dcc502

Please sign in to comment.