Skip to content

Commit

Permalink
Optionally display parser progress.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Jan 13, 2024
1 parent 3dd3f3d commit 5af655f
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions arekit/common/docs/parser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from tqdm import tqdm
from arekit.common.docs.base import Document
from arekit.common.docs.parsed.base import ParsedDocument
from arekit.common.pipeline.base import BasePipelineLauncher
Expand All @@ -10,7 +11,7 @@
class DocumentParsers(object):

@staticmethod
def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input"):
def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input", show_progress=False):
""" This document parser is based on single text parts (sentences)
that pass sequentially through the pipeline of transformations.
"""
Expand All @@ -19,7 +20,11 @@ def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input"):
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)

parsed_sentences = []
for sent_ind in range(doc.SentencesCount):

data_it = range(doc.SentencesCount)
progress_it = tqdm(data_it, disable=not show_progress)

for sent_ind in progress_it:

# Composing the context from a single sentence.
ctx = PipelineContext({src_key: doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx)
Expand All @@ -33,7 +38,7 @@ def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input"):
return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)

@staticmethod
def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input"):
def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
""" This document parser is based on batch of sentences.
"""
assert(isinstance(batch_size, int) and batch_size > 0)
Expand All @@ -42,7 +47,11 @@ def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="i
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)

parsed_sentences = []
for batch in BatchIterator(lst=list(range(doc.SentencesCount)), batch_size=batch_size):

data_it = BatchIterator(lst=list(range(doc.SentencesCount)), batch_size=batch_size)
progress_it = tqdm(data_it, total=round(doc.SentencesCount / batch_size), disable=not show_progress)

for batch in progress_it:

# Composing the context from a batch of sentences.
ctx = PipelineContext({src_key: [doc.get_sentence(s_ind) for s_ind in batch]},
Expand Down

0 comments on commit 5af655f

Please sign in to comment.