Merge branch '0.25.0-rc'
nicolay-r committed Feb 27, 2024
2 parents 4c577cb + 7c92743 commit 0b18295
Showing 197 changed files with 390 additions and 7,371 deletions.
66 changes: 35 additions & 31 deletions README.md
@@ -1,4 +1,4 @@
# AREkit 0.24.0
# AREkit 0.25.0

![](https://img.shields.io/badge/Python-3.9+-brightgreen.svg)

@@ -11,46 +11,50 @@ is a python toolkit, devoted to document level Attitude and Relation Extraction

## Description

This toolkit aims to solve data preparation problems in Relation Extraction related tasks, considering such factors as:

This toolkit aims at memory-effective data processing in Relation Extraction (RE) related tasks.

<p align="center">
<img src="docs/arekit-pipeline-concept.png"/>
</p>

> Figure: AREkit pipelines design. See the
> **[ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction](https://www.ecir2024.org/accepted-paper/)** paper for details.
In particular, this framework provides the following features:
* [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for serializing large-scale collections without out-of-memory issues,
* 🔗 EL (entity-linking) API support for objects,
* ➰ avoidance of cyclic connections,
* :straight_ruler: distance consideration between relation participants (in `terms` or `sentences`),
* 📑 relations annotations and filtering rules,
* *️⃣ entities formatting or masking, and more.

Using AREkit you may focus on preparing and experimenting with your ML models by shifting all the data preparation onto this project's toolset for:
[neural-networks](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-Neural-Network),
[language-models](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-BERT),
[ChatGPT](https://github.com/nicolay-r/AREkit/wiki/Sampling-for-ChatGPT).

In order to do so, we provide:
* :file_folder: API for external [collection binding](https://github.com/nicolay-r/AREkit/wiki/Binding-a-Custom-Source) (native support of [BRAT](https://brat.nlplab.org/)-based exported annotations)
* [pipelines](https://github.com/nicolay-r/AREkit/wiki/Pipelines:-Text-Opinion-Annotation) and iterators for serializing large-scale collections without out-of-memory issues,
* evaluators which allow you to assess your trained model.

AREkit is very close to the open-source framework [SeqIO](https://github.com/google/seqio) proposed by [Google](https://github.com/google)
for data pre-processing and evaluation of sequence models.
While SeqIO is dedicated to the conversion/pre-processing of datasets of any type,
this project proposes pipeline creation from raw or pre-annotated (BRAT-based) texts, including solutions for the problems mentioned above.

The core functionality includes
(1) API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
for sentence level relations preparation (dubbed as contexts)
(2) API for contexts extraction
(3) relations transferring from sentence-level onto document-level, and more.
The core functionality includes:
* API for document presentation with EL (Entity Linking, i.e. Object Synonymy) support
for sentence-level relation preparation (dubbed contexts);
* API for contexts extraction;
* Relation transfer from the sentence level onto the document level, and more.

## Installation

1. Install required dependencies
```bash
pip install git+https://github.com/nicolay-r/[email protected]
```

2. Download Resources
```bash
python -m arekit.download_data
pip install git+https://github.com/nicolay-r/[email protected]
```

## Usage
Please follow the wiki page
[Tutorials List](https://github.com/nicolay-r/AREkit/wiki/Tutorials).

Please follow the **[tutorial section on project Wiki](https://github.com/nicolay-r/AREkit/wiki/Tutorials)** for more details.

## How to cite
Great research is accompanied by a faithful reference.
If you use or extend our work, please cite it as follows:

```bibtex
@inproceedings{rusnachenko2024arelight,
title={ARElight: Context Sampling of Large Texts for Deep Learning Relation Extraction},
author={Rusnachenko, Nicolay and Liang, Huizhi and Kolomeets, Maxim and Shi, Lei},
booktitle={European Conference on Information Retrieval},
year={2024},
organization={Springer}
}
```
3 changes: 2 additions & 1 deletion arekit/common/docs/entities_grouping.py
@@ -4,8 +4,9 @@

class EntitiesGroupingPipelineItem(BasePipelineItem):

def __init__(self, value_to_group_id_func):
def __init__(self, value_to_group_id_func, **kwargs):
assert(callable(value_to_group_id_func))
super(EntitiesGroupingPipelineItem, self).__init__(**kwargs)
self.__value_to_group_id_func = value_to_group_id_func

def apply_core(self, input_data, pipeline_ctx):
37 changes: 0 additions & 37 deletions arekit/common/docs/objects_parser.py

This file was deleted.

72 changes: 52 additions & 20 deletions arekit/common/docs/parser.py
@@ -1,34 +1,66 @@
from tqdm import tqdm
from arekit.common.docs.base import Document
from arekit.common.docs.parsed.base import ParsedDocument
from arekit.common.pipeline.base import BasePipelineLauncher
from arekit.common.pipeline.batching import BatchingPipelineLauncher
from arekit.common.pipeline.context import PipelineContext
from arekit.common.text.parser import BaseTextParser
from arekit.common.pipeline.utils import BatchIterator
from arekit.common.text.parsed import BaseParsedText


class DocumentParser(object):
class DocumentParsers(object):

@staticmethod
def __get_sent(doc, sent_ind):
return doc.get_sentence(sent_ind)

@staticmethod
def parse(doc, text_parser, parent_ppl_ctx=None):
def parse(doc, pipeline_items, parent_ppl_ctx=None, src_key="input", show_progress=False):
""" This document parser is based on single text parts (sentences)
that pass sequentially through the pipeline of transformations.
"""
assert(isinstance(doc, Document))
assert(isinstance(text_parser, BaseTextParser))
assert(isinstance(pipeline_items, list))
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)

parsed_sentences = [text_parser.run(input_data=DocumentParser.__get_sent(doc, sent_ind).Text,
params_dict=DocumentParser.__create_ppl_params(doc=doc, sent_ind=sent_ind),
parent_ctx=parent_ppl_ctx)
for sent_ind in range(doc.SentencesCount)]
parsed_sentences = []

data_it = range(doc.SentencesCount)
progress_it = tqdm(data_it, disable=not show_progress)

for sent_ind in progress_it:

return ParsedDocument(doc_id=doc.ID,
parsed_sentences=parsed_sentences)
# Composing the context from a single sentence.
ctx = PipelineContext({src_key: doc.get_sentence(sent_ind)}, parent_ctx=parent_ppl_ctx)

# Apply all the operations.
BasePipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)

# Collecting the result.
parsed_sentences.append(BaseParsedText(terms=ctx.provide("result")))

return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)

@staticmethod
def __create_ppl_params(doc, sent_ind):
def parse_batch(doc, pipeline_items, batch_size, parent_ppl_ctx=None, src_key="input", show_progress=False):
""" This document parser is based on batch of sentences.
"""
assert(isinstance(batch_size, int) and batch_size > 0)
assert(isinstance(doc, Document))
return {
"s_ind": sent_ind, # sentence index. (as Metadata)
"doc_id": doc.ID, # document index. (as Metadata)
"sentence": DocumentParser.__get_sent(doc, sent_ind), # Required for special sources.
}
assert(isinstance(pipeline_items, list))
assert(isinstance(parent_ppl_ctx, PipelineContext) or parent_ppl_ctx is None)

parsed_sentences = []

data_it = BatchIterator(data_iter=iter(range(doc.SentencesCount)), batch_size=batch_size)
progress_it = tqdm(data_it, total=round(doc.SentencesCount / batch_size), disable=not show_progress)

for batch in progress_it:

# Composing the context from a batch of sentences.
ctx = PipelineContext({src_key: [doc.get_sentence(s_ind) for s_ind in batch]},
parent_ctx=parent_ppl_ctx)

# Apply all the operations.
BatchingPipelineLauncher.run(pipeline=pipeline_items, pipeline_ctx=ctx, src_key=src_key)

# Collecting the result.
parsed_sentences += [BaseParsedText(terms=result) for result in ctx.provide("result")]

return ParsedDocument(doc_id=doc.ID, parsed_sentences=parsed_sentences)
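
For context, here is a minimal usage sketch of the `DocumentParsers` API introduced above. It is not part of this commit: `doc` stands for any concrete `Document` whose `get_sentence` returns a raw string, and `TermSplitterItem` is a hypothetical pipeline item.

```python
from arekit.common.docs.parser import DocumentParsers
from arekit.common.pipeline.items.base import BasePipelineItem


class TermSplitterItem(BasePipelineItem):
    """ Hypothetical item: splits a raw sentence string into terms. """

    def apply_core(self, input_data, pipeline_ctx):
        return input_data.split()


# Sentence-by-sentence parsing; the last item writes under "result",
# which `parse` collects into a ParsedDocument.
parsed_doc = DocumentParsers.parse(
    doc=doc,
    pipeline_items=[TermSplitterItem()],
    show_progress=True)

# Batched variant: sentences pass through the pipeline in groups of 8.
parsed_doc = DocumentParsers.parse_batch(
    doc=doc,
    pipeline_items=[TermSplitterItem()],
    batch_size=8)
```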
28 changes: 12 additions & 16 deletions arekit/common/pipeline/base.py
@@ -2,24 +2,20 @@
from arekit.common.pipeline.items.base import BasePipelineItem


class BasePipeline(object):
class BasePipelineLauncher:

def __init__(self, pipeline):
@staticmethod
def run(pipeline, pipeline_ctx, src_key=None, has_input=True):
assert(isinstance(pipeline, list))
self.__pipeline = pipeline
assert(isinstance(pipeline_ctx, PipelineContext))
assert(isinstance(src_key, str) or src_key is None)

def run(self, input_data, params_dict=None, parent_ctx=None):
assert(isinstance(params_dict, dict) or params_dict is None)

pipeline_ctx = PipelineContext(d=params_dict if params_dict is not None else dict(),
parent_ctx=parent_ctx)

for item in filter(lambda itm: itm is not None, self.__pipeline):
for ind, item in enumerate(filter(lambda itm: itm is not None, pipeline)):
assert(isinstance(item, BasePipelineItem))
input_data = item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)

return input_data
do_force_key = src_key is not None and ind == 0
input_data = item.get_source(pipeline_ctx, force_key=src_key if do_force_key else None) \
if has_input or ind > 0 else None
item_result = item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)
pipeline_ctx.update(param=item.ResultKey, value=item_result, is_new_key=False)

def append(self, item):
assert(isinstance(item, BasePipelineItem))
self.__pipeline.append(item)
return pipeline_ctx
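
A minimal sketch of the new launcher contract (not from this commit; `LowercaseItem` is hypothetical): the context supplies the first item's input via `src_key`, and each item's result is written back under its `ResultKey`.

```python
from arekit.common.pipeline.base import BasePipelineLauncher
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.items.base import BasePipelineItem


class LowercaseItem(BasePipelineItem):
    """ Hypothetical item: lowercases the input string. """

    def apply_core(self, input_data, pipeline_ctx):
        return input_data.lower()


ctx = PipelineContext({"input": "Hello WORLD"})
BasePipelineLauncher.run(pipeline=[LowercaseItem()],
                         pipeline_ctx=ctx,
                         src_key="input")
print(ctx.provide("result"))  # -> "hello world"
```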
28 changes: 28 additions & 0 deletions arekit/common/pipeline/batching.py
@@ -0,0 +1,28 @@
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.items.base import BasePipelineItem


class BatchingPipelineLauncher:

@staticmethod
def run(pipeline, pipeline_ctx, src_key=None):
assert(isinstance(pipeline, list))
assert(isinstance(pipeline_ctx, PipelineContext))
assert(isinstance(src_key, str) or src_key is None)

for ind, item in enumerate(filter(lambda itm: itm is not None, pipeline)):
assert (isinstance(item, BasePipelineItem))

# Handle the content of the batch or batch itself.
content = item.get_source(pipeline_ctx, call_func=False, force_key=src_key if ind == 0 else None)
handled_batch = [item._src_func(i) if item._src_func is not None else i for i in content]

if item.SupportBatching:
batch_result = list(item.apply(input_data=handled_batch, pipeline_ctx=pipeline_ctx))
else:
batch_result = [item.apply(input_data=input_data, pipeline_ctx=pipeline_ctx)
for input_data in handled_batch]

pipeline_ctx.update(param=item.ResultKey, value=batch_result, is_new_key=False)

return pipeline_ctx
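
A sketch of how an item opts into batching under the new launcher (hypothetical item, not part of this commit): items that report `SupportBatching` receive the whole batch in one call, while the rest are applied element-wise.

```python
from arekit.common.pipeline.batching import BatchingPipelineLauncher
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.items.base import BasePipelineItem


class BatchUppercaseItem(BasePipelineItem):
    """ Hypothetical batch-aware item: processes the whole batch at once. """

    @property
    def SupportBatching(self):
        return True

    def apply_core(self, input_data, pipeline_ctx):
        # input_data is the entire batch (a list of elements).
        return [text.upper() for text in input_data]


ctx = PipelineContext({"input": ["a", "b"]})
BatchingPipelineLauncher.run(pipeline=[BatchUppercaseItem()],
                             pipeline_ctx=ctx,
                             src_key="input")
print(ctx.provide("result"))  # -> ["A", "B"]
```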
6 changes: 5 additions & 1 deletion arekit/common/pipeline/context.py
@@ -13,6 +13,8 @@ def __init__(self, d, parent_ctx=None):
self._d[PARENT_CTX] = parent_ctx

def __provide(self, param):
if param not in self._d:
raise Exception(f"Key `{param}` is not in dictionary.\n{self._d}")
return self._d[param]

# region public
@@ -23,7 +25,9 @@ def provide(self, param):
def provide_or_none(self, param):
return self.__provide(param) if param in self._d else None

def update(self, param, value):
def update(self, param, value, is_new_key=False):
if is_new_key and param in self._d:
raise Exception(f"Key `{param}` is already presented in pipeline context dictionary.")
self._d[param] = value

# endregion
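
A short sketch of the two guards added here (illustrative values only): `provide` now fails loudly on a missing key, and `update(..., is_new_key=True)` refuses to silently overwrite an existing one.

```python
from arekit.common.pipeline.context import PipelineContext

ctx = PipelineContext({"input": 42})
ctx.update("result", 1, is_new_key=True)  # OK: "result" was absent.
ctx.update("result", 2)                   # OK: overwriting is the default.
# ctx.update("result", 3, is_new_key=True)  # raises: key already present.
# ctx.provide("missing")                    # raises: key is not in dictionary.
```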
39 changes: 38 additions & 1 deletion arekit/common/pipeline/items/base.py
@@ -1,9 +1,46 @@
from arekit.common.pipeline.context import PipelineContext


class BasePipelineItem(object):
""" Single pipeline item that might be instatiated and embedded into pipeline.
"""

def __init__(self, src_key="result", result_key="result", src_func=None):
assert(isinstance(src_key, str) or src_key is None)
assert(callable(src_func) or src_func is None)
self.__src_key = src_key
self._src_func = src_func
self.__result_key = result_key

@property
def ResultKey(self):
return self.__result_key

@property
def SupportBatching(self):
""" By default pipeline item is not designed for batching.
"""
return False

def get_source(self, src_ctx, call_func=True, force_key=None):
""" Extract input element for processing.
"""
assert(isinstance(src_ctx, PipelineContext))

# If there is no information about the source key, then we consider the source absent.
if self.__src_key is None:
return None

# Extracting actual source.
src_data = src_ctx.provide(self.__src_key if force_key is None else force_key)
if self._src_func is not None and call_func:
src_data = self._src_func(src_data)

return src_data

def apply_core(self, input_data, pipeline_ctx):
raise NotImplementedError()
"""By default we do nothing."""
pass

def apply(self, input_data, pipeline_ctx=None):
""" Performs input processing an update it for a further pipeline items.
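
A minimal sketch of the new per-item wiring (hypothetical items, not part of this commit): `src_key` selects which context entry feeds an item, `src_func` pre-processes it, and `result_key` names the slot where the result is stored, so items can be chained through the context.

```python
from arekit.common.pipeline.base import BasePipelineLauncher
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.items.base import BasePipelineItem


class PassThroughItem(BasePipelineItem):
    """ Hypothetical item: returns its (already pre-processed) input. """

    def apply_core(self, input_data, pipeline_ctx):
        return input_data


class CountItem(BasePipelineItem):
    """ Hypothetical item: counts the elements produced by the previous item. """

    def apply_core(self, input_data, pipeline_ctx):
        return len(input_data)


ctx = PipelineContext({"input": "a b c"})
BasePipelineLauncher.run(
    pipeline=[
        # src_func splits the raw string; the terms land under "terms".
        PassThroughItem(src_func=lambda s: s.split(), result_key="terms"),
        # Reads "terms" via its src_key and stores the count under "result".
        CountItem(src_key="terms", result_key="result"),
    ],
    pipeline_ctx=ctx,
    src_key="input")
print(ctx.provide("result"))  # -> 3
```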
6 changes: 5 additions & 1 deletion arekit/common/pipeline/items/flatten.py
@@ -5,10 +5,14 @@ class FlattenIterPipelineItem(BasePipelineItem):
""" Considered to flat iterations of items that represent iterations.
"""

def __init__(self, **kwargs):
super(FlattenIterPipelineItem, self).__init__(**kwargs)
pass

def __flat_iter(self, iter_data):
for iter_item in iter_data:
for item in iter_item:
yield item

def apply_core(self, input_data, pipeline_ctx):
return self.__flat_iter(input_data)
return self.__flat_iter(input_data)
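
A quick sketch of the flattening behaviour (illustrative values; `apply_core` is called directly here since it does not use the context):

```python
from arekit.common.pipeline.items.flatten import FlattenIterPipelineItem

item = FlattenIterPipelineItem()
flat_it = item.apply_core(input_data=[[1, 2], [3]], pipeline_ctx=None)
print(list(flat_it))  # -> [1, 2, 3]
```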
3 changes: 2 additions & 1 deletion arekit/common/pipeline/items/handle.py
@@ -3,8 +3,9 @@

class HandleIterPipelineItem(BasePipelineItem):

def __init__(self, handle_func=None):
def __init__(self, handle_func=None, **kwargs):
assert(callable(handle_func))
super(HandleIterPipelineItem, self).__init__(**kwargs)
self.__handle_func = handle_func

def __updated_data(self, items_iter):
3 changes: 2 additions & 1 deletion arekit/common/pipeline/items/iter.py
@@ -3,8 +3,9 @@

class FilterPipelineItem(BasePipelineItem):

def __init__(self, filter_func=None):
def __init__(self, filter_func=None, **kwargs):
assert(callable(filter_func))
super(FilterPipelineItem, self).__init__(**kwargs)
self.__filter_func = filter_func

def apply_core(self, input_data, pipeline_ctx):
