Skip to content

Commit

Permalink
Updated Docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
iNeil77 committed Feb 15, 2021
1 parent ccebfc3 commit 67ac877
Showing 1 changed file with 25 additions and 11 deletions.
36 changes: 25 additions & 11 deletions Source/MultiLang.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def _train_tokenizer(cls,
Parameters
----------
config: Configuration
The configuration object.
"""
Expand Down Expand Up @@ -86,7 +85,7 @@ def _process_document(cls,
Returns
-------
encode_list: List[int]
encode_list: list[int]
Subword tokens of the cleaned document, featurized in the Byte-Pair space.
"""

Expand All @@ -112,15 +111,20 @@ def _process_corpus(cls,
config: Configuration,
parser: Tokenizer) -> numpy.ndarray:
"""
Iterates over all documents in the training corpus, tokenizes them using a sub-routine and stores the
smoothed maximum-likelihood estimates.
Parameters
----------
config
parser
config: Configuration
The configuration object.
parser: Tokenizer
    The trained Byte-Pair tokenizer used to encode documents.
Returns
-------
frequency_history: numpy.ndarray
    The smoothed language-word co-occurrence matrix.
"""

lang_set = config.CORPUS_LANGUAGES
Expand All @@ -143,16 +147,22 @@ def _gibbs_sampler(cls,
frequency_history: numpy.ndarray,
encoding: List[int]) -> List[numpy.ndarray]:
"""
Runs the smoothed Gibbs sampler to approximate the language-word generative distribution, over a configurable
number of iterations.
Parameters
----------
config
frequency_history
encoding
config: Configuration
The configuration object.
frequency_history: numpy.ndarray
    The smoothed language-word co-occurrence matrix.
encoding: list[int]
Subword tokens of the cleaned document, featurized in the Byte-Pair space.
Returns
-------
final_dict: dict[str, int]
Language probabilities.
"""

document_bag_of_words = numpy.bincount(encoding, minlength=config.VOCAB_SIZE)
Expand Down Expand Up @@ -182,14 +192,18 @@ def _gibbs_sampler(cls,
def infer_language_distribution(self,
doc_path: str) -> Dict[str, int]:
"""
Infers document language labels by invoking the Gibbs sampler sub-routine and thresholding the probabilities
based on the configured threshold.
Parameters
----------
doc_path
doc_path: str
The file path to the document.
Returns
-------
result_dict: dict[str, int]
Language probabilities thresholded by configuration threshold.
"""

document_encoding = copy.deepcopy(self._process_document(self.configuration, doc_path, self.parser))
Expand Down

0 comments on commit 67ac877

Please sign in to comment.