Commit

made various changes to address fred's comments.

ZachEichen committed Jul 1, 2021
1 parent eeec4ae commit ff73d7b
Showing 3 changed files with 100 additions and 51 deletions.
51 changes: 29 additions & 22 deletions text_extensions_for_pandas/cleaning/analysis.py
@@ -13,19 +13,13 @@
# limitations under the License.
#
###############################################################################
-# util.py
-#
-# Cleaning utilities for finding errors in varied corpora
+# analysis.py
+#
+# Cleaning utilities for analyzing model outputs and flagging potentially incorrect
+# labels

import numpy as np
import pandas as pd
-import sklearn.random_projection
-import sklearn.pipeline
-import sklearn.linear_model
-import sklearn.metrics
-import transformers
-from transformers.utils.dummy_pt_objects import torch_distributed_zero_first

import text_extensions_for_pandas as tp

@@ -41,25 +35,26 @@ def create_f1_score_report(
predicted_features: pd.DataFrame,
corpus_label_col: str,
predicted_label_col: str,
-print_output: bool = False,
):
"""
Takes in a set of non-IOB formatted documents such as those returned by
`infer_and_extract_entities` as well as two column names and returns a
-pandas DataFrame with the per-category precision, recall and F1 scores.
+Pandas DataFrame with the per-category precision, recall and F1 scores.
+Requires sklearn.metrics.
-if desired, a printout of the dataframe is printed as output.
:param predicted_features: a DataFrame containing predicted outputs from
the model, as well as the corpus labels for those same elements
:param corpus_label_col: the name of the `predicted_features` column that
contains the corpus labels for the entity types
:param predicted_label_col: the name of the `predicted_features` column that
contains the predicted labels for the entity types
-:param print_output: if true, the dataframe will be printed.
:returns: a DataFrame containing four columns: `'precision'`, `'recall'`,
`'f1-score'` and `'support'`, with one row for each entity type, as well as
three additional rows containing accuracy, micro-averaged and macro-averaged
scores.
"""
+import sklearn.metrics

df = pd.DataFrame(
sklearn.metrics.classification_report(
predicted_features[corpus_label_col],
@@ -68,8 +63,6 @@ def create_f1_score_report(
zero_division=0,
)
).transpose()
-if print_output:
-    print(df)
return df
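
Usage sketch for create_f1_score_report (illustrative only; `preds_df` and its two
column names are assumptions, not part of this commit):

from text_extensions_for_pandas.cleaning import analysis

# preds_df: one row per element, with gold and predicted label columns
report = analysis.create_f1_score_report(
    preds_df,
    corpus_label_col="ent_type_gold",
    predicted_label_col="ent_type_pred",
)
print(report)  # one row per entity type, plus accuracy and averaged rows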


@@ -84,10 +77,11 @@ def create_f1_score_report_iob(
Calculates precision, recall and F1 scores for the given predicted elements and model
entities. This function has two modes. In normal operation it calculates class-wise
precision, recall and accuracy figures, as well as globally averaged metrics, and
-returns them as a pandas DataFrame. In the 'Simple' mode, it calculates micro-averaged
+returns them as a Pandas DataFrame. In the 'Simple' mode, it calculates micro-averaged
precision, recall and F1 scores and returns them as a dictionary.
:param predicted_ents: entities returned from the predictions of the model, in the
-form of a pandas DataFrame, with one entity per line, and some sort of 'type' column
+form of a Pandas DataFrame, with one entity per line, and some sort of 'type' column
with a name specified in `entity_type_col_name`
:param corpus_ents: the ground truth entities from the corpus, with one entity per line
and some sort of entity type column
:param span_id_col_names: a list of column names which by themselves will be sufficient
@@ -96,20 +90,33 @@
and `infer_and_extract_entities_iob` from this module
:param entity_type_col_name: the name of a column in both entity DataFrames that identifies
the type of the element.
-:param simple: by default `false`. If `false`, a pandas DataFrame is returned
+:param simple: by default `false`. If `false`, a Pandas DataFrame is returned
with four columns, `'precision'`, `'recall'`, `'f1-score'` and `'support'`,
with one row for each entity type, as well as two additional rows containing
micro-averaged and macro-averaged scores.
If `true`, a dictionary with three elements, `'precision'`, `'recall'` and `'f1-score'`,
is returned.
-:returns: If `simple` is `false`, a pandas DataFrame is returned
+:returns: If `simple` is `false`, a Pandas DataFrame is returned
with four columns, `'precision'`, `'recall'`, `'f1-score'` and `'support'`,
with one row for each entity type, as well as two additional rows containing
micro-averaged and macro-averaged scores.
If `simple` is `true`, a dictionary with three elements, `'precision'`, `'recall'` and `'f1-score'`,
is returned.
"""
# use an inner join to count the number of identical elements.
+# TODO: create a regression test to check zero-predicted-ents behavior
+if predicted_ents.shape[0] == 0:
+    if simple:
+        return {"precision": 0, "recall": 0, "f1-score": 0}
+    else:
+        zero_by_rows = {name: 0 for name in ["Macro-avg", "Micro-avg"]}
+        zeros_df = pd.DataFrame(
+            {
+                name: zero_by_rows
+                for name in ["precision", "recall", "f1-score", "support"]
+            }
+        )
+        return zeros_df
inner = predicted_ents.copy().merge(
    corpus_ents, on=span_id_col_names + [entity_type_col_name], how="inner"
)
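
Usage sketch for the 'Simple' mode (assuming `pred_ents` and `gold_ents` are entity
DataFrames carrying the default `["fold", "doc_num", "span"]` id columns and an
`ent_type` column; the variable and column names are illustrative):

scores = analysis.create_f1_score_report_iob(
    pred_ents,
    gold_ents,
    entity_type_col_name="ent_type",
    simple=True,
)
# e.g. {'precision': 0.91, 'recall': 0.88, 'f1-score': 0.895}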
@@ -171,20 +178,20 @@ def create_f1_report_ensemble_iob(
"""
Given an ensemble of model predictions (in the form of entities) and ground truth
labels, creates a precision-recall-F1 score report for each model, and returns the
-output as a pandas DataFrame. The outputs are of the same form as the simple output
+output as a Pandas DataFrame. The outputs are of the same form as the simple output
from :func:`create_f1_score_report_iob`
:param predicted_ents_by_model: a dictionary from model name (or other unique
identifier) to outputs as produced by
:func:`cleaning.ensemble.infer_and_extract_entities_iob` or analogous.
Must have one of each column in `span_id_col_names` and some entity type column
-:param corpus_ents: the entities given in the corpus. in the form of a pandas DataFrame
+:param corpus_ents: the entities given in the corpus, in the form of a Pandas DataFrame.
Must have one of each column name in `span_id_col_names` and `entity_type_col_name`.
Can be produced by :func:`cleaning.preprocess.combine_raw_spans_docs`
:param span_id_col_names: a list of column names in all input DataFrames by which each
span may be uniquely identified. By default, `["fold", "doc_num", "span"]`
:param entity_type_col_name: the name of the column in the input DataFrames containing
the entity type labels for each entity.
-:returns: a pandas DataFrame with indices of the model names, and columns
+:returns: a Pandas DataFrame with indices of the model names, and columns
`'precision'`, `'recall'` and `'f1-score'`
"""
reports = {
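
Usage sketch (assuming `ents_by_model` maps model names to entity DataFrames of the
shape described above; names are illustrative):

per_model_report = analysis.create_f1_report_ensemble_iob(
    ents_by_model,  # e.g. {"model_32_1": ents_df_a, "model_64_1": ents_df_b}
    gold_ents,
)
# index: model names; columns: 'precision', 'recall', 'f1-score'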
@@ -216,7 +223,7 @@ def flag_suspicious_labels(
correspond with the respective elements in the raw corpus labels. It then
aggregates these model results according to their values and whether or not they
agree with the corpus.
-:returns: two pandas DataFrames:
+:returns: two Pandas DataFrames:
* `in_gold`: a DataFrame listing elements in the corpus but with low agreement
among the models, sorted by least agreement upwards
* `not_in_gold`: a DataFrame listing elements that are not in the corpus labels
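
Call sketch for flag_suspicious_labels (the argument names here are assumptions, since
the full signature is not visible in this diff):

in_gold, not_in_gold = analysis.flag_suspicious_labels(
    ents_by_model,  # ensemble predictions, keyed by model name
    gold_ents,      # corpus labels
)
not_in_gold.head()  # entities many models agree on but the corpus lacks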
49 changes: 40 additions & 9 deletions text_extensions_for_pandas/cleaning/ensemble.py
@@ -13,18 +13,14 @@
# limitations under the License.
#
###############################################################################
-# util.py
+# ensemble.py
#
-# Cleaning utilities for finding errors in varied corpora
+# Cleaning utilities for training and running ensembles of reduced models on BERT
+# embeddings, for use with analysis.py to identify potentially incorrect labels
#

import numpy as np
import pandas as pd
-import sklearn.random_projection
-import sklearn.pipeline
-import sklearn.linear_model
-import sklearn.metrics

import text_extensions_for_pandas as tp

# Always run with the latest version of Text Extensions for Pandas
@@ -41,10 +37,11 @@ def train_reduced_model(
n_components: int,
seed: int,
max_iter: int = 10000,
-) -> sklearn.base.BaseEstimator:
+):
"""
Train a reduced-quality model by putting a Gaussian random projection in
front of the multinomial logistic regression stage of the pipeline.
+Requires the `sklearn` and `ray` packages to run.
:param x_values: input embeddings for training set
:param y_values: integer labels corresponding to embeddings
@@ -57,6 +54,10 @@
input training data with the specified level of dimension reduction
by random projection.
"""
+import sklearn.pipeline
+import sklearn.random_projection
+import sklearn.linear_model

reduce_pipeline = sklearn.pipeline.Pipeline(
[
(
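
Usage sketch (assuming `X` is an array-like of BERT embeddings, one row per token, and
`y` holds the matching integer class labels; both names are illustrative):

from text_extensions_for_pandas.cleaning import ensemble

reduced = ensemble.train_reduced_model(X, y, n_components=64, seed=42)
probs = reduced.predict_proba(X)  # standard sklearn Pipeline API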
@@ -89,6 +90,7 @@ def train_model_ensemble(
Train an ensemble of reduced-quality models by putting a Gaussian
random projection in front of the multinomial logistic regression
stage of the pipelines for a set of models.
+Requires the `sklearn` and `ray` packages to run.
Two lists of model sizes and seeds are given, and the Cartesian product
of the two is the complete set of parameters used to train the models.
@@ -110,6 +112,7 @@
"""

import ray # TODO: put a note about this in the docstring
+import sklearn.pipeline

# input logic
if model_sizes is None:
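
Usage sketch (the positional arguments `X` and `y` are assumptions; `model_sizes` and
`seeds` appear in the signature above):

models = ensemble.train_model_ensemble(
    X,
    y,
    model_sizes=[32, 64, 128],
    seeds=[1, 2, 3],
)  # Cartesian product: nine reduced models, trained in parallel via ray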
@@ -176,6 +179,11 @@ def infer_on_df(
:param iob: a boolean value; when set to `True`, additional logic for IOB-formatted
classes is activated
:param embeddings_col: the column in `df` that contains BERT embeddings for that document
+:returns: a Pandas DataFrame, mirroring `df`, and containing three extra columns:
+* `'predicted_id'`: the id, as predicted by the model, of the categorical element
+* `'predicted_class'`: the predicted categorical value corresponding to
+`predicted_id`
+* `'raw_output'`: a TensorArray containing the raw output vectors from the model
"""
result_df = df.copy()
raw_outputs = tp.TensorArray(predictor.predict_proba(result_df[embeddings_col]))
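
Usage sketch (assuming `token_df` carries an "embedding" column of BERT embeddings;
the `id_to_class_dict` argument is an assumption, mirroring the mapping used elsewhere
in this module):

result = ensemble.infer_on_df(
    token_df,
    id_to_class_dict=id_to_class,
    predictor=reduced,
    iob=True,
    embeddings_col="embedding",
)
# result gains 'predicted_id', 'predicted_class' and 'raw_output' columns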
@@ -221,10 +229,18 @@ def infer_and_extract_raw_entites(
:param raw_span_id_col: the name of the column of `doc` containing some identifier of the raw
token that each BERT token came from.
:param agg_func: if specified, a function that takes in a series of TensorArrays and returns a
-pandas-compatible type; used to aggregate the predictions of multiple subtokens when
+Pandas-compatible type; used to aggregate the predictions of multiple subtokens when
multiple subtokens all describe the same original token.
:param keep_cols: any column that you wish to be carried over to the output DataFrame; by default
the column 'raw_span' is the only column to be carried over, if it exists.
+:returns: a Pandas DataFrame containing a set of entities aligned with the original
+tokenization of the document, containing the following columns:
+* `'predicted_id'`: the id number of the predicted element
+* `'raw_output'`: a vector of prediction 'probabilities' from the model. If the
+entity span covers multiple tokens, it is aggregated using `agg_func`
+* `'predicted_class'`: the class of the entity, matching `predicted_id`, and converted
+using `id_to_class_dict`
+* any columns specified in `keep_cols`
"""
if agg_func is None:

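Call sketch (only `raw_span_id_col`, `agg_func` and `keep_cols` are documented above;
the remaining argument names are assumptions):

raw_ents = ensemble.infer_and_extract_raw_entites(
    token_df,
    predictor=reduced,
    id_to_class_dict=id_to_class,
    raw_span_id_col="raw_token_id",
    keep_cols=["raw_span"],
)  # one row per original (pre-subtoken) token, with aggregated predictions
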
@@ -308,6 +324,12 @@ def extract_entities_iob(
the tokens of those documents as spans.
:param iob_col: the column containing the predicted IOB values from the model
:param entity_type_col: the column containing the predicted element types from the model
+:returns: a Pandas DataFrame containing the extracted entities from the predicted IOB
+tags, with each IOB-labelled element as its own line, and with the following columns:
+* `'span'`: the spans of the entities flagged by the model
+* `'ent_type'`: the predicted type of the flagged entity
+as well as two columns containing the fold and doc numbers for each element, using
+the same names specified in `fold_col` and `doc_col` respectively
"""

# create predicted spans using inference
@@ -330,6 +352,7 @@
pred_aligned_doc = tp.io.bert.align_bert_tokens_to_corpus_tokens(
pred_spans, raw_docs[fold][doc_num].rename({raw_docs_span_col_name: "span"})
)
+pred_aligned_doc = pred_aligned_doc.rename(columns={"ent_type": entity_type_col})
pred_aligned_doc[[fold_col, doc_col]] = [fold, doc_num]
pred_dfs.append(pred_aligned_doc)
result_df = pd.concat(pred_dfs)
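
Usage sketch (the column names passed here are illustrative):

ents = ensemble.extract_entities_iob(
    result,     # per-token predictions, e.g. from infer_on_df
    raw_docs,   # corpus documents, indexed by fold and doc number
    iob_col="predicted_iob",
    entity_type_col="predicted_ent_type",
)  # one row per extracted entity, with 'span' and entity-type columns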
@@ -369,6 +392,14 @@ def infer_and_extract_entities_iob(
:param predict_on_col: the name of the column of `doc` containing the BERT embedding of that token
:param raw_docs_span_col_name: the name of the column of the documents in `raw_docs` containing
the tokens of those documents as spans.
+:returns: a Pandas DataFrame containing the predicted entities from the model,
+converted from IOB format with each element as its own line,
+and with the following columns:
+* `'span'`: the spans of the entities flagged by the model
+* `'ent_type'`: the predicted type of the flagged entity
+as well as two columns containing the fold and doc numbers for each element, using
+the same names specified in `fold_col` and `doc_col` respectively
"""

df = doc.copy()
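
End-to-end call sketch (argument names beyond those documented above are assumptions):

ents = ensemble.infer_and_extract_entities_iob(
    token_df,
    raw_docs,
    predictor=reduced,
    predict_on_col="embedding",
)  # inference plus IOB-to-entity conversion in one step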
