Commit

made various changes to address fred's comments.

ZachEichen committed Jul 1, 2021
1 parent eeec4ae commit ff73d7b
Showing 3 changed files with 100 additions and 51 deletions.
51 changes: 29 additions & 22 deletions text_extensions_for_pandas/cleaning/analysis.py
@@ -13,19 +13,13 @@
# limitations under the License.
#
###############################################################################
-# util.py
-#
-# Cleaning utilities for finding errors in varied corpora
+# analysis.py
+#
+# Cleaning utilities for analyzing model outputs and flagging potentially incorrect
+# labels

import numpy as np
import pandas as pd
-import sklearn.random_projection
-import sklearn.pipeline
-import sklearn.linear_model
-import sklearn.metrics
-import transformers
-from transformers.utils.dummy_pt_objects import torch_distributed_zero_first

import text_extensions_for_pandas as tp

@@ -41,25 +35,26 @@ def create_f1_score_report(
predicted_features: pd.DataFrame,
corpus_label_col: str,
predicted_label_col: str,
-print_output: bool = False,
):
"""
Takes in a set of non-IOB formatted documents such as those returned by
`infer_and_extract_entities` as well as two column names and returns a
-pandas DataFrame with the per-category precision, recall and F1 scores.
+Pandas DataFrame with the per-category precision, recall and F1 scores.
+Requires sklearn.metrics.
-if desired, a printout of the dataframe is printed as output.
:param predicted_features: a DataFrame containing predicted outputs from
the model, as well as the corpus labels for those same elements
:param corpus_label_col: the name of the `predicted_features` column that
contains the corpus labels for the entity types
:param predicted_label_col: the name of the `predicted_features` column that
contains the predicted labels for the entity types
-:param print_output: if true, the dataframe will be printed.
:returns: a DataFrame containing four columns: `'precision'`, `'recall'`,
`'f1-score'` and `'support'`, with one row for each entity type, as well as
three additional rows containing accuracy, micro-averaged and macro-averaged
scores.
"""
+import sklearn.metrics

df = pd.DataFrame(
sklearn.metrics.classification_report(
predicted_features[corpus_label_col],
@@ -68,8 +63,6 @@ def create_f1_score_report(
zero_division=0,
)
).transpose()
-if print_output:
-    print(df)
return df
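
Usage sketch for create_f1_score_report (illustrative only; `preds_df` and its two
column names are assumptions, not part of this commit):

from text_extensions_for_pandas.cleaning import analysis

# preds_df: one row per element, with gold and predicted label columns
report = analysis.create_f1_score_report(
    preds_df,
    corpus_label_col="ent_type_gold",
    predicted_label_col="ent_type_pred",
)
print(report)  # one row per entity type, plus accuracy and averaged rows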


@@ -84,10 +77,11 @@ def create_f1_score_report_iob(
Calculates precision, recall and F1 scores for the given predicted elements and model
entities. This function has two modes. In normal operation it calculates class-wise
precision, recall and accuracy figures, as well as globally averaged metrics, and
-returns them as a pandas DataFrame. In the 'Simple' mode, it calculates micro-averaged
+returns them as a Pandas DataFrame. In the 'Simple' mode, it calculates micro-averaged
precision, recall and F1 scores and returns them as a dictionary.
:param predicted_ents: entities returned from the predictions of the model, in the
-form of a pandas DataFrame, with one entity per line, and some sort of 'type' column
+form of a Pandas DataFrame, with one entity per line, and some sort of 'type' column
with a name specified in `entity_type_col_name`
:param corpus_ents: the ground truth entities from the corpus, with one entity per line
and some sort of entity type column
:param span_id_col_names: a list of column names which by themselves will be sufficient
@@ -96,20 +90,33 @@
and `infer_and_extract_entities_iob` from this module
:param entity_type_col_name: the name of a column in both entity DataFrames that identifies
the type of the element.
-:param simple: by default `false`. If `false`, a pandas DataFrame is returned
+:param simple: by default `false`. If `false`, a Pandas DataFrame is returned
with four columns, `'precision'`, `'recall'`, `'f1-score'` and `'support'`,
with one row for each entity type, as well as two additional rows containing
micro-averaged and macro-averaged scores.
If `true`, a dictionary with three elements, `'precision'`, `'recall'` and `'f1-score'`,
is returned.
-:returns: If `simple` is `false`, a pandas DataFrame is returned
+:returns: If `simple` is `false`, a Pandas DataFrame is returned
with four columns, `'precision'`, `'recall'`, `'f1-score'` and `'support'`,
with one row for each entity type, as well as two additional rows containing
micro-averaged and macro-averaged scores.
If `simple` is `true`, a dictionary with three elements, `'precision'`, `'recall'` and `'f1-score'`,
is returned.
"""
# use an inner join to count the number of identical elements.
+# TODO: create a regression test to check zero-predicted-ents behavior
+if predicted_ents.shape[0] == 0:
+    if simple:
+        return {"precision": 0, "recall": 0, "f1-score": 0}
+    else:
+        zero_by_rows = {name: 0 for name in ["Macro-avg", "Micro-avg"]}
+        zeros_df = pd.DataFrame(
+            {
+                name: zero_by_rows
+                for name in ["precision", "recall", "f1-score", "support"]
+            }
+        )
+        return zeros_df
inner = predicted_ents.copy().merge(
    corpus_ents, on=span_id_col_names + [entity_type_col_name], how="inner"
)
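
Usage sketch for the 'Simple' mode (assuming `pred_ents` and `gold_ents` are entity
DataFrames carrying the default `["fold", "doc_num", "span"]` id columns and an
`ent_type` column; the variable and column names are illustrative):

scores = analysis.create_f1_score_report_iob(
    pred_ents,
    gold_ents,
    entity_type_col_name="ent_type",
    simple=True,
)
# e.g. {'precision': 0.91, 'recall': 0.88, 'f1-score': 0.895}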
@@ -171,20 +178,20 @@ def create_f1_report_ensemble_iob(
"""
Given an ensemble of model predictions (in the form of entities) and ground truth
labels, creates a precision-recall-F1 score report for each model, and returns the
-output as a pandas DataFrame. The outputs are of the same form as the simple output
+output as a Pandas DataFrame. The outputs are of the same form as the simple output
from :func:`create_f1_score_report_iob`
:param predicted_ents_by_model: a dictionary from model name (or other unique
identifier) to outputs as produced by
:func:`cleaning.ensemble.infer_and_extract_entities_iob` or analogous.
Must have one of each column in `span_id_col_names` and some entity type column
-:param corpus_ents: the entities given in the corpus. in the form of a pandas DataFrame
+:param corpus_ents: the entities given in the corpus, in the form of a Pandas DataFrame.
Must have one of each column name in `span_id_col_names` and `entity_type_col_name`.
Can be produced by :func:`cleaning.preprocess.combine_raw_spans_docs`
:param span_id_col_names: a list of column names in all input DataFrames by which each
span may be uniquely identified. By default, `["fold", "doc_num", "span"]`
:param entity_type_col_name: the name of the column in the input DataFrames containing
the entity type labels for each entity.
-:returns: a pandas DataFrame with indices of the model names, and columns
+:returns: a Pandas DataFrame with indices of the model names, and columns
`'precision'`, `'recall'` and `'f1-score'`
"""
reports = {
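
Usage sketch (assuming `ents_by_model` maps model names to entity DataFrames of the
shape described above; names are illustrative):

per_model_report = analysis.create_f1_report_ensemble_iob(
    ents_by_model,  # e.g. {"model_32_1": ents_df_a, "model_64_1": ents_df_b}
    gold_ents,
)
# index: model names; columns: 'precision', 'recall', 'f1-score'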
@@ -216,7 +223,7 @@ def flag_suspicious_labels(
correspond with the respective elements in the raw corpus labels. It then
aggregates these model results according to their values and whether or not they
agree with the corpus.
-:returns: two pandas DataFrames:
+:returns: two Pandas DataFrames:
* `in_gold`: a DataFrame listing elements in the corpus but with low agreement
among the models, sorted by least agreement upwards
* `not_in_gold`: a DataFrame listing elements that are not in the corpus labels
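
Call sketch for flag_suspicious_labels (the argument names here are assumptions, since
the full signature is not visible in this diff):

in_gold, not_in_gold = analysis.flag_suspicious_labels(
    ents_by_model,  # ensemble predictions, keyed by model name
    gold_ents,      # corpus labels
)
not_in_gold.head()  # entities many models agree on but the corpus lacks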
49 changes: 40 additions & 9 deletions text_extensions_for_pandas/cleaning/ensemble.py
@@ -13,18 +13,14 @@
# limitations under the License.
#
###############################################################################
-# util.py
+# ensemble.py
#
-# Cleaning utilities for finding errors in varied corpora
+# Cleaning utilities for training and running ensembles of reduced models on BERT
+# embeddings, for use with analysis.py to identify potentially incorrect labels
#

import numpy as np
import pandas as pd
-import sklearn.random_projection
-import sklearn.pipeline
-import sklearn.linear_model
-import sklearn.metrics

import text_extensions_for_pandas as tp

# Always run with the latest version of Text Extensions for Pandas
@@ -41,10 +37,11 @@ def train_reduced_model(
n_components: int,
seed: int,
max_iter: int = 10000,
-) -> sklearn.base.BaseEstimator:
+):
"""
Train a reduced-quality model by putting a Gaussian random projection in
front of the multinomial logistic regression stage of the pipeline.
+Requires the `sklearn` and `ray` packages to run.
:param x_values: input embeddings for training set
:param y_values: integer labels corresponding to embeddings
@@ -57,6 +54,10 @@
input training data with the specified level of dimension reduction
by random projection.
"""
+import sklearn.pipeline
+import sklearn.random_projection
+import sklearn.linear_model

reduce_pipeline = sklearn.pipeline.Pipeline(
[
(
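
Usage sketch (assuming `X` is an array-like of BERT embeddings, one row per token, and
`y` holds the matching integer class labels; both names are illustrative):

from text_extensions_for_pandas.cleaning import ensemble

reduced = ensemble.train_reduced_model(X, y, n_components=64, seed=42)
probs = reduced.predict_proba(X)  # standard sklearn Pipeline API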
@@ -89,6 +90,7 @@ def train_model_ensemble(
Train an ensemble of reduced-quality models by putting a Gaussian
random projection in front of the multinomial logistic regression
stage of the pipelines for a set of models.
+Requires the `sklearn` and `ray` packages to run.
Two lists of model sizes and seeds are given, and the Cartesian product
of the two is the complete set of parameters used to train the models.
@@ -110,6 +112,7 @@
"""

import ray # TODO: put a note about this in the docstring
+import sklearn.pipeline

# input logic
if model_sizes is None:
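
Usage sketch (the positional arguments `X` and `y` are assumptions; `model_sizes` and
`seeds` appear in the signature above):

models = ensemble.train_model_ensemble(
    X,
    y,
    model_sizes=[32, 64, 128],
    seeds=[1, 2, 3],
)  # Cartesian product: nine reduced models, trained in parallel via ray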
@@ -176,6 +179,11 @@ def infer_on_df(
:param iob: a boolean value; when set to `True`, additional logic for IOB-formatted
classes is activated
:param embeddings_col: the column in `df` that contains BERT embeddings for that document
+:returns: a Pandas DataFrame, mirroring `df`, and containing three extra columns:
+* `'predicted_id'`: the id, as predicted by the model, of the categorical element
+* `'predicted_class'`: the predicted categorical value corresponding to
+`predicted_id`
+* `'raw_output'`: a TensorArray containing the raw output vectors from the model
"""
result_df = df.copy()
raw_outputs = tp.TensorArray(predictor.predict_proba(result_df[embeddings_col]))
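
Usage sketch (assuming `token_df` carries an "embedding" column of BERT embeddings;
the `id_to_class_dict` argument is an assumption, mirroring the mapping used elsewhere
in this module):

result = ensemble.infer_on_df(
    token_df,
    id_to_class_dict=id_to_class,
    predictor=reduced,
    iob=True,
    embeddings_col="embedding",
)
# result gains 'predicted_id', 'predicted_class' and 'raw_output' columns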
@@ -221,10 +229,18 @@ def infer_and_extract_raw_entites(
:param raw_span_id_col: the name of the column of `doc` containing some identifier of the raw
token that each BERT token came from.
:param agg_func: if specified, a function that takes in a series of TensorArrays and returns a
-pandas-compatible type; used to aggregate the predictions of multiple subtokens when
+Pandas-compatible type; used to aggregate the predictions of multiple subtokens when
multiple subtokens all describe the same original token.
:param keep_cols: any column that you wish to be carried over to the output DataFrame; by default
the column 'raw_span' is the only column to be carried over, if it exists.
+:returns: a Pandas DataFrame containing a set of entities aligned with the original
+tokenization of the document, containing the following columns:
+* `'predicted_id'`: the id number of the predicted element
+* `'raw_output'`: a vector of prediction 'probabilities' from the model. If the
+entity span covers multiple tokens, it is aggregated using `agg_func`
+* `'predicted_class'`: the class of the entity, matching `predicted_id`, and converted
+using `id_to_class_dict`
+* any columns specified in `keep_cols`
"""
if agg_func is None:

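Call sketch (only `raw_span_id_col`, `agg_func` and `keep_cols` are documented above;
the remaining argument names are assumptions):

raw_ents = ensemble.infer_and_extract_raw_entites(
    token_df,
    predictor=reduced,
    id_to_class_dict=id_to_class,
    raw_span_id_col="raw_token_id",
    keep_cols=["raw_span"],
)  # one row per original (pre-subtoken) token, with aggregated predictions
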
@@ -308,6 +324,12 @@ def extract_entities_iob(
the tokens of those documents as spans.
:param iob_col: the column containing the predicted IOB values from the model
:param entity_type_col: the column containing the predicted element types from the model
+:returns: a Pandas DataFrame containing the extracted entities from the predicted IOB
+tags, with each IOB-labelled element as its own line, and with the following columns:
+* `'span'`: the spans of the entities flagged by the model
+* `'ent_type'`: the predicted type of the flagged entity
+as well as two columns containing the fold and doc numbers for each element, using
+the same names specified in `fold_col` and `doc_col` respectively
"""

# create predicted spans using inference
@@ -330,6 +352,7 @@
pred_aligned_doc = tp.io.bert.align_bert_tokens_to_corpus_tokens(
pred_spans, raw_docs[fold][doc_num].rename({raw_docs_span_col_name: "span"})
)
+pred_aligned_doc = pred_aligned_doc.rename(columns={"ent_type": entity_type_col})
pred_aligned_doc[[fold_col, doc_col]] = [fold, doc_num]
pred_dfs.append(pred_aligned_doc)
result_df = pd.concat(pred_dfs)
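
Usage sketch (the column names passed here are illustrative):

ents = ensemble.extract_entities_iob(
    result,     # per-token predictions, e.g. from infer_on_df
    raw_docs,   # corpus documents, indexed by fold and doc number
    iob_col="predicted_iob",
    entity_type_col="predicted_ent_type",
)  # one row per extracted entity, with 'span' and entity-type columns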
@@ -369,6 +392,14 @@ def infer_and_extract_entities_iob(
:param predict_on_col: the name of the column of `doc` containing the BERT embedding of that token
:param raw_docs_span_col_name: the name of the column of the documents in `raw_docs` containing
the tokens of those documents as spans.
+:returns: a Pandas DataFrame containing the predicted entities from the model,
+converted from IOB format with each element as its own line,
+and with the following columns:
+* `'span'`: the spans of the entities flagged by the model
+* `'ent_type'`: the predicted type of the flagged entity
+as well as two columns containing the fold and doc numbers for each element, using
+the same names specified in `fold_col` and `doc_col` respectively
"""

df = doc.copy()
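
End-to-end call sketch (argument names beyond those documented above are assumptions):

ents = ensemble.infer_and_extract_entities_iob(
    token_df,
    raw_docs,
    predictor=reduced,
    predict_on_col="embedding",
)  # inference plus IOB-to-entity conversion in one step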
