broke up util.py into smaller, more descriptive modules
ZachEichen committed Jun 23, 2021
1 parent ae0483a commit 310cee8
Showing 5 changed files with 801 additions and 735 deletions.
29 changes: 29 additions & 0 deletions text_extensions_for_pandas/cleaning/__init__.py
@@ -0,0 +1,29 @@
#
# Copyright (c) 2020 IBM Corp.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

################################################################################
# cleaning module
#
# Functions in text_extensions_for_pandas for preprocessing corpora, training
# models, and analyzing model outputs to find labeling errors.

# Expose the public APIs that users should get from importing the top-level
# library.

from text_extensions_for_pandas.cleaning import train
from text_extensions_for_pandas.cleaning import analysis
from text_extensions_for_pandas.cleaning import preprocess

__all__ = ['train', 'analysis', 'preprocess']
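For reference, a minimal sketch of how downstream code might import the new subpackage; the sketch itself is not part of this commit, and only the `analysis` submodule's contents appear below:

# Import the cleaning subpackage's submodules directly, mirroring __all__ above.
from text_extensions_for_pandas.cleaning import analysis, preprocess, train

# The analysis submodule provides the error-analysis helpers defined in
# analysis.py below, e.g.:
#   analysis.create_f1_score_report(...)
#   analysis.create_f1_score_report_iob(...)
#   analysis.flag_suspicious_labels(...)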
209 changes: 209 additions & 0 deletions text_extensions_for_pandas/cleaning/analysis.py
@@ -0,0 +1,209 @@
#
# Copyright (c) 2021 IBM Corp.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
# analysis.py
#
# Cleaning utilities for analyzing model predictions and finding labeling
# errors in varied corpora
#

import numpy as np
import pandas as pd
import sklearn.random_projection
import sklearn.pipeline
import sklearn.linear_model
import sklearn.metrics
import transformers

import text_extensions_for_pandas as tp

# Always run with the latest version of Text Extensions for Pandas
import importlib

tp = importlib.reload(tp)

from typing import Dict, List


def create_f1_score_report(
    predicted_features: pd.DataFrame,
    corpus_label_col: str,
    predicted_label_col: str,
    print_output: bool = False,
):
    """
    Takes in a DataFrame of features over a set of non-IOB-formatted documents,
    such as those returned by `infer_and_extract_entities`, as well as the names
    of the corpus label column and the predicted label column. Computes a
    class-wise precision/recall/F1 report comparing the two columns (via
    `sklearn.metrics.classification_report`), optionally prints it, and returns
    it as a pandas DataFrame.
    """
    if print_output:
        print(
            sklearn.metrics.classification_report(
                predicted_features[corpus_label_col],
                predicted_features[predicted_label_col],
                zero_division=0,
            )
        )
    return pd.DataFrame(
        sklearn.metrics.classification_report(
            predicted_features[corpus_label_col],
            predicted_features[predicted_label_col],
            output_dict=True,
            zero_division=0,
        )
    )

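A hypothetical call for illustration; the column names and data below are invented and not part of the library:

import pandas as pd
from text_extensions_for_pandas.cleaning import analysis

# Hypothetical token-level features with gold and predicted labels.
tokens_df = pd.DataFrame(
    {
        "ent_type": ["PER", "O", "ORG", "O"],         # corpus ("gold") labels
        "predicted_ent_type": ["PER", "O", "O", "O"],  # model predictions
    }
)

report = analysis.create_f1_score_report(
    tokens_df, "ent_type", "predicted_ent_type", print_output=True
)
print(report)  # class-wise precision/recall/F1 as a DataFrame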

def create_f1_score_report_iob(
    predicted_ents: pd.DataFrame,
    corpus_ents: pd.DataFrame,
    span_id_col_names: List[str] = ["fold", "doc_num", "span"],
    entity_type_col_name: str = "ent_type",
    simple: bool = False,
):
    """
    Calculates precision, recall, and F1 scores for the given predicted and corpus
    entities. This function has two modes. In normal operation it calculates
    class-wise precision, recall, and F1 figures, as well as micro- and
    macro-averaged metrics, and returns them as a pandas DataFrame. In 'simple'
    mode it calculates micro-averaged precision, recall, and F1 scores and
    returns them as a dictionary.
    :param predicted_ents: entities predicted by the model, as a pandas DataFrame
     with one entity per row and some sort of entity type column
    :param corpus_ents: the ground-truth entities from the corpus, with one entity
     per row and the same entity type column
    :param span_id_col_names: a list of column names that together are sufficient
     to uniquely identify each entity; by default `['fold', 'doc_num', 'span']`,
     to be compatible with the outputs of `combine_raw_spans_docs`
     and `infer_and_extract_entities_iob` from this module
    :param entity_type_col_name: the name of a column, present in both entity
     DataFrames, that identifies the type of each entity
    :param simple: `False` by default. If `False`, a full report is generated,
     with per-entity-type precision, recall, and F1 scores as well as averaged
     metrics. If `True`, a dictionary with the three keys `'precision'`,
     `'recall'`, and `'f1_score'` is returned.
    :returns: if `simple` is `False`, a pandas DataFrame with per-entity-type and
     averaged precision, recall, and F1 scores; if `simple` is `True`, a
     dictionary with the keys `'precision'`, `'recall'`, and `'f1_score'`.
    """
    # Use an inner join to count the entities that match exactly (true positives).
    inner = predicted_ents.copy().merge(
        corpus_ents, on=span_id_col_names + [entity_type_col_name], how="inner"
    )
    if simple:
        res_dict = {}
        res_dict["precision"] = inner.shape[0] / predicted_ents.shape[0]
        res_dict["recall"] = inner.shape[0] / corpus_ents.shape[0]
        res_dict["f1_score"] = (
            2 * res_dict["precision"] * res_dict["recall"]
            / (res_dict["precision"] + res_dict["recall"])
        )
        return res_dict
    inner["true_positives"] = 1
    inner_counts = inner.groupby(entity_type_col_name).agg({"true_positives": "count"})

    # Copy the inputs so the bookkeeping columns below don't leak into the
    # caller's DataFrames.
    pos = predicted_ents.copy()
    pos["predicted_positives"] = 1
    positive_counts = pos.groupby(entity_type_col_name).agg({"predicted_positives": "count"})

    actuals = corpus_ents.copy()
    actuals["actual_positives"] = 1
    actual_counts = actuals.groupby(entity_type_col_name).agg({"actual_positives": "count"})

    stats = pd.concat([inner_counts, positive_counts, actual_counts], axis=1)
    # add micro average (sums of the per-class counts)
    micro = stats.sum()
    micro.name = "Micro-avg"
    stats = stats.append(micro)
    # per-class (and micro-averaged) precision and recall
    stats["precision"] = stats.true_positives / stats.predicted_positives
    stats["recall"] = stats.true_positives / stats.actual_positives
    # macro average over the per-class rows only (exclude the micro-average row)
    macro = stats.drop(index="Micro-avg").mean()
    macro.name = "Macro-avg"
    stats = stats.append(macro)
    # F1 is the harmonic mean of precision and recall
    stats["f1_score"] = 2 * (stats.precision * stats.recall) / (stats.precision + stats.recall)
    stats["support"] = stats["actual_positives"]
    stats.loc["Micro-avg":"Macro-avg", "support"] = pd.NA
    # drop the raw count columns before returning
    stats = stats.drop(columns=[col for col in stats.columns if "positives" in col])
    return stats

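For illustration, a hypothetical call in 'simple' mode; the entity DataFrames below are fabricated, with spans shown as plain strings rather than span objects:

import pandas as pd
from text_extensions_for_pandas.cleaning import analysis

# Hypothetical extracted entities, one row per entity, keyed by the default
# ("fold", "doc_num", "span") columns plus an "ent_type" column.
corpus_ents = pd.DataFrame(
    {
        "fold": ["train", "train", "train"],
        "doc_num": [0, 0, 1],
        "span": ["[0, 5): 'Alice'", "[10, 13): 'IBM'", "[3, 9): 'Boston'"],
        "ent_type": ["PER", "ORG", "LOC"],
    }
)
predicted_ents = corpus_ents.iloc[:2].copy()  # model found only the first two

scores = analysis.create_f1_score_report_iob(predicted_ents, corpus_ents, simple=True)
print(scores)  # {'precision': 1.0, 'recall': 0.666..., 'f1_score': 0.8}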

def flag_suspicious_labels(
    predicted_features: Dict[str, pd.DataFrame],
    corpus_label_col: str,
    predicted_label_col: str,
    label_name=None,
    gold_feats: pd.DataFrame = None,
    align_over_cols: List[str] = ["fold", "doc_num", "raw_span_id"],
    keep_cols: List[str] = ["raw_span"],
):
    """
    Takes in the outputs of a number of models and correlates the elements they
    correspond to with the respective elements in the raw corpus labels. It then
    aggregates these model results according to their values and whether or not
    they agree with the corpus.
    :param predicted_features: dictionary mapping model names to DataFrames of
     that model's predictions, one element per row
    :param corpus_label_col: name of the column holding the corpus ("gold") labels
    :param predicted_label_col: name of the column holding each model's predicted
     labels
    :param label_name: name to give the combined label column in the outputs;
     defaults to `"class"`
    :param gold_feats: DataFrame to take the corpus labels from; if `None`, the
     first DataFrame in `predicted_features` is used
    :param align_over_cols: columns that together uniquely identify an element
     across models
    :param keep_cols: additional columns to carry through to the outputs
    :returns: two pandas DataFrames:
     * `in_gold`: a DataFrame listing elements in the corpus but with low agreement
       among the models, sorted by least agreement upwards
     * `not_in_gold`: a DataFrame listing elements that are not in the corpus labels
       but for which there is high agreement among the models of their existence
     These DataFrames have the following columns:
     * `in_gold`: boolean value of whether or not the element is in the corpus
       "gold standard"
     * `count`: the number of models in agreement on this datapoint
     * `models`: the list of names of the models in agreement on that datapoint,
       as listed by their names in the `predicted_features` dictionary
    """
    df_cols = align_over_cols + keep_cols
    if label_name is None:
        label_name = "class"
    # create the gold-features DataFrame
    if gold_feats is None:
        gold_feats = predicted_features[list(predicted_features.keys())[0]]
    gold_df = gold_feats[df_cols + [corpus_label_col]].copy()
    gold_df["models"] = "GOLD"
    gold_df["in_gold"] = True
    gold_df.rename(columns={corpus_label_col: label_name}, inplace=True)
    # create a list of feature DataFrames, starting with the gold labels
    features_list = [gold_df]
    # now populate that list with the features from each model
    for model_name in predicted_features.keys():
        model_pred_df = predicted_features[model_name][
            df_cols + [predicted_label_col]
        ].copy()
        model_pred_df["models"] = model_name
        model_pred_df["in_gold"] = False
        model_pred_df.rename(columns={predicted_label_col: label_name}, inplace=True)
        features_list.append(model_pred_df)
    # concatenate the per-model DataFrames and aggregate them with a groupby;
    # gold rows get a count of zero so that "count" measures model agreement only
    all_features = pd.concat(features_list)
    all_features["count"] = 1
    all_features.loc[all_features.in_gold, "count"] = 0
    # create the groupby aggregation dict
    aggby = {"in_gold": "any", "count": "sum", "models": lambda x: list(x)}
    aggby.update({col: "first" for col in keep_cols})
    # now group by element identity and label value
    grouped_features = (
        all_features.groupby(align_over_cols + [label_name]).agg(aggby).reset_index()
    )
    grouped_features.sort_values(
        ["count"] + align_over_cols, ascending=False, inplace=True
    )
    in_gold = grouped_features[grouped_features.in_gold].sort_values(
        "count", ascending=True, kind="mergesort"
    )
    not_in_gold = grouped_features[~grouped_features.in_gold].sort_values(
        "count", ascending=False, kind="mergesort"
    )
    return in_gold, not_in_gold
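A hypothetical end-to-end call for illustration; the model names, column values, and data below are invented and not part of the library:

import pandas as pd
from text_extensions_for_pandas.cleaning import analysis

# Two hypothetical models' outputs over the same three elements, using the
# default fold / doc_num / raw_span_id / raw_span column names.
def _make_preds(predicted_labels):
    return pd.DataFrame(
        {
            "fold": ["train", "train", "train"],
            "doc_num": [0, 0, 0],
            "raw_span_id": [0, 1, 2],
            "raw_span": ["Alice", "visited", "IBM"],
            "ent_type": ["PER", "O", "ORG"],         # corpus ("gold") labels
            "predicted_ent_type": predicted_labels,  # this model's predictions
        }
    )

predicted_features = {
    "model_a": _make_preds(["PER", "O", "ORG"]),
    "model_b": _make_preds(["PER", "O", "O"]),
}

in_gold, not_in_gold = analysis.flag_suspicious_labels(
    predicted_features, "ent_type", "predicted_ent_type"
)
# in_gold: corpus labels with the least model agreement (possible annotation errors)
# not_in_gold: labels the models predict that are absent from the corpus
print(in_gold[["raw_span", "class", "count", "models"]])
print(not_in_gold[["raw_span", "class", "count", "models"]])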
