From eeec4ae6049c3ba7d37008489642b090db4f410f Mon Sep 17 00:00:00 2001 From: ZachEichen Date: Wed, 30 Jun 2021 11:50:18 -0400 Subject: [PATCH] ran and updated CoNLL4 notebook --- tutorials/corpus/CoNLL_4.ipynb | 3094 +++++++++++++++----------------- 1 file changed, 1474 insertions(+), 1620 deletions(-) diff --git a/tutorials/corpus/CoNLL_4.ipynb b/tutorials/corpus/CoNLL_4.ipynb index 3a4fcc1b..625d6f86 100644 --- a/tutorials/corpus/CoNLL_4.ipynb +++ b/tutorials/corpus/CoNLL_4.ipynb @@ -56,9 +56,8 @@ " \"from the directory containing this notebook, or use a Python \"\n", " \"environment on which you have used `pip` to install the package.\")\n", "\n", - "# Code shared among notebooks is kept in util.py, in this directory.\n", - "import util\n", - "\n", + "from text_extensions_for_pandas import cleaning\n", + " \n", "# BERT Configuration\n", "# Keep this in sync with `CoNLL_3.ipynb`.\n", "#bert_model_name = \"bert-base-uncased\"\n", @@ -156,57 +155,18 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2021-04-08 19:07:45,448\tINFO services.py:1174 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/plain": [ - "{'node_ip_address': '192.168.0.238',\n", - " 'raylet_ip_address': '192.168.0.238',\n", - " 'redis_address': '192.168.0.238:6379',\n", - " 'object_store_address': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089',\n", - " 'metrics_export_port': 63726,\n", - " 'node_id': 'd3d3ce9b64423f35ff87532e918e52feb8744a9104c7b830498fd8c6'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Use Ray to make things faster\n", - "import ray\n", - "if ray.is_initialized():\n", - " ray.shutdown()\n", - "ray.init()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'train'...\n" + "preprocessing fold train\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "623678efe9214ad7a48b6736aaafda82", + "model_id": "2a902c1e0d31464493eb8627e4a8c334", "version_major": 2, "version_minor": 0 }, @@ -217,17 +177,24 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). 
Running this sequence through the model will result in indexing errors\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'dev'...\n" + "preprocessing fold dev\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "26b82e15bcac40ca96f13cc5a8f08dc9", + "model_id": "1cde38752f204098980b845f6995e3b8", "version_major": 2, "version_minor": 0 }, @@ -242,13 +209,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'test'...\n" + "preprocessing fold test\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e79c2d5063054dd5951690b86ce63d2b", + "model_id": "d241a32cd56f41cf9409268d4ff3b702", "version_major": 2, "version_minor": 0 }, @@ -261,199 +228,8 @@ } ], "source": [ - "# Retokenize with the BERT tokenizer and optinally regenerate embeddings.\n", - "\n", - "actor_pool = ray.util.actor_pool.ActorPool([\n", - " util.BertActor.remote(bert_model_name, token_class_dtype, \n", - " compute_embeddings=_REGENERATE_EMBEDDINGS)\n", - " for i in range(multiprocessing.cpu_count())\n", - "])\n", - "\n", - "bert_toks_by_fold = {}\n", - "for fold_name in corpus_raw.keys():\n", - " print(f\"Processing fold '{fold_name}'...\")\n", - " raw = corpus_raw[fold_name]\n", - " for tokens_df in raw:\n", - " actor_pool.submit(lambda a, v: a.process_doc.remote(v), tokens_df)\n", - " bert_toks_by_fold[fold_name] = tp.jupyter.run_with_progress_bar(\n", - " len(raw), lambda i: actor_pool.get_next())\n", - "\n", - " \n", - "# The actors will stay active until their associated Python objects\n", - "# go out of scope and are garbage-collected.\n", - "del actor_pool\n", - "gc.collect(0)\n", - " \n", - "bert_data = bert_toks_by_fold" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
folddoc_numtoken_idinput_idtoken_type_idattention_maskspecial_tokens_maskent_iobent_typetoken_classtoken_class_idembedding
0train0010101TrueO<NA>O0[ -0.098505184, -0.4050192, 0.7428884...
1train0111801FalseO<NA>O0[ -0.057021223, -0.48112097, 0.989868...
2train0214101FalseO<NA>O0[ -0.04824195, -0.25330004, 1.167191...
3train03924401FalseO<NA>O0[ -0.26682988, -0.31008753, 1.007472...
4train04927201FalseO<NA>O0[ -0.22296889, -0.21308492, 0.9331016...
\n", - "
" - ], - "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 0 0 101 0 1 \n", - "1 train 0 1 118 0 1 \n", - "2 train 0 2 141 0 1 \n", - "3 train 0 3 9244 0 1 \n", - "4 train 0 4 9272 0 1 \n", - "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "\n", - " embedding \n", - "0 [ -0.098505184, -0.4050192, 0.7428884... \n", - "1 [ -0.057021223, -0.48112097, 0.989868... \n", - "2 [ -0.04824195, -0.25330004, 1.167191... \n", - "3 [ -0.26682988, -0.31008753, 1.007472... \n", - "4 [ -0.22296889, -0.21308492, 0.9331016... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create a single dataframe of annotated tokens for the entire corpus\n", - "if _REGENERATE_EMBEDDINGS:\n", - " corpus_df = tp.io.conll.combine_folds(bert_data)\n", - " # We can't currently serialize span columns that cover multiple documents (see issue 73),\n", - " # so the Feather file won't contain them. Drop these columns for consistency when\n", - " # we regenerate the embeddings here.\n", - " cols_to_drop = [c for c in corpus_df.columns if \"span\" in c]\n", - " corpus_df.drop(columns=cols_to_drop, inplace=True)\n", - "else:\n", - " # Use embeddings computed in CoNLL_3.ipynb\n", - " _EMBEDDINGS_FILE = \"outputs/corpus.feather\"\n", - " if not os.path.exists(_EMBEDDINGS_FILE):\n", - " raise ValueError(f\"Precomputed embeddings not found at {_EMBEDDINGS_FILE}. \"\n", - " f\"Please rerun CoNLL_3.ipynb to regenerate this file, or \"\n", - " f\"set _REGENERATE_EMBEDDINGS to True in the previous cell.\")\n", - " corpus_df = pd.read_feather(\"outputs/corpus.feather\")\n", - "\n", - "corpus_df.head()" + "# Retokenize with the BERT tokenizer and regenerate embeddings.\n", + "corpus_df,token_class_dtype, int_to_label, label_to_int = cleaning.preprocess.preprocess_documents(corpus_raw,'ent_type',True,carry_cols=['line_num'],iob_col='ent_iob')" ] }, { @@ -467,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -573,7 +349,7 @@ "[1393 rows x 2 columns]" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -683,7 +459,7 @@ "1213 test 51" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -726,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -753,15 +529,19 @@ " fold\n", " doc_num\n", " token_id\n", + " span\n", " input_id\n", " token_type_id\n", " attention_mask\n", " special_tokens_mask\n", + " raw_span\n", + " line_num\n", + " raw_span_id\n", " ent_iob\n", " ent_type\n", + " embedding\n", " token_class\n", " token_class_id\n", - " embedding\n", " \n", " \n", " \n", @@ -770,75 +550,95 @@ " train\n", " 0\n", " 0\n", + " [0, 0): ''\n", " 101\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.098505184, -0.4050192, 0.7428884...\n", " O\n", " 0\n", - " [ -0.098505184, -0.4050192, 0.7428884...\n", " \n", " \n", " 1\n", " train\n", " 0\n", " 1\n", + " [0, 1): '-'\n", " 118\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ 
-0.057021223, -0.48112097, 0.989868...\n", " O\n", " 0\n", - " [ -0.057021223, -0.48112097, 0.989868...\n", " \n", " \n", " 2\n", " train\n", " 0\n", " 2\n", + " [1, 2): 'D'\n", " 141\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.04824195, -0.25330004, 1.167191...\n", " O\n", " 0\n", - " [ -0.04824195, -0.25330004, 1.167191...\n", " \n", " \n", " 3\n", " train\n", " 0\n", " 3\n", + " [2, 4): 'OC'\n", " 9244\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.26682988, -0.31008753, 1.007472...\n", " O\n", " 0\n", - " [ -0.26682988, -0.31008753, 1.007472...\n", " \n", " \n", " 4\n", " train\n", " 0\n", " 4\n", + " [4, 6): 'ST'\n", " 9272\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.22296889, -0.21308492, 0.9331016...\n", " O\n", " 0\n", - " [ -0.22296889, -0.21308492, 0.9331016...\n", " \n", " \n", " ...\n", @@ -854,131 +654,181 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 371472\n", " test\n", " 230\n", " 314\n", + " [1386, 1393): 'brother'\n", " 1711\n", " 0\n", " 1\n", " False\n", + " [1386, 1393): 'brother'\n", + " 50345.0\n", + " 267.0\n", " O\n", " <NA>\n", + " [ -0.028172785, -0.08062388, 0.9804888...\n", " O\n", " 0\n", - " [ -0.028172785, -0.08062388, 0.9804888...\n", " \n", " \n", " 371473\n", " test\n", " 230\n", " 315\n", + " [1393, 1394): ','\n", " 117\n", " 0\n", " 1\n", " False\n", + " [1393, 1394): ','\n", + " 50346.0\n", + " 268.0\n", " O\n", " <NA>\n", + " [ 0.11817408, -0.07008513, 0.865484...\n", " O\n", " 0\n", - " [ 0.11817408, -0.07008513, 0.865484...\n", " \n", " \n", " 371474\n", " test\n", " 230\n", " 316\n", + " [1395, 1400): 'Bobby'\n", " 5545\n", " 0\n", " 1\n", " False\n", + " [1395, 1400): 'Bobby'\n", + " 50347.0\n", + " 269.0\n", " B\n", " PER\n", - " B-PER\n", - " 4\n", " [ -0.35689482, 0.31400457, 1.573853...\n", + " B-PER\n", + " 3\n", " \n", " \n", " 371475\n", " test\n", " 230\n", " 317\n", + " [1400, 1401): '.'\n", " 119\n", " 0\n", " 1\n", " False\n", + " [1400, 1401): '.'\n", + " 50348.0\n", + " 270.0\n", " O\n", " <NA>\n", + " [ -0.18957126, -0.24581163, 0.66257...\n", " O\n", " 0\n", - " [ -0.18957126, -0.24581163, 0.66257...\n", " \n", " \n", " 371476\n", " test\n", " 230\n", " 318\n", + " [0, 0): ''\n", " 102\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.44689128, -0.31665266, 0.779688...\n", " O\n", " 0\n", - " [ -0.44689128, -0.31665266, 0.779688...\n", " \n", " \n", "\n", - "

371477 rows × 12 columns

\n", + "

371477 rows × 16 columns

\n", "" ], "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 0 0 101 0 1 \n", - "1 train 0 1 118 0 1 \n", - "2 train 0 2 141 0 1 \n", - "3 train 0 3 9244 0 1 \n", - "4 train 0 4 9272 0 1 \n", - "... ... ... ... ... ... ... \n", - "371472 test 230 314 1711 0 1 \n", - "371473 test 230 315 117 0 1 \n", - "371474 test 230 316 5545 0 1 \n", - "371475 test 230 317 119 0 1 \n", - "371476 test 230 318 102 0 1 \n", + " fold doc_num token_id span input_id \\\n", + "0 train 0 0 [0, 0): '' 101 \n", + "1 train 0 1 [0, 1): '-' 118 \n", + "2 train 0 2 [1, 2): 'D' 141 \n", + "3 train 0 3 [2, 4): 'OC' 9244 \n", + "4 train 0 4 [4, 6): 'ST' 9272 \n", + "... ... ... ... ... ... \n", + "371472 test 230 314 [1386, 1393): 'brother' 1711 \n", + "371473 test 230 315 [1393, 1394): ',' 117 \n", + "371474 test 230 316 [1395, 1400): 'Bobby' 5545 \n", + "371475 test 230 317 [1400, 1401): '.' 119 \n", + "371476 test 230 318 [0, 0): '' 102 \n", + "\n", + " token_type_id attention_mask special_tokens_mask \\\n", + "0 0 1 True \n", + "1 0 1 False \n", + "2 0 1 False \n", + "3 0 1 False \n", + "4 0 1 False \n", + "... ... ... ... \n", + "371472 0 1 False \n", + "371473 0 1 False \n", + "371474 0 1 False \n", + "371475 0 1 False \n", + "371476 0 1 True \n", "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "... ... ... ... ... ... \n", - "371472 False O O 0 \n", - "371473 False O O 0 \n", - "371474 False B PER B-PER 4 \n", - "371475 False O O 0 \n", - "371476 True O O 0 \n", + " raw_span line_num raw_span_id ent_iob ent_type \\\n", + "0 NaN NaN NaN O \n", + "1 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "2 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "3 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "4 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "... ... ... ... ... ... \n", + "371472 [1386, 1393): 'brother' 50345.0 267.0 O \n", + "371473 [1393, 1394): ',' 50346.0 268.0 O \n", + "371474 [1395, 1400): 'Bobby' 50347.0 269.0 B PER \n", + "371475 [1400, 1401): '.' 50348.0 270.0 O \n", + "371476 NaN NaN NaN O \n", "\n", - " embedding \n", - "0 [ -0.098505184, -0.4050192, 0.7428884... \n", - "1 [ -0.057021223, -0.48112097, 0.989868... \n", - "2 [ -0.04824195, -0.25330004, 1.167191... \n", - "3 [ -0.26682988, -0.31008753, 1.007472... \n", - "4 [ -0.22296889, -0.21308492, 0.9331016... \n", - "... ... \n", - "371472 [ -0.028172785, -0.08062388, 0.9804888... \n", - "371473 [ 0.11817408, -0.07008513, 0.865484... \n", - "371474 [ -0.35689482, 0.31400457, 1.573853... \n", - "371475 [ -0.18957126, -0.24581163, 0.66257... \n", - "371476 [ -0.44689128, -0.31665266, 0.779688... \n", + " embedding token_class \\\n", + "0 [ -0.098505184, -0.4050192, 0.7428884... O \n", + "1 [ -0.057021223, -0.48112097, 0.989868... O \n", + "2 [ -0.04824195, -0.25330004, 1.167191... O \n", + "3 [ -0.26682988, -0.31008753, 1.007472... O \n", + "4 [ -0.22296889, -0.21308492, 0.9331016... O \n", + "... ... ... \n", + "371472 [ -0.028172785, -0.08062388, 0.9804888... O \n", + "371473 [ 0.11817408, -0.07008513, 0.865484... O \n", + "371474 [ -0.35689482, 0.31400457, 1.573853... B-PER \n", + "371475 [ -0.18957126, -0.24581163, 0.66257... O \n", + "371476 [ -0.44689128, -0.31665266, 0.779688... O \n", "\n", - "[371477 rows x 12 columns]" + " token_class_id \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "371472 0 \n", + "371473 0 \n", + "371474 3 \n", + "371475 0 \n", + "371476 0 \n", + "\n", + "[371477 rows x 16 columns]" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +842,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1019,15 +869,19 @@ " fold\n", " doc_num\n", " token_id\n", + " span\n", " input_id\n", " token_type_id\n", " attention_mask\n", " special_tokens_mask\n", + " raw_span\n", + " line_num\n", + " raw_span_id\n", " ent_iob\n", " ent_type\n", + " embedding\n", " token_class\n", " token_class_id\n", - " embedding\n", " \n", " \n", " \n", @@ -1036,75 +890,95 @@ " train\n", " 12\n", " 0\n", + " [0, 0): ''\n", " 101\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.101977676, -0.42442498, 0.8440171...\n", " O\n", " 0\n", - " [ -0.101977676, -0.42442498, 0.8440171...\n", " \n", " \n", " 1\n", " train\n", " 12\n", " 1\n", + " [0, 1): '-'\n", " 118\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.09124618, -0.47710702, 1.120292...\n", " O\n", " 0\n", - " [ -0.09124618, -0.47710702, 1.120292...\n", " \n", " \n", " 2\n", " train\n", " 12\n", " 2\n", + " [1, 2): 'D'\n", " 141\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.1695277, -0.27063507, 1.209566...\n", " O\n", " 0\n", - " [ -0.1695277, -0.27063507, 1.209566...\n", " \n", " \n", " 3\n", " train\n", " 12\n", " 3\n", + " [2, 4): 'OC'\n", " 9244\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.27648172, -0.3675844, 1.092024...\n", " O\n", " 0\n", - " [ -0.27648172, -0.3675844, 1.092024...\n", " \n", " \n", " 4\n", " train\n", " 12\n", " 4\n", + " [4, 6): 'ST'\n", " 9272\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.24050614, -0.24247544, 1.07511...\n", " O\n", " 0\n", - " [ -0.24050614, -0.24247544, 1.07511...\n", " \n", " \n", " ...\n", @@ -1120,131 +994,181 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 45059\n", " test\n", " 225\n", " 75\n", + " [208, 213): 'fight'\n", " 2147\n", " 0\n", " 1\n", " False\n", + " [208, 213): 'fight'\n", + " 49418.0\n", + " 29.0\n", " O\n", " <NA>\n", + " [ -0.09621397, -0.48016888, 0.510937...\n", " O\n", " 0\n", - " [ -0.09621397, -0.48016888, 0.510937...\n", " \n", " \n", " 45060\n", " test\n", " 225\n", " 76\n", + " [214, 216): 'on'\n", " 1113\n", " 0\n", " 1\n", " False\n", + " [214, 216): 'on'\n", + " 49419.0\n", + " 30.0\n", " O\n", " <NA>\n", + " [ -0.0858628, -0.2341724, 0.832928...\n", " O\n", " 0\n", - " [ -0.0858628, -0.2341724, 0.832928...\n", " \n", " \n", " 45061\n", " test\n", " 225\n", " 77\n", + " [217, 225): 'Saturday'\n", " 4306\n", " 0\n", " 1\n", " False\n", + " [217, 225): 'Saturday'\n", + " 49420.0\n", + " 31.0\n", " O\n", " <NA>\n", + " [ -0.012238501, -0.4282664, 0.619483...\n", " O\n", " 0\n", - " [ -0.012238501, -0.4282664, 0.619483...\n", " \n", " \n", " 45062\n", " test\n", " 225\n", " 78\n", + " [225, 226): '.'\n", " 119\n", " 0\n", " 1\n", " False\n", + " [225, 226): '.'\n", + " 49421.0\n", + " 32.0\n", " O\n", " <NA>\n", + " [ -0.042955935, -0.36315423, 0.660203...\n", " O\n", " 0\n", - " [ -0.042955935, -0.36315423, 0.660203...\n", " \n", " \n", " 45063\n", " 
test\n", " 225\n", " 79\n", + " [0, 0): ''\n", " 102\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.9504192, 0.012983555, 0.7374987...\n", " O\n", " 0\n", - " [ -0.9504192, 0.012983555, 0.7374987...\n", " \n", " \n", "\n", - "

45064 rows × 12 columns

\n", + "

45064 rows × 16 columns

\n", "" ], "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 12 0 101 0 1 \n", - "1 train 12 1 118 0 1 \n", - "2 train 12 2 141 0 1 \n", - "3 train 12 3 9244 0 1 \n", - "4 train 12 4 9272 0 1 \n", - "... ... ... ... ... ... ... \n", - "45059 test 225 75 2147 0 1 \n", - "45060 test 225 76 1113 0 1 \n", - "45061 test 225 77 4306 0 1 \n", - "45062 test 225 78 119 0 1 \n", - "45063 test 225 79 102 0 1 \n", + " fold doc_num token_id span input_id \\\n", + "0 train 12 0 [0, 0): '' 101 \n", + "1 train 12 1 [0, 1): '-' 118 \n", + "2 train 12 2 [1, 2): 'D' 141 \n", + "3 train 12 3 [2, 4): 'OC' 9244 \n", + "4 train 12 4 [4, 6): 'ST' 9272 \n", + "... ... ... ... ... ... \n", + "45059 test 225 75 [208, 213): 'fight' 2147 \n", + "45060 test 225 76 [214, 216): 'on' 1113 \n", + "45061 test 225 77 [217, 225): 'Saturday' 4306 \n", + "45062 test 225 78 [225, 226): '.' 119 \n", + "45063 test 225 79 [0, 0): '' 102 \n", + "\n", + " token_type_id attention_mask special_tokens_mask \\\n", + "0 0 1 True \n", + "1 0 1 False \n", + "2 0 1 False \n", + "3 0 1 False \n", + "4 0 1 False \n", + "... ... ... ... \n", + "45059 0 1 False \n", + "45060 0 1 False \n", + "45061 0 1 False \n", + "45062 0 1 False \n", + "45063 0 1 True \n", "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "... ... ... ... ... ... \n", - "45059 False O O 0 \n", - "45060 False O O 0 \n", - "45061 False O O 0 \n", - "45062 False O O 0 \n", - "45063 True O O 0 \n", + " raw_span line_num raw_span_id ent_iob ent_type \\\n", + "0 NaN NaN NaN O \n", + "1 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "2 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "3 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "4 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "... ... ... ... ... ... \n", + "45059 [208, 213): 'fight' 49418.0 29.0 O \n", + "45060 [214, 216): 'on' 49419.0 30.0 O \n", + "45061 [217, 225): 'Saturday' 49420.0 31.0 O \n", + "45062 [225, 226): '.' 49421.0 32.0 O \n", + "45063 NaN NaN NaN O \n", "\n", - " embedding \n", - "0 [ -0.101977676, -0.42442498, 0.8440171... \n", - "1 [ -0.09124618, -0.47710702, 1.120292... \n", - "2 [ -0.1695277, -0.27063507, 1.209566... \n", - "3 [ -0.27648172, -0.3675844, 1.092024... \n", - "4 [ -0.24050614, -0.24247544, 1.07511... \n", - "... ... \n", - "45059 [ -0.09621397, -0.48016888, 0.510937... \n", - "45060 [ -0.0858628, -0.2341724, 0.832928... \n", - "45061 [ -0.012238501, -0.4282664, 0.619483... \n", - "45062 [ -0.042955935, -0.36315423, 0.660203... \n", - "45063 [ -0.9504192, 0.012983555, 0.7374987... \n", + " embedding token_class \\\n", + "0 [ -0.101977676, -0.42442498, 0.8440171... O \n", + "1 [ -0.09124618, -0.47710702, 1.120292... O \n", + "2 [ -0.1695277, -0.27063507, 1.209566... O \n", + "3 [ -0.27648172, -0.3675844, 1.092024... O \n", + "4 [ -0.24050614, -0.24247544, 1.07511... O \n", + "... ... ... \n", + "45059 [ -0.09621397, -0.48016888, 0.510937... O \n", + "45060 [ -0.0858628, -0.2341724, 0.832928... O \n", + "45061 [ -0.012238501, -0.4282664, 0.619483... O \n", + "45062 [ -0.042955935, -0.36315423, 0.660203... O \n", + "45063 [ -0.9504192, 0.012983555, 0.7374987... O \n", "\n", - "[45064 rows x 12 columns]" + " token_class_id \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "45059 0 \n", + "45060 0 \n", + "45061 0 \n", + "45062 0 \n", + "45063 0 \n", + "\n", + "[45064 rows x 16 columns]" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1264,9 +1188,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-06-30 00:53:27,285\tINFO services.py:1272 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1288,22 +1219,22 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=643865.\n", + 
"\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n", "Model names after loading or training: 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64_2, 64_3, 64_4, 128_1, 128_2, 128_3, 128_4, 256_1, 256_2, 256_3, 256_4\n" ] @@ -1311,15 +1242,16 @@ ], "source": [ "import importlib\n", - "util = importlib.reload(util)\n", "import sklearn.linear_model\n", + "import ray\n", + "ray.init()\n", "\n", - "# Wrap util.train_reduced_model in a Ray task\n", + "# Wrap train_reduced_model in a Ray task\n", "@ray.remote\n", "def train_reduced_model_task(\n", " x_values: np.ndarray, y_values: np.ndarray, n_components: int,\n", " seed: int, max_iter: int = 10000) -> sklearn.base.BaseEstimator:\n", - " return util.train_reduced_model(x_values, y_values, n_components, seed, max_iter)\n", + " return cleaning.ensemble.train_reduced_model(x_values, y_values, n_components, seed, max_iter)\n", "\n", "# Ray task that trains a model using the entire embedding\n", "@ray.remote\n", @@ -1370,7 +1302,6 @@ " names_list.append(model_name)\n", " futures_list.append(train_reduced_model_task.remote(X_id, Y_id, \n", " num_dims, seed))\n", - " #models[model_name] = util.train_reduced_model(X, Y, num_dims, seed)\n", " \n", " # Block until all training tasks have completed and fetch the resulting models.\n", " models_list = ray.get(futures_list)\n", @@ -1400,7 +1331,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -1430,13 +1361,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "affbf755344e45cfb595819ca5a00614", + "model_id": "db850b403c7f4056b57362ca627295f0", "version_major": 2, "version_minor": 0 }, @@ -1446,6 +1377,86 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanent_typefolddoc_num
0[11, 16): 'Saudi'MISCtrain12
1[59, 65): 'MANAMA'LOCtrain12
2[86, 91): 'Saudi'MISCtrain12
3[259, 264): 'Saudi'MISCtrain12
0[55, 65): 'MONTGOMERY'LOCtrain20
\n", + "
" + ], + "text/plain": [ + " span ent_type fold doc_num\n", + "0 [11, 16): 'Saudi' MISC train 12\n", + "1 [59, 65): 'MANAMA' LOC train 12\n", + "2 [86, 91): 'Saudi' MISC train 12\n", + "3 [259, 264): 'Saudi' MISC train 12\n", + "0 [55, 65): 'MONTGOMERY' LOC train 20" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1465,46 +1476,219 @@ " todo = [(name, model) for name, model in models.items()]\n", " results = tp.jupyter.run_with_progress_bar(\n", " len(todo),\n", - " lambda i: util.analyze_model(test_df, int_to_label, todo[i][1], \n", - " bert_data, corpus_raw, expand_matches=True),\n", + " lambda i: cleaning.infer_and_extract_entities_iob(test_df,corpus_raw, int_to_label, todo[i][1]),\n", " \"model\"\n", " )\n", " return {t[0]: result for t, result in zip(todo, results)}\n", "\n", - "evals = eval_models(models, test_inputs_df)" + "evals = eval_models(models, test_inputs_df)\n", + "# display one of the results\n", + "evals[list(evals.keys())[0]].head()" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precisionrecallf1-scoredims
768_10.9471490.9388390.942976768
32_10.9240750.8637420.89289032
32_20.9247550.8753550.89937732
32_30.9250280.8660650.89457632
32_40.9329490.8761290.90364732
64_10.9400860.9029680.92115364
64_20.9383210.9029680.92030564
64_30.9368080.8952260.91554564
64_40.9408280.9027100.92137564
128_10.9444010.9249030.934550128
128_20.9475770.9236130.935442128
128_30.9432120.9215480.932254128
128_40.9409910.9218060.931300128
256_10.9492010.9354840.942293256
256_20.9433960.9290320.936159256
256_30.9454780.9308390.938101256
256_40.9450550.9321290.938547256
\n", + "
" + ], + "text/plain": [ + " precision recall f1-score dims\n", + "768_1 0.947149 0.938839 0.942976 768\n", + "32_1 0.924075 0.863742 0.892890 32\n", + "32_2 0.924755 0.875355 0.899377 32\n", + "32_3 0.925028 0.866065 0.894576 32\n", + "32_4 0.932949 0.876129 0.903647 32\n", + "64_1 0.940086 0.902968 0.921153 64\n", + "64_2 0.938321 0.902968 0.920305 64\n", + "64_3 0.936808 0.895226 0.915545 64\n", + "64_4 0.940828 0.902710 0.921375 64\n", + "128_1 0.944401 0.924903 0.934550 128\n", + "128_2 0.947577 0.923613 0.935442 128\n", + "128_3 0.943212 0.921548 0.932254 128\n", + "128_4 0.940991 0.921806 0.931300 128\n", + "256_1 0.949201 0.935484 0.942293 256\n", + "256_2 0.943396 0.929032 0.936159 256\n", + "256_3 0.945478 0.930839 0.938101 256\n", + "256_4 0.945055 0.932129 0.938547 256" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Summarize how each of the models does on the test set.\n", + "gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw,evals[list(evals.keys())[0]],label_col = 'ent_type')\n", "def make_summary_df(evals_df: pd.DataFrame) -> pd.DataFrame:\n", - " global_scores = [r[\"global_scores\"] for r in evals_df.values()]\n", - " return pd.DataFrame({\n", - " \"name\": list(evals_df.keys()),\n", - " \"dims\": pd.Series([n.split(\"_\")[0] for n in evals_df.keys()]).astype(int),\n", - " \"num_true_positives\": [r[\"num_true_positives\"] for r in global_scores],\n", - " \"num_entities\": [r[\"num_entities\"] for r in global_scores],\n", - " \"num_extracted\": [r[\"num_extracted\"] for r in global_scores],\n", - " \"precision\": [r[\"precision\"] for r in global_scores],\n", - " \"recall\": [r[\"recall\"] for r in global_scores],\n", - " \"F1\": [r[\"F1\"] for r in global_scores]\n", - " }).sort_values(\"dims\")\n", + " gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw, evals['256_4'], label_col = 'ent_type')\n", + " summary_df= cleaning.analysis.create_f1_report_ensemble_iob(evals,gold_elts)\n", + " summary_df['dims'] = [int(name.split('_')[0]) for name in evals.keys()]\n", + " return summary_df\n", "\n", - "summary_df = make_summary_df(evals)" + "summary_df = make_summary_df(evals)\n", + "summary_df" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAASAAAAEKCAYAAACytIjQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXbElEQVR4nO3dfZQX1X3H8ffHBcUYIygkR8EHNCiSGMVs1FRPNEaFeBKfalqpbdTaUFs1xsQ0cuJRQ9qjRvOktUZM0BqthhiK1NjgEyap9YFFjIiKIj7AaiIJPlRFlOXbP+YuDOvu8lt2Z+/ubz+vc36HmXtnZu/srB/vzPzmjiICM7McNsvdADMbuBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZVBZAkqZLelnSYx3US9LlkpZIelTSvqW6kyQ9nT4nVdVGM8uryh7QdcDETuo/C4xJn8nAVQCStgUuAPYH9gMukDSswnaaWSaVBVBE/AZY2ckiRwPXR+EBYKik7YEJwJ0RsTIiXgHupPMgM7N+alDGnz0SWFaaX57KOip/D0mTKXpPbLXVVh8fO3ZsNS01s07Nnz//jxExoqvr5QygbouIacA0gMbGxmhqasrcIrOBSdLzm7JezrtgzcCOpflRqayjcjOrMzkDaDbwxXQ37ADgtYh4CZgDHCFpWLr4fEQqM7M6U9kpmKSbgEOA4ZKWU9zZGgwQET8CbgeOBJYAbwGnpLqVkr4NzEubmhoRnV3MNrN+qrIAiohJG6kP4PQO6qYD06tol5n1Hf4mtJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlk2lASRpoqTFkpZIOred+p0l3S3pUUn3ShpVqmuR9Ej6zK6ynWaWx6CqNiypAbgSOBxYDsyTNDsiHi8tdhlwfUT8u6RDgYuAv0l1qyJin6raZ2b5VdkD2g9YEhFLI+Id4Gbg6DbLjAPuSdNz26k3szpWZQCNBJaV5pensrLfAcel6WOBrSVtl+aHSGqS9ICkY9r7AZImp2WaVqxY0YNNN7PekPsi9DnAwZIWAAcDzUBLqts5IhqBvwJ+IGm3titHxLSIaIyIxhEjRvRao82sZ1R2DYgiTHYszY9KZetExIukHpCk9wN/HhGvprrm9O9SSfcC44FnKmyvmfWyKgNoHjBG0miK4DmBojezjqThwMqIWAtMAaan8mHAWxGxOi1zIPCdCttqZiWzFjRz6ZzFvPjqKnYYuiVfn7AHx4xvewWl+yoLoIhYI+kMYA7QAEyPiEWSpgJNETEbOAS4SFIAvwFOT6vvCVwtaS3FaeLFbe6eWY166w/J6sesBc1MmbmQVe8WV0OaX13FlJkLAXr8b0cR0aMbzKWxsTGamppyN6NPafuHBLDl4AYuOm4vh5B16MCL76H51VXvKR85dEvuO/fQdteRND9ds+2S3BehrUKXzlm8QfgArHq3hUvnLM7UIusPXmwnfDor7w4HUB3rzT8kqx87DN2yS+Xd4QCqY+/bvKFL5WYAX5+wB1sO3vBvZMvBDXx9wh49/rOqvAtmmb31TkuXys1g/YXmfn0XbCA7b9ZCbnpwGS0RNEhM2n9H/vmYvXq9HR3dXqiP2w5WpWPGj+yVGxUOoB523qyF3PDAC+vmWyLWzecIIbO+zNeAethNDy7rUrnZQOYA6mEtHXyvqqPyKo3s4K5FR+Vmvc0BVMd6826G2abwNaA61pt3M8w2hQOozvXW3QyzTeFTsB427H2Du1RuNpA5gHrYBZ//CIMbtEHZ4AZxwec/kqlFZn2XT8F6mK+7mNXOAVQBX3cxq41PwcwsG/eASvrKM1xmA4UDKPEzXGa9z6dgyY2l8Kml3My6zwGUeOgKs97nADKzbBxAyVYdDFPaUbmZdZ8DKDl23/a/t9NRuZl1nwMomfvkii6Vm1n3OYASv8LGrPc5gJLefBeSmRUcQIlHDzTrff4mdOKn2M16nwOoxE+xm/Uun4KZWTYOIDPLZkCdgs1a0OxrPGZ9yIAJoFkLmpkycyGr3m0BoPnVVUyZuRDAIWSWyYA5Bbt0zuJ14dNq1bstXDpncaYWmVmlASRpoqTFkpZIOred+p0l3S3pUUn3ShpVqjtJ0tPpc1J32+JvOpv1PZUFkKQG4Ergs8A4YJKkcW0Wuwy4PiI+BkwFLkrrbgtcAOwP7AdcIGlYd9rjbzqb9T1V9oD2A5ZExNKIeAe4GTi6zTLjgHvS9NxS/QTgzohYGRGvAHcCE7vTmE+PHdGlcjOrXpUBNBJYVppfnsrKfgccl6aPBbaWtF2N6yJpsqQmSU0rVnT+1Lqfdjfre3JfhD4HOFjSAuBgoBlo6XyV9SJiWkQ0RkTjiBGd92R8Dcis76kpgCQdJOmUND1C0ugaVmsGdizNj0pl60TEixFxXESMB76Zyl6tZd2u8jUgs75nowEk6QLgG8CUVDQYuKGGbc8DxkgaLWlz4ARgdpttD5fU2oYpwPQ0PQc4QtKwdPH5iFS2yXwNyKzvqaUHdCxwFPAmFL0WYOuNrRQRa4AzKILjCWBGRCySNFXSUWmxQ4DFkp4CPgT8S1p3JfBtihCbB0xNZZvM14DM+p5avgn9TkSEpACQtFWtG4+I24Hb25SdX5q+Bbilg3Wns75H1G3NHVzr6ajczKpXSw9ohqSrgaGSvgTcBVxTbbN6XoPUpXIzq16nPSBJAn4GjAVeB/YAzo+IO3uhbT2qJdp/xWBH5WZWvU4DKJ163R4Re1F8GbDfGjl0y3ZPt0b6LphZNrWcgj0s6ROVt6RiHvPZrO+p5SL0/sCJkp6nuBMmis7RxyptWQ/zmM9mfU8tATSh8lb0Eo/5bNa3bPQULCKeB4YCn0+foanMzKxbavkm9FnAjcAH0+cGSWdW3TAzq3+1nIKdCuwfEW8CSLoEuB+4osqGmVn9q+UumNjwCfWWVGZm1i219ICuBR6U9J9p/hjgJ5W1yMwGjI0GUER8T9K9wEGp6JSIWFBpq8xsQNhoAEk6AFgUEQ+n+Q9I2j8iHqy8dWZW12q5BnQV8EZp/o1UZmbWLTVdhI5Y/8RmRKxlAL3Q0MyqU0sALZX0ZUmD0+csYGnVDTOz+ldLAJ0G/BnFmMzNFM+GTa6yUWY2MNRyF+xlivGczcx6VIc9IElfkjQmTUvSdEmvpdco79t7TTSzetXZKdhZwHNpehKwN7Ar8FXgh9U2y8wGgs4CaE1EvJumP0fxDvc/RcRdQM0D05uZdaSzAForaXtJQ4DPUAxG38rjmJpZt3V2Efp8oAloAGZHxCIASQfj2/Bm1gM6DKCIuE3SzsDWEfFKqaoJ+MvKW2ZmdW9jb8VYA7zSpuzNSltkZgNGLV9ENDOrhAPIzLLZpACSNLanG2JmA8+m9oDu6NFWmN
mA1OFFaEmXd1RF8ZoeM7Nu6ewu2CnA14DV7dRNqqY5ZjaQdBZA84DHIuJ/21ZIurCyFpnZgNFZAB0PvN1eRUSMrqY5ZjaQdHYR+v0R8VavtcTMBpzOAmhW64SkX1TfFDMbaDoLoPLbT3fdlI1LmihpsaQlks5tp34nSXMlLUgDnR2ZyneRtErSI+nzo035+WbWt3V2DSg6mK6JpAbgSuBwYDkwT9LsiHi8tNh5wIyIuErSOOB2YJdU90xE7NPVn2tm/UdnAbS3pNcpekJbpmnSfETEBzay7f2AJRGxFEDSzcDRQDmAAmjdzjbAi11sv5n1Y50Nx9HQzW2PBJaV5pdTvFGj7ELgDklnUoyyeFipbrSkBcDrwHkR8du2P0DSZNIbOnbaaaduNtfMelvuh1EnAddFxCjgSOCnkjYDXgJ2iojxFGNQ/4ek9/S4ImJaRDRGROOIESN6teFm1n1VBlAzsGNpflQqKzsVmAEQEfcDQ4DhEbE6Iv6UyucDzwC7V9hWM8ugygCaB4yRNFrS5hTvFpvdZpkXKMabRtKeFAG0QtKIdBEbSbsCY/AwsGZ1p7J3vEfEGklnAHMoxpWeHhGLJE0FmiJiNsWzZtdIOpvigvTJERGSPgVMlfQusBY4LSJWVtVWM8tDEV2+w94nNTY2RlNTU+5mmA1IkuZHRGNX18t9EdrMBjAHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZVNpAEmaKGmxpCWSzm2nfidJcyUtkPSopCNLdVPSeoslTaiynWaWx6CqNiypAbgSOBxYDsyTNDsiHi8tdh4wIyKukjQOuB3YJU2fAHwE2AG4S9LuEdFSVXvNrPdV2QPaD1gSEUsj4h3gZuDoNssE8IE0vQ3wYpo+Grg5IlZHxLPAkrQ9M6sjVQbQSGBZaX55Kiu7EPhrScspej9ndmFdJE2W1CSpacWKFT3VbjPrJbkvQk8CrouIUcCRwE8l1dymiJgWEY0R0ThixIjKGmlm1ajsGhDQDOxYmh+VyspOBSYCRMT9koYAw2tc18z6uSp7QPOAMZJGS9qc4qLy7DbLvAB8BkDSnsAQYEVa7gRJW0gaDYwBHqqwrWaWQWU9oIhYI+kMYA7QAEyPiEWSpgJNETEb+BpwjaSzKS5InxwRASySNAN4HFgDnO47YGb1R8V/7/1fY2NjNDU15W6G2YAkaX5ENHZ1vdwXoc1sAHMAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8um0gCSNFHSYklLJJ3bTv33JT2SPk9JerVU11Kqm11lO80sj0FVbVhSA3AlcDiwHJgnaXZEPN66TEScXVr+TGB8aROrImKfqtpnZvlV2QPaD1gSEUsj4h3gZuDoTpafBNxUYXvMrI+prAcEjASWleaXA/u3t6CknYHRwD2l4iGSmoA1wMURMaud9SYDk9PsG5IWA8OBP3a79X2T961/Ggj7tvOmrFxlAHXFCcAtEdFSKts5Ipol7QrcI2lhRDxTXikipgHTymWSmiKisfom9z7vW//kfetYladgzcCOpflRqaw9J9Dm9CsimtO/S4F72fD6kJnVgSoDaB4wRtJoSZtThMx77mZJGgsMA+4vlQ2TtEWaHg4cCDzedl0z698qOwWLiDWSzgDmAA3A9IhYJGkq0BQRrWF0AnBzRERp9T2BqyWtpQjJi8t3zzZi2sYX6be8b/2T960D2vC/ezOz3uNvQptZNg4gM8umbgJoY4999HWSdpQ0V9LjkhZJOiuVbyvpTklPp3+HpXJJujzt76OS9s27BxsnqUHSAkm3pfnRkh5M+/CzdLMCSVuk+SWpfpesDa+BpKGSbpH0pKQnJH2yXo6dpLPT3+Rjkm6SNKSnjl1dBFDpsY/PAuOASZLG5W1Vl60BvhYR44ADgNPTPpwL3B0RY4C70zwU+zomfSYDV/V+k7vsLOCJ0vwlwPcj4sPAK8CpqfxU4JVU/v20XF/3Q+BXETEW2JtiP/v9sZM0Evgy0BgRH6W4oXQCPXXsIqLff4BPAnNK81OAKbnb1c19upXiObrFwPapbHtgcZq+GphUWn7dcn3xQ/E9sLuBQ4HbAFF8g3ZQ22NIcef0k2l6UFpOufehk33bBni2bRvr4dix/omGbdOxuA2Y0FPHri56QLT/2MfITG3pttRtHQ88CHwoIl5KVb8HPpSm+9s+/wD4J2Btmt8OeDUi1qT5cvvX7Vuqfy0t31eNBlYA16ZTzB9L2oo6OHZRfCH4MuAF4CWKYzGfHjp29RJAdUPS+4FfAF+JiNfLdVH8b6XffW9C0ueAlyNifu62VGQQsC9wVUSMB95k/ekW0K+P3TCKh8hHAzsAWwETe2r79RJAXXnso8+SNJgifG6MiJmp+A+Stk/12wMvp/L+tM8HAkdJeo5iVIRDKa6ZDJXU+mXYcvvX7Vuq3wb4U282uIuWA8sj4sE0fwtFINXDsTsMeDYiVkTEu8BMiuPZI8euXgKopsc++jJJAn4CPBER3ytVzQZOStMnUVwbai3/YrqjcgDwWqm736dExJSIGBURu1Acm3si4kRgLnB8WqztvrXu8/Fp+T7be4iI3wPLJO2Rij5D8ehQvz92FKdeB0h6X/obbd23njl2uS9y9eDFsiOBp4BngG/mbs8mtP8gii76o8Aj6XMkxfnz3cDTwF3Atml5Udz5ewZYSHGXIvt+1LCfhwC3peldgYeAJcDPgS1S+ZA0vyTV75q73TXs1z5AUzp+syieb6yLYwd8C3gSeAz4KbBFTx07P4phZtnUyymYmfVDDiAzy8YBZGbZOIDMLBsHkJll4wDqJySFpO+W5s+RdGEPbfs6ScdvfMlu/5wvpCfF57Yp30XSqvQYwxOSHpJ0cqn+qBwjHEjaQdItvf1zB5K+8lYM27jVwHGSLoqIPvOKF0mDYv0zQRtzKvCliPifduqeieIxBlS8CWWmJEXEtVEM39vrXyyNiBdZ/2U7q4B7QP3HGorxd89uW9G2ByPpjfTvIZJ+LelWSUslXSzpxNTDWChpt9JmDpPUpOIV2Z9L6zdIulTSvDRuzd+XtvtbFa/Mfs9Y3ZImpe0/JumSVHY+xZctfyLp0s52NIo3oXyVYhgIJJ0s6V9L+3qVpAfSPh0iaXrqOV1XasMRku6X9LCkn6dn7JD0nKRvpfKFKl6KgKSDtf5V4AskbZ16Zo+l+iGSrk3rLJD06VLbZkr6lYpxf75T+
t1dl34HCyW957iZe0D9zZXAo61/5DXam2KQ/5XAUuDHEbGfigHPzgS+kpbbheJttrsBcyV9GPgixWMCn1DxlpL7JN2Rlt8X+GhEPFv+YZJ2oBgD5uMU48TcIemYiJgq6VDgnIhoqqHdDwNjO6gbRjEExFEUPaMDgb+jeP33PhTPZp0HHBYRb0r6BkWgTU3r/zEi9pX0j8A5ad1zgNMj4r4UVm+3+ZmnUzxTulcKrTsk7Z7q9qEYvWA1sFjSFcAHgZFRjKGDpKE17POA4x5QPxLF0/HXk3oGNZoXES9FxGqKr/63BshCitBpNSMi1kbE0xRBNRY4guKZpUcohgbZjmIQLYCH2oZP8gng3igeXlwD3Ah8qgvtbaVO6v4riq/wLwT+EBELI2ItsCjt0wEUA9Pdl9p+Ehu+ubP1Qd/5rP8d3Ad8T9KXgaHtnFYeBNwAEBFPAs8DrQF0d0S8FhFvU/QId6b4He4q6QpJE4HXsfdwD6j/+QFF7+DaUtka0v9MJG0GbF6qW12aXluaX8uGx7/tMzlBEQJnRsSccoWkQyiGnKjSeDYcPbGsvA9t928Q0ALcGRGTNrJ+S1qeiLhY0i8pnr+7T9IE3tsL6ki5DS0UA3W9ImlvisG7TgP+AvjbGrc3YLgH1M9ExEpgBuuHwAR4juKUB4rTksGbsOkvSNosXRfalWKUvjnAP6gYJgRJu6sYaKszDwEHSxquYqjcScCvu9IQFQOyXQZc0cV9aPUAcGA6jUTSVqXTpY5+5m6pJ3UJxegKbU//fgucmJbdHdiJ4nfU0faGA5tFxC8oTgf79LjPubgH1D99FzijNH8NcKuk3wG/YtN6Jy9QhMcHgNMi4m1JP6Y4RXlYkihG/Tums41ExEvplvlcih7ULyPi1s7WSXaTtIDiaer/Ay6PiOs2YT+IiBUqbuPflK5dQRECT3Wy2lfSheXWU7n/phhGtdW/AVdJWkjR4zw5IlYXv5Z2jaQYIbH1f/JTNmVf6p2fhjezbHwKZmbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNv8PJWDR/pTDR+kAAAAASUVORK5CYII=\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAASAAAAEGCAYAAADFdkirAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAYQ0lEQVR4nO3df5RdZX3v8feHZIKjgIMkZZGEmwBiaGzB0DFCpU2uWhK6bPihbUlZFaxXrldRrE1uybILNV0s6g22/ijXVargpbqkiGmM2DpgDNbSipkwhBBxQoxaMqF1LAy2OkJ+fO8f+znJyZCZnJk5+zxnzvm81pqVvZ+99znfPSf55Nl7n/1sRQRmZjkcl7sAM2tfDiAzy8YBZGbZOIDMLBsHkJllMz13AfUyc+bMmD9/fu4yzNrS1q1bfxwRs8a7XcsE0Pz58+nt7c1dhllbkvTDiWznQzAzy8YBZGbZOIDMLBsHkJll4wAys2xa5iqYmdXPhr4B1vX0s3domNldnaxetoDLFs2p+/s4gMzsCBv6BlizfjvD+w4AMDA0zJr12wHqHkI+BDOzI6zr6T8UPhXD+w6wrqe/7u/lHlCLa1RX2lrH3qHhcbVPhntALazSlR4YGiY43JXe0DeQuzRrYrO7OsfVPhkOoBbWyK60tY7VyxbQ2THtiLbOjmmsXrag7u/lQ7AW1siutLWOyiG6r4LZpMzu6mTgKGFTRlfaWstli+Y05FyhD8FaWCO70mYT4R5QC2tkV9psIhxALa5RXWmzifAhmJllU2oASVouqV/SLkk3HGX5PEmbJD0q6QFJc0csP0nSHkl/WWadZpZHaQEkaRpwK3AJsBBYKWnhiNVuAe6MiHOBtcDNI5b/KfCPZdVoZnmV2QNaDOyKiN0R8TxwF3DpiHUWAl9P05url0v6FeBU4L4SazSzjMoMoDnAk1Xze1JbtW3AFWn6cuBESadIOg74CLBqrDeQdK2kXkm9g4ODdSrbzBol90noVcASSX3AEmAAOAC8E/j7iNgz1sYRcVtEdEdE96xZ434iiJllVuZl+AHg9Kr5uantkIjYS+oBSToBeFNEDEm6EPg1Se8ETgBmSPqviHjBiWwzm7rKDKAtwNmSzqAIniuB36teQdJM4OmIOAisAW4HiIirqta5Buh2+EyMh+OwZlbaIVhE7AeuA3qAx4G7I2KHpLWSVqTVlgL9knZSnHC+qax62tGGvgFW37PtiOE4Vt+zzcNxWNNQROSuoS66u7vDT0Y90qK19/HMz/a9oP3kF3fQd+PFGSqyViVpa0R0j3e73CehrURHC5+x2s0azQFkZtk4gFpYV2fHuNrNGs0B1MI+uOKVdBynI9o6jhMfXPHKTBWZHcnDcbQwjwdkzc4B1OI8HpA1MwdQi/MXEa2ZOYBaWCMfsWs2ET4J3cL8XDBrdu4BlaBZDnv8XDBrdu4B1VkzPQ65kY/YNZsIB1CdNdNhj58LZs3Oh2B11kyHPf4ekDU7B1Cddb2446g3e3a9OM/tD/4ekDUzH4LV2Wijm7TIqCdmdeUAqrNnh48+1MVo7WbtzAFUZ77yZFY7B1Cd+cqTWe18ErrOfOXJrHYOoBL4ypNZbXwIZmbZOIDMLBsfgpWgWW5GNWt2DqA68xg8ZrXzIVidNdPNqGbNzgFUZ810M6pZs3MA1dlLR3nm1mjtZu3MAVRn0vjazdqZA6jOhkZ57vpo7WbtzAFUZ74Z1ax2pQaQpOWS+iXtknTDUZbPk7RJ0qOSHpA0t6r9YUmPSNoh6R1l1llPvhnVrHalfQ9I0jTgVuA3gD3AFkkbI+I7VavdAtwZEf9P0uuAm4HfB54CLoyI5ySdADyWtt1bVr314ptRzWpX5hcRFwO7ImI3gKS7gEuB6gBaCLwvTW8GNgBExPNV6xzPFDtU9M2oZrUp8x/2HODJqvk9qa3aNuCKNH05cKKkUwAknS7p0fQaHz5a70fStZJ6JfUODg7WfQfMrFy5exargCWS+oAlwABwACAinoyIc4GXA1dLOnXkxhFxW0R0R0T3rFmzGlm3mdVBmQE0AJxeNT83tR0SEXsj4oqIWAS8P7UNjVwHeAz4tRJrNbMMygygLcDZks6QNAO4EthYvYKkmZIqNawBbk/tcyV1pumTgYsA30xl1mJKC6CI2A9cB/QAjwN3R8QOSWslrUirLQX6Je0ETgVuSu2/CDwkaRvwDeCWiNheVq1mloeiRR5Y1d3dHb29vbnLMGtLkrZGRPd4t8t9EtrM2pgDyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCybmgJI0kWS3pqmZ0k6o9yyzKwdHDOAJH0A+GOKJ5cCdACfLbMoM2sPtfSALgdWAD+
FQ89qP7HMosysPUyvYZ3nIyIkBYCkl5RcUzYb+gZY19PP3qFhZnd1snrZAi5bNCd3WWYtq5YAulvSXwFdkt4O/AHw1+WW1Xgb+gZYfc829h0oHlU9MDTM6nu2ATiEzEoy5iGYJAF/C9wDfBFYANwYEZ9oQG0N9aEv7zgUPhX7DgQf+vKOTBWZtb4xe0Dp0OvvI+KXgfsbVFMWz/xs37jazWzyajkJ/bCkV5deiZm1nVrOAb0GuErSDymuhImic3RuqZU1WFdnB0PDL+ztdHV2ZKjGrD3U0gNaBpwFvA74LeCN6c9jkrRcUr+kXZJuOMryeZI2SXpU0gOS5qb2V0n6F0k70rLfrX2XJuaN5502rnYzm7xjBlBE/BDoogid3wK6UtuYJE0DbgUuARYCKyUtHLHaLcCdqTe1Frg5tf8MeEtEvBJYDnxUUlctOzRRm787OK52M5u8Wr4JfT3wOeAX0s9nJb27htdeDOyKiN0R8TxwF3DpiHUWAl9P05sryyNiZ0Q8kab3Aj8CZtXwnhO2d2h4XO1mNnm1HIK9DXhNRNwYETcCFwBvr2G7OcCTVfN7Ulu1bcAVafpy4ERJp1SvIGkxMAP43sg3kHStpF5JvYODk+upzO7qHFe7mU1eLQEk4EDV/IHUVg+rgCWS+oAlwED1e0k6Dfgb4K0RcXDkxhFxW0R0R0T3rFmT6yCtXraAzo5pR7R1dkxj9bIFk3pdMxtdLVfB7gAekvR3af4y4NM1bDcAnF41Pze1HZIOr64AkHQC8KaIGErzJwFfAd4fEd+q4f0mpfJtZ9+KYdY4iohjrySdD1yUZr8ZEX01bDMd2Am8niJ4tgC/FxE7qtaZCTwdEQcl3QQciIgbJc0A/gH4ckR8tJYd6e7ujt7e3lpWNbM6k7Q1IrrHu90xe0CSLgB2RMTDaf4kSa+JiIfG2i4i9ku6DugBpgG3R8QOSWuB3ojYCCwFbk43uv4j8K60+e8Avw6cIuma1HZNRDwy3h00s+Z1zB5QOj9zfqQVJR1HESDnN6C+mrkHZJbPRHtANZ2EjqqUSieDazl3ZGY2ploCaLek90jqSD/XA7vLLszMWl8tAfQO4FcpTiQPUNwbdm2ZRZlZezjmoVRE/Ai4sgG1mFmbGbUHJOntks5O05J0u6Rn082hTXUC2symprEOwa4HfpCmVwLnAWcC7wM+Vm5ZZtYOxgqg/RFRGSDnjRR3rf9HRHwNaNmB6c2sccYKoIOSTpP0IopvM3+tapnv0DSzSRvrJPSNQC/Ft5g3Vm6hkLQEX4Y3szoYNYAi4l5J84ATI+KZqkW9QOkjFObg54KZNdaxnoqxH3hmRNtPS60okw19A6xZv53hfcVoIANDw6xZvx3wc8HMylLLFxHbwrqe/kPhUzG87wDrevozVWTW+hxAiYdkNWu8CQWQpHPqXUhuHpLVrPEm2gO6r65VNAEPyWrWeKOehJb08dEWUTymp6V4SFazxhvrKthbgT8CnjvKspXllJPXZYvmOHDMGmisANoCPBYR/zxygaQPllaRmbWNsQLozcDPj7YgIs4opxwzaydjnYQ+ISJ+1rBKzKztjBVAGyoTkr5Yfilm1m7GCqDqp5+eWXYhZtZ+xgqgGGXazKwuxjoJfZ6kn1D0hDrTNGk+IuKk0qszs5Y21nAc00ZbZmZWD74Z1cyycQCZWTYOIDPLxgFkZtk4gMwsm1IDSNJySf2Sdkm64SjL50nalJ62+oCkuVXLvippSNK9ZdZoZvmUFkCSpgG3ApcAC4GVkhaOWO0WigcengusBW6uWrYO+P2y6jOz/MrsAS0GdkXE7oh4HrgLuHTEOguBr6fpzdXLI2IT8J8l1mdmmZUZQHOAJ6vm96S2atuAK9L05cCJkk6p9Q0kXSupV1Lv4ODgpIo1s8bLfRJ6FbBEUh+wBBgADoy9yWERcVtEdEdE96xZs8qq0cxKMuaDCSdpADi9an5uajskIvaSekCSTgDeFBFDJdZkZk2kzB7QFuBsSWdImgFcCWysXkHSTEmVGtYAt5dYj5k1mdICKD3W+TqgB3gcuDsidkhaK2lFWm0p0C9pJ3AqcFNle0nfBL4AvF7SHknLyqrVzPJQRGsM9dPd3R29vb25yzBrS5K2RkT3eLfLfRLazNqYA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsyhwTuuls6BtgXU8/e4eGmd3VyeplC7hs0cgHdZhZo7RNAG3oG2DN+u0M7yseujEwNMya9dsBHEJmmbTNIdi6nv5D4VMxvO8A63r6M1VkZm0TQHuHhsfVbmbla5sAmt3VOa52Mytf2wTQ6mUL6OyYdkRbZ8c0Vi9bkKkiM2ubk9CVE82+CmbWPNomgKAIIQeOWfNom0MwM2s+DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWTakBJGm5pH5JuyTdcJTl8yRtkvSopAckza1adrWkJ9LP1WXWaWZ5lBZAkqYBtwKXAAuBlZIWjljtFuDOiDgXWAvcnLZ9GfAB4DXAYuADkk4uq1Yzy6PMb0IvBnZFxG4ASXcBlwLfqVpnIfC+NL0Z2JCmlwH3R8TTadv7geXA5ydTkAckM2suZR6CzQGerJrfk9qqbQOuSNOXAydKOqXGbZF0raReSb2Dg4NjFlMZkGxgaJjg8IBkG/oGxrVTZlY/uU9CrwKWSOoDlgADwIGxNzksIm6LiO6I6J41a9aY63pAMrPmU+Yh2ABwetX83NR2SETsJfWAJJ0AvCkihiQNAEtHbPvAZIrxgGRmzafMHtAW4GxJZ0iaAVwJbKxeQdJMSZUa1gC3p+ke4GJJJ6eTzxentgnzgGRmzae0AIqI/cB1FMHxOHB3ROyQtFbSirTaUqBf0k7gVOCmtO3TwJ9ShNgWYG3lhPREeUAys+ajiMhdQ110d3dHb2/vmOv4KphZOSRtjYju8W7nAcnMLJvcV8HMrI05gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll01b3gvlmVLPm0jYBVBmStTIqYmVIVsAhZJZJ2xyCeUhWs+bTNgHkIVnNmk/bBJCHZDVrPm0TQB6S1az5tM1J6MqJZl8FM2sebdMDMrPm0zY9IF+GN2s+bdMD8mV4s+bTNgHky/BmzadtAsiX4c2aT9sEkC/DmzWftjkJ7cvwZs2nbQII/GRUs2bTNodgZtZ8HEBmlo0DyMyycQCZWTYOIDPLRhGRu4a6kDQI/BCYCfw4czll8b5NTe2wb/MiYtZ4N26ZAKqQ1BsR3bnrKIP3bWryvo3Oh2Bmlo0DyMyyacUAui13ASXyvk1N3rdRtNw5IDObOlqxB2RmU4QDyMyyaZkAkrRcUr+kXZJuyF3PeEk6XdJmSd+RtEPS9an9ZZLul/RE+vPk1C5JH0/7+6ik8/PuwbFJmiapT9K9af4MSQ+lffhbSTNS+/FpfldaPj9r4TWQ1CXpHknflfS4pAtb5bOT9Ifp7+
Rjkj4v6UX1+uxaIoAkTQNuBS4BFgIrJS3MW9W47Qf+KCIWAhcA70r7cAOwKSLOBjaleSj29ez0cy3wycaXPG7XA49XzX8Y+IuIeDnwDPC21P424JnU/hdpvWb3MeCrEXEOcB7Ffk75z07SHOA9QHdE/BIwDbiSen12ETHlf4ALgZ6q+TXAmtx1TXKfvgT8BtAPnJbaTgP60/RfASur1j+0XjP+AHMp/hG+DrgXEMU3aKeP/AyBHuDCND09rafc+zDGvr0U+P7IGlvhswPmAE8CL0ufxb3Asnp9di3RA+LwL6liT2qbklK3dRHwEHBqRDyVFv0bcGqanmr7/FHgfwMH0/wpwFBE7E/z1fUf2re0/Nm0frM6AxgE7kiHmJ+S9BJa4LOLiAHgFuBfgacoPout1Omza5UAahmSTgC+CLw3In5SvSyK/1am3PcmJL0R+FFEbM1dS0mmA+cDn4yIRcBPOXy4BUzpz+5k4FKKkJ0NvARYXq/Xb5UAGgBOr5qfm9qmFEkdFOHzuYhYn5r/XdJpaflpwI9S+1Ta59cCKyT9ALiL4jDsY0CXpMqwwNX1H9q3tPylwH80suBx2gPsiYiH0vw9FIHUCp/dG4DvR8RgROwD1lN8nnX57FolgLYAZ6cz8zMoTpJtzFzTuEgS8Gng8Yj486pFG4Gr0/TVFOeGKu1vSVdULgCereruN5WIWBMRcyNiPsVn8/WIuArYDLw5rTZy3yr7/Oa0ftP2HiLi34AnJVUesfJ64Du0wGdHceh1gaQXp7+jlX2rz2eX+yRXHU+W/SawE/ge8P7c9Uyg/osouuiPAo+kn9+kOH7eBDwBfA14WVpfFFf+vgdsp7hKkX0/atjPpcC9afpM4NvALuALwPGp/UVpfldafmbuumvYr1cBvenz2wCc3CqfHfAh4LvAY8DfAMfX67PzrRhmlk2rHIKZ2RTkADKzbBxAZpaNA8jMsnEAmVk2DqApQlJI+kjV/CpJH6zTa39G0puPveak3+e3053im0e0z5c0nG5jeFzStyVdU7V8RY4RDiTNlnRPo9+3nUw/9irWJJ4DrpB0c0Q0zSNeJE2Pw/cEHcvbgLdHxD8dZdn3oriNAUlnAuslKSLuiIiNZPhiaUTs5fCX7awE7gFNHfspxt/9w5ELRvZgJP1X+nOppG9I+pKk3ZL+TNJVqYexXdJZVS/zBkm9kname7cq4/esk7QljVvzP6te95uSNlJ8K3ZkPSvT6z8m6cOp7UaKL1t+WtK6sXY0InYD76MYBgJJ10j6y6p9/aSkb6V9Wirp9tRz+kxVDRdL+hdJD0v6QrrHDkk/kPSh1L5d0jmpfYmkR9JPn6QTU8/ssbT8RZLuSNv0SfrvVbWtl/RVFeP+/J+q391n0u9gu6QXfG7mHtBUcyvwaOUveY3OA34ReBrYDXwqIharGPDs3cB703rzgcXAWcBmSS8H3kJxm8CrJR0PPCjpvrT++cAvRcT3q99M0myKMWB+hWKcmPskXRYRayW9DlgVEb011P0wcM4oy06mGAJiBUXP6LXA/wC2SHoVxb1ZfwK8ISJ+KumPKQJtbdr+xxFxvqR3AqvStquAd0XEgymsfj7iPd9FcU/pL6fQuk/SK9KyV1GMXvAc0C/pE8AvAHOiGEMHSV017HPbcQ9oConi7vg7ST2DGm2JiKci4jmKr/5XAmQ7RehU3B0RByPiCYqgOge4mOKepUcohgY5hWIQLYBvjwyf5NXAA1HcvLgf+Bzw6+Oot0JjLPtyFF/h3w78e0Rsj4iDwI60TxdQDEz3YKr9amBe1faVG323cvh38CDw55LeA3Qd5bDyIuCzABHxXYqn8FYCaFNEPBsRP6foEc6j+B2eKekTkpYDP8FewD2gqeejFL2DO6ra9pP+M5F0HDCjatlzVdMHq+YPcuTnP/KenKAIgXdHRE/1AklLKYacKNMijhw9sVr1Pozcv+nAAeD+iFh5jO0PpPWJiD+T9BWK++8elLSMF/aCRlNdwwGKgbqekXQexeBd7wB+B/iDGl+vbbgHNMVExNPA3RweAhPgBxSHPFAclnRM4KV/W9Jx6bzQmRSj9PUA/0vFMCFIeoWKgbbG8m1giaSZKobKXQl8YzyFqBiQ7RbgE+Pch4pvAa9Nh5FIeknV4dJo73lW6kl9mGJ0hZGHf98ErkrrvgL4bxS/o9FebyZwXER8keJwsKnHfc7FPaCp6SPAdVXzfw18SdI24KtMrHfyrxThcRLwjoj4uaRPURyiPCxJFKP+XTbWi0TEU+mS+WaKHtRXIuJLY22TnCWpj+Ju6v8EPh4Rn5nAfhARgyou438+nbuCIgR2jrHZe9OJ5cqh3D9QDKNa8X+BT0raTtHjvCYinit+LUc1h2KExMp/8msmsi+tznfDm1k2PgQzs2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCyb/w/cTs4P76EQxgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -1518,13 +1702,12 @@ "source": [ "# Plot the tradeoff between dimensionality and F1 score\n", "x = summary_df[\"dims\"]\n", - "y = summary_df[\"F1\"]\n", + "y = summary_df[\"f1-score\"]\n", "\n", "plt.figure(figsize=(4,4))\n", "plt.scatter(x, y)\n", "#plt.yscale(\"log\")\n", "#plt.xscale(\"log\")\n", - "plt.ylim([0.75, 1.0])\n", "plt.xlabel(\"Number of Dimensions\")\n", "plt.ylabel(\"F1 Score\")\n", "\n", @@ -1544,23 +1727,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6dcf56c7d5c942c48750c96a6d1edcf2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1582,149 +1751,65 @@ " \n", " \n", " \n", - " doc_num\n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " 768_1\n", - " 32_1\n", - " 32_2\n", - " 32_3\n", - " ...\n", - " 64_4\n", - " 128_1\n", - " 128_2\n", - " 128_3\n", - " 128_4\n", - " 256_1\n", - " 256_2\n", - " 256_3\n", - " 256_4\n", - " num_models\n", + " in_gold\n", + " count\n", + " models\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 1\n", - " 0\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 2\n", - " 0\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " True\n", - " True\n", - " True\n", - " False\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 14\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 3\n", - " 0\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", - " True\n", - " True\n", - " True\n", - " False\n", - " False\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 4\n", - " 0\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", - " True\n", - " False\n", - " True\n", - " False\n", - " True\n", - " ...\n", - " False\n", - " True\n", - " True\n", - " False\n", - " False\n", - " False\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " False\n", - " False\n", - " 9\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", " ...\n", @@ -1735,202 +1820,105 @@ " ...\n", " ...\n", " ...\n", 
- " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 13\n", - " 139\n", - " test\n", - " 225\n", - " [45, 48): 'IBF'\n", + " 374\n", + " dev\n", + " 149\n", + " [81, 93): 'Major League'\n", " MISC\n", - " False\n", - " False\n", - " False\n", - " True\n", - " False\n", - " ...\n", - " False\n", " True\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 2\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 14\n", - " 139\n", - " test\n", - " 225\n", - " [11, 17): 'BOXING'\n", - " LOC\n", - " False\n", - " False\n", - " False\n", - " False\n", + " 246\n", + " dev\n", + " 120\n", + " [63, 70): 'English'\n", + " MISC\n", " True\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 15\n", - " 139\n", - " test\n", - " 225\n", - " [86, 104): 'German Axel Schulz'\n", + " 78\n", + " dev\n", + " 64\n", + " [2571, 2575): 'AIDS'\n", " MISC\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 1\n", + " True\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 16\n", - " 139\n", - " test\n", - " 225\n", - " [19, 25): 'SCHULZ'\n", - " LOC\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " 3\n", + " dev\n", + " 21\n", + " [86, 90): 'UEFA'\n", + " ORG\n", " True\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 17\n", - " 139\n", - " test\n", - " 225\n", - " [145, 158): 'International'\n", + " 0\n", + " dev\n", + " 21\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " True\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", "\n", - "

4927 rows × 24 columns

\n", + "

4928 rows × 7 columns

\n", "" ], "text/plain": [ - " doc_num fold doc_offset span ent_type \\\n", - "0 0 train 12 [11, 16): 'Saudi' MISC \n", - "1 0 train 12 [59, 65): 'MANAMA' LOC \n", - "2 0 train 12 [86, 91): 'Saudi' MISC \n", - "3 0 train 12 [259, 264): 'Saudi' MISC \n", - "4 0 train 12 [403, 412): 'One-month' MISC \n", - ".. ... ... ... ... ... \n", - "13 139 test 225 [45, 48): 'IBF' MISC \n", - "14 139 test 225 [11, 17): 'BOXING' LOC \n", - "15 139 test 225 [86, 104): 'German Axel Schulz' MISC \n", - "16 139 test 225 [19, 25): 'SCHULZ' LOC \n", - "17 139 test 225 [145, 158): 'International' ORG \n", - "\n", - " gold 768_1 32_1 32_2 32_3 ... 64_4 128_1 128_2 128_3 128_4 \\\n", - "0 True True True True True ... True True True True True \n", - "1 True True True True True ... True True True True True \n", - "2 True True True True False ... True True True True True \n", - "3 True True True False False ... True True True True True \n", - "4 True False True False True ... False True True False False \n", - ".. ... ... ... ... ... ... ... ... ... ... ... \n", - "13 False False False True False ... False True False False False \n", - "14 False False False False True ... False False False False False \n", - "15 False False False False False ... False False False False False \n", - "16 False False False False False ... False False False False False \n", - "17 False False False False False ... False False False False False \n", + " fold doc_num span ent_type in_gold count \\\n", + "4927 train 907 [590, 598): 'Gorleben' LOC True 17 \n", + "4925 train 907 [63, 67): 'BONN' LOC True 17 \n", + "4924 train 907 [11, 17): 'German' MISC True 17 \n", + "4923 train 896 [523, 528): 'China' LOC True 17 \n", + "4922 train 896 [512, 518): 'Mexico' LOC True 17 \n", + "... ... ... ... ... ... ... \n", + "374 dev 149 [81, 93): 'Major League' MISC True 0 \n", + "246 dev 120 [63, 70): 'English' MISC True 0 \n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True 0 \n", + "3 dev 21 [86, 90): 'UEFA' ORG True 0 \n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True 0 \n", "\n", - " 256_1 256_2 256_3 256_4 num_models \n", - "0 True True True True 17 \n", - "1 True True True True 17 \n", - "2 True True True True 14 \n", - "3 True True True True 13 \n", - "4 False True False False 9 \n", - ".. ... ... ... ... ... \n", - "13 False False False False 2 \n", - "14 False False False False 1 \n", - "15 False False False False 1 \n", - "16 False False False True 1 \n", - "17 False False False True 1 \n", + " models \n", + "4927 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4925 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4924 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4923 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4922 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "... ... 
\n", + "374 [GOLD] \n", + "246 [GOLD] \n", + "78 [GOLD] \n", + "3 [GOLD] \n", + "0 [GOLD] \n", "\n", - "[4927 rows x 24 columns]" + "[4928 rows x 7 columns]" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "full_results = util.merge_model_results(evals)\n", + "full_results = cleaning.flag_suspicious_labels(evals,'ent_type','ent_type',label_name='ent_type',gold_feats=gold_elts,align_over_cols=['fold','doc_num','span'],keep_cols=[],split_doc=False)\n", "full_results" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1955,58 +1943,58 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", " \n", " \n", - " 1\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", " 17\n", " \n", " \n", - " 2\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " 14\n", + " 17\n", " \n", " \n", - " 3\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", " \n", " \n", - " 4\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " 9\n", + " 17\n", " \n", " \n", " ...\n", @@ -2018,99 +2006,86 @@ " ...\n", " \n", " \n", - " 13\n", - " test\n", - " 225\n", - " [45, 48): 'IBF'\n", + " 374\n", + " dev\n", + " 149\n", + " [81, 93): 'Major League'\n", " MISC\n", - " False\n", - " 2\n", + " True\n", + " 0\n", " \n", " \n", - " 14\n", - " test\n", - " 225\n", - " [11, 17): 'BOXING'\n", - " LOC\n", - " False\n", - " 1\n", + " 246\n", + " dev\n", + " 120\n", + " [63, 70): 'English'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", - " 15\n", - " test\n", - " 225\n", - " [86, 104): 'German Axel Schulz'\n", + " 78\n", + " dev\n", + " 64\n", + " [2571, 2575): 'AIDS'\n", " MISC\n", - " False\n", - " 1\n", + " True\n", + " 0\n", " \n", " \n", - " 16\n", - " test\n", - " 225\n", - " [19, 25): 'SCHULZ'\n", - " LOC\n", - " False\n", - " 1\n", + " 3\n", + " dev\n", + " 21\n", + " [86, 90): 'UEFA'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", - " 17\n", - " test\n", - " 225\n", - " [145, 158): 'International'\n", + " 0\n", + " dev\n", + " 21\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", - " False\n", - " 1\n", + " True\n", + " 0\n", " \n", " \n", "\n", - "

4927 rows × 6 columns

\n", + "

4928 rows × 6 columns

\n", "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 train 12 [11, 16): 'Saudi' MISC True \n", - "1 train 12 [59, 65): 'MANAMA' LOC True \n", - "2 train 12 [86, 91): 'Saudi' MISC True \n", - "3 train 12 [259, 264): 'Saudi' MISC True \n", - "4 train 12 [403, 412): 'One-month' MISC True \n", - ".. ... ... ... ... ... \n", - "13 test 225 [45, 48): 'IBF' MISC False \n", - "14 test 225 [11, 17): 'BOXING' LOC False \n", - "15 test 225 [86, 104): 'German Axel Schulz' MISC False \n", - "16 test 225 [19, 25): 'SCHULZ' LOC False \n", - "17 test 225 [145, 158): 'International' ORG False \n", - "\n", - " num_models \n", - "0 17 \n", - "1 17 \n", - "2 14 \n", - "3 13 \n", - "4 9 \n", - ".. ... \n", - "13 2 \n", - "14 1 \n", - "15 1 \n", - "16 1 \n", - "17 1 \n", + " fold doc_num span ent_type in_gold count\n", + "4927 train 907 [590, 598): 'Gorleben' LOC True 17\n", + "4925 train 907 [63, 67): 'BONN' LOC True 17\n", + "4924 train 907 [11, 17): 'German' MISC True 17\n", + "4923 train 896 [523, 528): 'China' LOC True 17\n", + "4922 train 896 [512, 518): 'Mexico' LOC True 17\n", + "... ... ... ... ... ... ...\n", + "374 dev 149 [81, 93): 'Major League' MISC True 0\n", + "246 dev 120 [63, 70): 'English' MISC True 0\n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True 0\n", + "3 dev 21 [86, 90): 'UEFA' ORG True 0\n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True 0\n", "\n", - "[4927 rows x 6 columns]" + "[4928 rows x 6 columns]" ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop Boolean columns for now\n", - "results = full_results[[\"fold\", \"doc_offset\", \"span\", \"ent_type\", \"gold\", \"num_models\"]]\n", + "results = full_results[[\"fold\", \"doc_num\", \"span\", \"ent_type\", \"in_gold\", \"count\"]]\n", "results" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2134,10 +2109,10 @@ " \n", " \n", " \n", - " count\n", + " num_ents\n", " \n", " \n", - " num_models\n", + " count\n", " \n", " \n", " \n", @@ -2188,11 +2163,11 @@ " \n", " \n", " 11\n", - " 42\n", + " 41\n", " \n", " \n", " 12\n", - " 47\n", + " 48\n", " \n", " \n", " 13\n", @@ -2208,53 +2183,53 @@ " \n", " \n", " 16\n", - " 247\n", + " 248\n", " \n", " \n", " 17\n", - " 2941\n", + " 2940\n", " \n", " \n", "\n", "" ], "text/plain": [ - " count\n", - "num_models \n", - "0 115\n", - "1 31\n", - "2 23\n", - "3 20\n", - "4 17\n", - "5 18\n", - "6 23\n", - "7 23\n", - "8 19\n", - "9 29\n", - "10 28\n", - "11 42\n", - "12 47\n", - "13 62\n", - "14 75\n", - "15 115\n", - "16 247\n", - "17 2941" + " num_ents\n", + "count \n", + "0 115\n", + "1 31\n", + "2 23\n", + "3 20\n", + "4 17\n", + "5 18\n", + "6 23\n", + "7 23\n", + "8 19\n", + "9 29\n", + "10 28\n", + "11 41\n", + "12 48\n", + "13 62\n", + "14 75\n", + "15 115\n", + "16 248\n", + "17 2940" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "(results[results[\"gold\"] == True][[\"num_models\", \"span\"]]\n", - " .groupby(\"num_models\").count()\n", - " .rename(columns={\"span\": \"count\"}))" + "(results[results[\"in_gold\"] == True][[\"count\", \"span\"]]\n", + " .groupby(\"count\").count()\n", + " .rename(columns={\"span\": \"num_ents\"}))" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2278,10 +2253,10 @@ " \n", " \n", " \n", - " count\n", + " num_ents\n", " \n", " \n", - " num_models\n", + " 
count\n", " \n", " \n", " \n", @@ -2292,7 +2267,7 @@ " \n", " \n", " 2\n", - " 173\n", + " 174\n", " \n", " \n", " 3\n", @@ -2304,11 +2279,11 @@ " \n", " \n", " 5\n", - " 51\n", + " 52\n", " \n", " \n", " 6\n", - " 27\n", + " 26\n", " \n", " \n", " 7\n", @@ -2320,19 +2295,19 @@ " \n", " \n", " 9\n", - " 18\n", + " 17\n", " \n", " \n", " 10\n", - " 11\n", + " 12\n", " \n", " \n", " 11\n", - " 10\n", + " 9\n", " \n", " \n", " 12\n", - " 8\n", + " 9\n", " \n", " \n", " 13\n", @@ -2359,41 +2334,41 @@ "" ], "text/plain": [ - " count\n", - "num_models \n", - "1 468\n", - "2 173\n", - "3 94\n", - "4 61\n", - "5 51\n", - "6 27\n", - "7 36\n", - "8 16\n", - "9 18\n", - "10 11\n", - "11 10\n", - "12 8\n", - "13 8\n", - "14 11\n", - "15 14\n", - "16 15\n", - "17 31" + " num_ents\n", + "count \n", + "1 468\n", + "2 174\n", + "3 94\n", + "4 61\n", + "5 52\n", + "6 26\n", + "7 36\n", + "8 16\n", + "9 17\n", + "10 12\n", + "11 9\n", + "12 9\n", + "13 8\n", + "14 11\n", + "15 14\n", + "16 15\n", + "17 31" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "(results[results[\"gold\"] == False][[\"num_models\", \"span\"]]\n", - " .groupby(\"num_models\").count()\n", - " .rename(columns={\"span\": \"count\"}))" + "(results[results[\"in_gold\"] == False][[\"count\", \"span\"]]\n", + " .groupby(\"count\").count()\n", + " .rename(columns={\"span\": \"num_ents\"}))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -2418,34 +2393,34 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 3\n", " dev\n", " 21\n", - " [25, 39): 'STANDARD LIEGE'\n", + " [86, 90): 'UEFA'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 0\n", " dev\n", " 21\n", - " [86, 90): 'UEFA'\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 18\n", + " 78\n", " dev\n", " 64\n", " [2571, 2575): 'AIDS'\n", @@ -2454,7 +2429,7 @@ " 0\n", " \n", " \n", - " 2\n", + " 246\n", " dev\n", " 120\n", " [63, 70): 'English'\n", @@ -2463,7 +2438,7 @@ " 0\n", " \n", " \n", - " 2\n", + " 374\n", " dev\n", " 149\n", " [81, 93): 'Major League'\n", @@ -2472,25 +2447,25 @@ " 0\n", " \n", " \n", - " 19\n", + " 498\n", " dev\n", " 182\n", - " [662, 670): 'division'\n", - " MISC\n", + " [2173, 2177): 'Ruch'\n", + " ORG\n", " True\n", " 0\n", " \n", " \n", - " 46\n", + " 462\n", " dev\n", " 182\n", - " [2173, 2177): 'Ruch'\n", - " ORG\n", + " [662, 670): 'division'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 6\n", + " 512\n", " dev\n", " 203\n", " [879, 881): '90'\n", @@ -2499,106 +2474,106 @@ " 0\n", " \n", " \n", - " 5\n", + " 622\n", " dev\n", " 214\n", - " [187, 202): 'Michael Collins'\n", + " [1689, 1705): 'Schindler's List'\n", " MISC\n", " True\n", " 0\n", " \n", " \n", - " 7\n", + " 621\n", " dev\n", " 214\n", - " [285, 305): 'Venice Film Festival'\n", - " MISC\n", + " [1643, 1648): 'Oscar'\n", + " PER\n", " True\n", " 0\n", " \n", " \n", - " 33\n", + " 583\n", " dev\n", " 214\n", - " [1643, 1648): 'Oscar'\n", - " PER\n", + " [285, 305): 'Venice Film Festival'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 34\n", + " 569\n", " dev\n", " 214\n", - " [1689, 1705): 'Schindler's List'\n", + " [187, 202): 'Michael Collins'\n", " MISC\n", " True\n", " 0\n", " \n", " \n", - " 1\n", + " 802\n", " test\n", " 15\n", - " [32, 43): 'WEST 
INDIES'\n", - " LOC\n", + " [44, 56): 'WORLD SERIES'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 801\n", " test\n", " 15\n", - " [44, 56): 'WORLD SERIES'\n", - " MISC\n", + " [32, 43): 'WEST INDIES'\n", + " LOC\n", " True\n", " 0\n", " \n", " \n", - " 0\n", + " 942\n", " test\n", " 21\n", - " [22, 38): 'WORLD GRAND PRIX'\n", - " MISC\n", + " [719, 725): 'Wijaya'\n", + " PER\n", " True\n", " 0\n", " \n", " \n", - " 34\n", + " 896\n", " test\n", " 21\n", - " [719, 725): 'Wijaya'\n", - " PER\n", + " [22, 38): 'WORLD GRAND PRIX'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 1057\n", " test\n", " 23\n", - " [94, 109): 'National Hockey'\n", + " [1117, 1127): 'NY RANGERS'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 3\n", + " 1052\n", " test\n", " 23\n", - " [110, 116): 'League'\n", + " [1106, 1113): 'TORONTO'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 10\n", + " 1025\n", " test\n", " 23\n", - " [427, 435): 'ATLANTIC'\n", - " LOC\n", + " [673, 689): 'CENTRAL DIVISION'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 16\n", + " 1016\n", " test\n", " 23\n", " [599, 611): 'NY ISLANDERS'\n", @@ -2611,59 +2586,59 @@ "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True \n", - "2 dev 21 [86, 90): 'UEFA' ORG True \n", - "18 dev 64 [2571, 2575): 'AIDS' MISC True \n", - "2 dev 120 [63, 70): 'English' MISC True \n", - "2 dev 149 [81, 93): 'Major League' MISC True \n", - "19 dev 182 [662, 670): 'division' MISC True \n", - "46 dev 182 [2173, 2177): 'Ruch' ORG True \n", - "6 dev 203 [879, 881): '90' LOC True \n", - "5 dev 214 [187, 202): 'Michael Collins' MISC True \n", - "7 dev 214 [285, 305): 'Venice Film Festival' MISC True \n", - "33 dev 214 [1643, 1648): 'Oscar' PER True \n", - "34 dev 214 [1689, 1705): 'Schindler's List' MISC True \n", - "1 test 15 [32, 43): 'WEST INDIES' LOC True \n", - "2 test 15 [44, 56): 'WORLD SERIES' MISC True \n", - "0 test 21 [22, 38): 'WORLD GRAND PRIX' MISC True \n", - "34 test 21 [719, 725): 'Wijaya' PER True \n", - "2 test 23 [94, 109): 'National Hockey' ORG True \n", - "3 test 23 [110, 116): 'League' ORG True \n", - "10 test 23 [427, 435): 'ATLANTIC' LOC True \n", - "16 test 23 [599, 611): 'NY ISLANDERS' ORG True \n", + " fold doc_num span ent_type in_gold \\\n", + "3 dev 21 [86, 90): 'UEFA' ORG True \n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True \n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True \n", + "246 dev 120 [63, 70): 'English' MISC True \n", + "374 dev 149 [81, 93): 'Major League' MISC True \n", + "498 dev 182 [2173, 2177): 'Ruch' ORG True \n", + "462 dev 182 [662, 670): 'division' MISC True \n", + "512 dev 203 [879, 881): '90' LOC True \n", + "622 dev 214 [1689, 1705): 'Schindler's List' MISC True \n", + "621 dev 214 [1643, 1648): 'Oscar' PER True \n", + "583 dev 214 [285, 305): 'Venice Film Festival' MISC True \n", + "569 dev 214 [187, 202): 'Michael Collins' MISC True \n", + "802 test 15 [44, 56): 'WORLD SERIES' MISC True \n", + "801 test 15 [32, 43): 'WEST INDIES' LOC True \n", + "942 test 21 [719, 725): 'Wijaya' PER True \n", + "896 test 21 [22, 38): 'WORLD GRAND PRIX' MISC True \n", + "1057 test 23 [1117, 1127): 'NY RANGERS' ORG True \n", + "1052 test 23 [1106, 1113): 'TORONTO' ORG True \n", + "1025 test 23 [673, 689): 'CENTRAL DIVISION' MISC True \n", + "1016 test 23 [599, 611): 'NY ISLANDERS' ORG True \n", "\n", - " num_models \n", - "0 0 \n", - "2 0 \n", - "18 0 \n", - "2 0 \n", - "2 0 \n", - "19 0 \n", - "46 0 \n", - "6 0 \n", - "5 0 
\n", - "7 0 \n", - "33 0 \n", - "34 0 \n", - "1 0 \n", - "2 0 \n", - "0 0 \n", - "34 0 \n", - "2 0 \n", - "3 0 \n", - "10 0 \n", - "16 0 " + " count \n", + "3 0 \n", + "0 0 \n", + "78 0 \n", + "246 0 \n", + "374 0 \n", + "498 0 \n", + "462 0 \n", + "512 0 \n", + "622 0 \n", + "621 0 \n", + "583 0 \n", + "569 0 \n", + "802 0 \n", + "801 0 \n", + "942 0 \n", + "896 0 \n", + "1057 0 \n", + "1052 0 \n", + "1025 0 \n", + "1016 0 " ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pull out some hard-to-find examples, sorting by document to make labeling easier\n", - "hard_to_get = results[results[\"gold\"]].sort_values([\"num_models\", \"fold\", \"doc_offset\"]).head(20)\n", + "hard_to_get = results[results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"]).head(20)\n", "hard_to_get" ] }, @@ -2676,7 +2651,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -2701,16 +2676,16 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 35\n", + " 373\n", " dev\n", " 149\n", " [81, 102): 'Major League Baseball'\n", @@ -2719,7 +2694,7 @@ " 17\n", " \n", " \n", - " 52\n", + " 570\n", " dev\n", " 214\n", " [187, 202): 'Michael Collins'\n", @@ -2728,7 +2703,7 @@ " 17\n", " \n", " \n", - " 47\n", + " 983\n", " test\n", " 23\n", " [94, 116): 'National Hockey League'\n", @@ -2737,25 +2712,25 @@ " 17\n", " \n", " \n", - " 43\n", + " 1110\n", " test\n", " 25\n", - " [823, 835): 'Philadelphia'\n", - " ORG\n", + " [856, 864): 'NFC East'\n", + " MISC\n", " False\n", " 17\n", " \n", " \n", - " 44\n", + " 1109\n", " test\n", " 25\n", - " [856, 864): 'NFC East'\n", - " MISC\n", + " [823, 835): 'Philadelphia'\n", + " ORG\n", " False\n", " 17\n", " \n", " \n", - " 25\n", + " 1184\n", " test\n", " 41\n", " [674, 688): 'Sporting Gijon'\n", @@ -2764,7 +2739,7 @@ " 17\n", " \n", " \n", - " 8\n", + " 1323\n", " test\n", " 114\n", " [51, 61): 'sales-USDA'\n", @@ -2773,25 +2748,25 @@ " 17\n", " \n", " \n", - " 13\n", + " 1367\n", " test\n", " 118\n", - " [535, 550): 'mid-Mississippi'\n", + " [776, 791): 'mid-Mississippi'\n", " LOC\n", " False\n", " 17\n", " \n", " \n", - " 15\n", + " 1362\n", " test\n", " 118\n", - " [776, 791): 'mid-Mississippi'\n", + " [535, 550): 'mid-Mississippi'\n", " LOC\n", " False\n", " 17\n", " \n", " \n", - " 53\n", + " 1509\n", " test\n", " 178\n", " [1787, 1800): 'Uruguay Round'\n", @@ -2800,25 +2775,25 @@ " 17\n", " \n", " \n", - " 31\n", + " 1560\n", " test\n", " 180\n", - " [259, 263): 'BILO'\n", + " [588, 592): 'BILO'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 32\n", + " 1558\n", " test\n", " 180\n", - " [286, 293): 'Malysia'\n", + " [579, 583): 'TOPS'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 34\n", + " 1550\n", " test\n", " 180\n", " [395, 399): 'BILO'\n", @@ -2827,25 +2802,25 @@ " 17\n", " \n", " \n", - " 36\n", + " 1544\n", " test\n", " 180\n", - " [579, 583): 'TOPS'\n", + " [286, 293): 'Malysia'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 37\n", + " 1542\n", " test\n", " 180\n", - " [588, 592): 'BILO'\n", + " [259, 263): 'BILO'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 96\n", + " 1649\n", " test\n", " 207\n", " [1041, 1047): 'Oxford'\n", @@ -2854,7 +2829,7 @@ " 17\n", " \n", " \n", - " 25\n", + " 1786\n", " test\n", " 219\n", " [368, 381): 'Koo Jeon Woon'\n", @@ -2863,25 +2838,25 
@@ " 17\n", " \n", " \n", - " 52\n", + " 1807\n", " test\n", " 222\n", - " [92, 114): 'National Hockey League'\n", + " [218, 225): 'EASTERN'\n", " MISC\n", " False\n", " 17\n", " \n", " \n", - " 53\n", + " 1805\n", " test\n", " 222\n", - " [218, 225): 'EASTERN'\n", + " [92, 114): 'National Hockey League'\n", " MISC\n", " False\n", " 17\n", " \n", " \n", - " 230\n", + " 2054\n", " train\n", " 48\n", " [885, 899): 'Sjeng Schalken'\n", @@ -2894,59 +2869,59 @@ "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "35 dev 149 [81, 102): 'Major League Baseball' MISC False \n", - "52 dev 214 [187, 202): 'Michael Collins' PER False \n", - "47 test 23 [94, 116): 'National Hockey League' MISC False \n", - "43 test 25 [823, 835): 'Philadelphia' ORG False \n", - "44 test 25 [856, 864): 'NFC East' MISC False \n", - "25 test 41 [674, 688): 'Sporting Gijon' ORG False \n", - "8 test 114 [51, 61): 'sales-USDA' ORG False \n", - "13 test 118 [535, 550): 'mid-Mississippi' LOC False \n", - "15 test 118 [776, 791): 'mid-Mississippi' LOC False \n", - "53 test 178 [1787, 1800): 'Uruguay Round' MISC False \n", - "31 test 180 [259, 263): 'BILO' ORG False \n", - "32 test 180 [286, 293): 'Malysia' ORG False \n", - "34 test 180 [395, 399): 'BILO' ORG False \n", - "36 test 180 [579, 583): 'TOPS' ORG False \n", - "37 test 180 [588, 592): 'BILO' ORG False \n", - "96 test 207 [1041, 1047): 'Oxford' ORG False \n", - "25 test 219 [368, 381): 'Koo Jeon Woon' PER False \n", - "52 test 222 [92, 114): 'National Hockey League' MISC False \n", - "53 test 222 [218, 225): 'EASTERN' MISC False \n", - "230 train 48 [885, 899): 'Sjeng Schalken' ORG False \n", + " fold doc_num span ent_type in_gold \\\n", + "373 dev 149 [81, 102): 'Major League Baseball' MISC False \n", + "570 dev 214 [187, 202): 'Michael Collins' PER False \n", + "983 test 23 [94, 116): 'National Hockey League' MISC False \n", + "1110 test 25 [856, 864): 'NFC East' MISC False \n", + "1109 test 25 [823, 835): 'Philadelphia' ORG False \n", + "1184 test 41 [674, 688): 'Sporting Gijon' ORG False \n", + "1323 test 114 [51, 61): 'sales-USDA' ORG False \n", + "1367 test 118 [776, 791): 'mid-Mississippi' LOC False \n", + "1362 test 118 [535, 550): 'mid-Mississippi' LOC False \n", + "1509 test 178 [1787, 1800): 'Uruguay Round' MISC False \n", + "1560 test 180 [588, 592): 'BILO' ORG False \n", + "1558 test 180 [579, 583): 'TOPS' ORG False \n", + "1550 test 180 [395, 399): 'BILO' ORG False \n", + "1544 test 180 [286, 293): 'Malysia' ORG False \n", + "1542 test 180 [259, 263): 'BILO' ORG False \n", + "1649 test 207 [1041, 1047): 'Oxford' ORG False \n", + "1786 test 219 [368, 381): 'Koo Jeon Woon' PER False \n", + "1807 test 222 [218, 225): 'EASTERN' MISC False \n", + "1805 test 222 [92, 114): 'National Hockey League' MISC False \n", + "2054 train 48 [885, 899): 'Sjeng Schalken' ORG False \n", "\n", - " num_models \n", - "35 17 \n", - "52 17 \n", - "47 17 \n", - "43 17 \n", - "44 17 \n", - "25 17 \n", - "8 17 \n", - "13 17 \n", - "15 17 \n", - "53 17 \n", - "31 17 \n", - "32 17 \n", - "34 17 \n", - "36 17 \n", - "37 17 \n", - "96 17 \n", - "25 17 \n", - "52 17 \n", - "53 17 \n", - "230 17 " + " count \n", + "373 17 \n", + "570 17 \n", + "983 17 \n", + "1110 17 \n", + "1109 17 \n", + "1184 17 \n", + "1323 17 \n", + "1367 17 \n", + "1362 17 \n", + "1509 17 \n", + "1560 17 \n", + "1558 17 \n", + "1550 17 \n", + "1544 17 \n", + "1542 17 \n", + "1649 17 \n", + "1786 17 \n", + "1807 17 \n", + "1805 17 \n", + "2054 17 " ] }, - "execution_count": 21, + "execution_count": 
19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hardest results not in the gold standard for models to avoid\n", - "hard_to_avoid = results[~results[\"gold\"]].sort_values([\"num_models\", \"fold\", \"doc_offset\"], ascending=[False, True, True]).head(20)\n", + "hard_to_avoid = results[~results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"], ascending=[False, True, True]).head(20)\n", "hard_to_avoid" ] }, @@ -2969,7 +2944,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2994,29 +2969,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training 
model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aa0c4f31711d4ebebabc172823af60c4", + "model_id": "01da45dd78b4496da2463cbd219c07a9", "version_major": 2, "version_minor": 0 }, @@ -3027,20 +3002,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3ebe24b3d5b34dc5838c6d42fdaabe9e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3064,29 +3025,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with 
n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be8b3f55fef141a9a6c003cc8e7bfc4c", + "model_id": "cb3412eb0b714433a507d2610a885409", "version_major": 2, "version_minor": 0 }, @@ -3097,20 +3058,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "707100b8c15846c0a46a2dced9644de9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3134,29 +3081,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and 
seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c4f19b309a724640a23d0f261b7c40cf", + "model_id": "f4df45a4607b4dcba42ea31c2a6cd13e", "version_major": 2, "version_minor": 0 }, @@ -3167,20 +3114,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5b8a9fdfc1e247d69b46e7d736509cfd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3201,51 +3134,37 @@ "Training model '128_3' (#3 at 128 dimensions) with seed 839748\n", "Training model '128_4' (#4 at 128 dimensions) with seed 450385\n", "Training model '256_1' (#1 at 256 dimensions) with seed 781567\n", - "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", - "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", - "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=526478.\n", - 
"\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=822761.\n", - "Trained 17 models.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1f077611cbc14f9ba4007f7d94edd4ce", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…" - ] - }, - "metadata": {}, - "output_type": "display_data" + "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", + "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", + "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "Trained 17 models.\n" + ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8e3d4523d19a405db777e4fb25d481aa", + "model_id": "b9e94c8d6456418284358fa0264359f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" + "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…" ] }, "metadata": {}, @@ -3274,29 +3193,29 @@ "Training model '256_2' (#2 at 256 
dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "db4060ae0b2044c3b63224e5810ae14e", + "model_id": "40d1bd81bc0c4a52a93b9c72baa2e8e7", 
"version_major": 2, "version_minor": 0 }, @@ -3307,20 +3226,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c1ed240bfef348aaa605df2175954a79", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3344,29 +3249,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=839748.\n", + 
"\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3f7afaeaec744aa2a09ae158f43b363e", + "model_id": "0c4b799c0fc34aa19593b7496b7dafa5", "version_major": 2, "version_minor": 0 }, @@ -3377,20 +3282,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d5d6db92f1fd4f49bf1fed41df60d803", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3414,29 +3305,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=402414.\n", + 
"\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be3f25acbfbb4a778af688e75c761cfe", + "model_id": "3c72920296f44cdca60a5b22951c409d", "version_major": 2, "version_minor": 0 }, @@ -3447,20 +3338,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "58bc95e025ae454eb7d5e8e230c36628", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3484,29 +3361,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + 
"\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0a548fe09cb64771b3e30662428db940", + "model_id": "9ce8252670bb4302bd9d3a021474493a", "version_major": 2, "version_minor": 0 }, @@ -3517,20 +3394,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8a17fea4c2234d018b60c8a36612248f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3554,29 +3417,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=402414.\n", - 
"\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bdd6e0d863bc457ba79e180359026d1a", + "model_id": "d30ee772e8f94e4b8d5dd8626169bb69", "version_major": 2, "version_minor": 0 }, @@ -3587,20 +3450,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fc9817ab93d24828a877d8e324ec2e48", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3626,9 +3475,14 @@ " _models = maybe_train_models(_train_inputs_df, fold_ix)\n", " _evals = eval_models(_models, _test_inputs_df)\n", " _summary_df = make_summary_df(_evals)\n", - " _full_results = util.merge_model_results(_evals)\n", - " _results = _full_results[[\"fold\", \"doc_offset\", \"span\", \n", - " \"ent_type\", \"gold\", \"num_models\"]]\n", + " _gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw,_evals[list(evals.keys())[0]])\n", + " _full_results = cleaning.flag_suspicious_labels(_evals,'ent_type','ent_type',\n", + " label_name='ent_type',\n", + " gold_feats=_gold_elts,\n", + " 
align_over_cols=['fold','doc_num','span'],\n", + " keep_cols=[],split_doc=False)\n", + " _results = _full_results[[\"fold\", \"doc_num\", \"span\", \n", + " \"ent_type\", \"in_gold\", \"count\"]]\n", " return {\n", " \"models\": _models,\n", " \"summary_df\": _summary_df,\n", @@ -3655,7 +3509,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -3680,58 +3534,58 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", " \n", " \n", - " 1\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", " 17\n", " \n", " \n", - " 2\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " 14\n", + " 17\n", " \n", " \n", - " 3\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", " \n", " \n", - " 4\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " 9\n", + " 17\n", " \n", " \n", " ...\n", @@ -3743,86 +3597,86 @@ " ...\n", " \n", " \n", - " 13\n", - " test\n", - " 216\n", - " [20, 29): 'SHEFFIELD'\n", - " PER\n", - " False\n", - " 12\n", + " 271\n", + " dev\n", + " 93\n", + " [469, 481): 'JAKARTA POST'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", - " 14\n", - " test\n", - " 216\n", - " [127, 143): 'Sheffield Shield'\n", + " 183\n", + " dev\n", + " 76\n", + " [1285, 1312): 'Chicago Purchasing Managers'\n", " ORG\n", - " False\n", - " 3\n", + " True\n", + " 0\n", " \n", " \n", - " 15\n", - " test\n", - " 216\n", - " [166, 174): 'Tasmania'\n", - " LOC\n", - " False\n", - " 14\n", + " 126\n", + " dev\n", + " 49\n", + " [1920, 1925): 'Tajik'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", - " 16\n", - " test\n", - " 216\n", - " [179, 187): 'Victoria'\n", - " LOC\n", - " False\n", - " 14\n", + " 25\n", + " dev\n", + " 15\n", + " [109, 133): 'National Football League'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", " 17\n", - " test\n", - " 216\n", - " [20, 29): 'SHEFFIELD'\n", - " LOC\n", - " False\n", - " 1\n", + " dev\n", + " 15\n", + " [15, 40): 'AMERICAN FOOTBALL-RANDALL'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", "\n", - "
-    "<p>44800 rows × 6 columns</p>\n",
\n", + "
+    "<p>44802 rows × 6 columns</p>\n",
\n", "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 train 12 [11, 16): 'Saudi' MISC True \n", - "1 train 12 [59, 65): 'MANAMA' LOC True \n", - "2 train 12 [86, 91): 'Saudi' MISC True \n", - "3 train 12 [259, 264): 'Saudi' MISC True \n", - "4 train 12 [403, 412): 'One-month' MISC True \n", - ".. ... ... ... ... ... \n", - "13 test 216 [20, 29): 'SHEFFIELD' PER False \n", - "14 test 216 [127, 143): 'Sheffield Shield' ORG False \n", - "15 test 216 [166, 174): 'Tasmania' LOC False \n", - "16 test 216 [179, 187): 'Victoria' LOC False \n", - "17 test 216 [20, 29): 'SHEFFIELD' LOC False \n", + " fold doc_num span ent_type \\\n", + "4927 train 907 [590, 598): 'Gorleben' LOC \n", + "4925 train 907 [63, 67): 'BONN' LOC \n", + "4924 train 907 [11, 17): 'German' MISC \n", + "4923 train 896 [523, 528): 'China' LOC \n", + "4922 train 896 [512, 518): 'Mexico' LOC \n", + "... ... ... ... ... \n", + "271 dev 93 [469, 481): 'JAKARTA POST' ORG \n", + "183 dev 76 [1285, 1312): 'Chicago Purchasing Managers' ORG \n", + "126 dev 49 [1920, 1925): 'Tajik' MISC \n", + "25 dev 15 [109, 133): 'National Football League' ORG \n", + "17 dev 15 [15, 40): 'AMERICAN FOOTBALL-RANDALL' MISC \n", "\n", - " num_models \n", - "0 17 \n", - "1 17 \n", - "2 14 \n", - "3 13 \n", - "4 9 \n", - ".. ... \n", - "13 12 \n", - "14 3 \n", - "15 14 \n", - "16 14 \n", - "17 1 \n", + " in_gold count \n", + "4927 True 17 \n", + "4925 True 17 \n", + "4924 True 17 \n", + "4923 True 17 \n", + "4922 True 17 \n", + "... ... ... \n", + "271 True 0 \n", + "183 True 0 \n", + "126 True 0 \n", + "25 True 0 \n", + "17 True 0 \n", "\n", - "[44800 rows x 6 columns]" + "[44802 rows x 6 columns]" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -3842,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -3866,7 +3720,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " corpus_span\n", @@ -3882,12 +3736,12 @@ " \n", " \n", " \n", - " 0\n", + " 30\n", " 0\n", " dev\n", " 2\n", - " [25, 30): 'ASHES'\n", - " MISC\n", + " [760, 765): 'Leeds'\n", + " ORG\n", " \n", " \n", " \n", @@ -3897,12 +3751,12 @@ " \n", " \n", " \n", - " 3\n", + " 21\n", " 0\n", " dev\n", " 2\n", - " [87, 92): 'Ashes'\n", - " MISC\n", + " [614, 634): 'Duke of Norfolk's XI'\n", + " ORG\n", " \n", " \n", " \n", @@ -3927,12 +3781,12 @@ " \n", " \n", " \n", - " 13\n", + " 3\n", " 0\n", " dev\n", " 2\n", - " [614, 634): 'Duke of Norfolk's XI'\n", - " ORG\n", + " [87, 92): 'Ashes'\n", + " MISC\n", " \n", " \n", " \n", @@ -3942,12 +3796,12 @@ " \n", " \n", " \n", - " 19\n", + " 0\n", " 0\n", " dev\n", " 2\n", - " [760, 765): 'Leeds'\n", - " ORG\n", + " [25, 30): 'ASHES'\n", + " MISC\n", " \n", " \n", " \n", @@ -3972,12 +3826,12 @@ " ...\n", " \n", " \n", - " 21\n", + " 1738\n", " 17\n", " test\n", " 230\n", - " [1108, 1115): 'Germany'\n", - " LOC\n", + " [230, 238): 'Charlton'\n", + " PER\n", " \n", " \n", " \n", @@ -3987,12 +3841,12 @@ " \n", " \n", " \n", - " 23\n", + " 1737\n", " 17\n", " test\n", " 230\n", - " [1153, 1160): 'England'\n", - " LOC\n", + " [177, 187): 'Englishman'\n", + " MISC\n", " \n", " \n", " \n", @@ -4002,12 +3856,12 @@ " \n", " \n", " \n", - " 24\n", + " 1736\n", " 17\n", " test\n", " 230\n", - " [1213, 1225): 'Leeds United'\n", - " ORG\n", + " [135, 142): 'Ireland'\n", + " LOC\n", " \n", " \n", " \n", @@ -4017,12 +3871,12 @@ " \n", " \n", " \n", - " 25\n", + " 1735\n", " 17\n", " 
test\n", " 230\n", - " [1252, 1259): 'England'\n", - " LOC\n", + " [87, 100): 'Jack Charlton'\n", + " PER\n", " \n", " \n", " \n", @@ -4032,12 +3886,12 @@ " \n", " \n", " \n", - " 27\n", + " 1734\n", " 17\n", " test\n", " 230\n", - " [1395, 1400): 'Bobby'\n", - " PER\n", + " [69, 75): 'DUBLIN'\n", + " LOC\n", " \n", " \n", " \n", @@ -4052,49 +3906,49 @@ "" ], "text/plain": [ - " num_models fold doc_offset corpus_span \\\n", - "0 0 dev 2 [25, 30): 'ASHES' \n", - "3 0 dev 2 [87, 92): 'Ashes' \n", - "5 0 dev 2 [189, 218): 'Test and County Cricket Board' \n", - "13 0 dev 2 [614, 634): 'Duke of Norfolk's XI' \n", - "19 0 dev 2 [760, 765): 'Leeds' \n", - ".. ... ... ... ... \n", - "21 17 test 230 [1108, 1115): 'Germany' \n", - "23 17 test 230 [1153, 1160): 'England' \n", - "24 17 test 230 [1213, 1225): 'Leeds United' \n", - "25 17 test 230 [1252, 1259): 'England' \n", - "27 17 test 230 [1395, 1400): 'Bobby' \n", + " count fold doc_offset corpus_span \\\n", + "30 0 dev 2 [760, 765): 'Leeds' \n", + "21 0 dev 2 [614, 634): 'Duke of Norfolk's XI' \n", + "5 0 dev 2 [189, 218): 'Test and County Cricket Board' \n", + "3 0 dev 2 [87, 92): 'Ashes' \n", + "0 0 dev 2 [25, 30): 'ASHES' \n", + "... ... ... ... ... \n", + "1738 17 test 230 [230, 238): 'Charlton' \n", + "1737 17 test 230 [177, 187): 'Englishman' \n", + "1736 17 test 230 [135, 142): 'Ireland' \n", + "1735 17 test 230 [87, 100): 'Jack Charlton' \n", + "1734 17 test 230 [69, 75): 'DUBLIN' \n", "\n", - " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", - "0 MISC \n", - "3 MISC \n", - "5 ORG \n", - "13 ORG \n", - "19 ORG \n", - ".. ... ... ... ... ... \n", - "21 LOC \n", - "23 LOC \n", - "24 ORG \n", - "25 LOC \n", - "27 PER \n", + " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", + "30 ORG \n", + "21 ORG \n", + "5 ORG \n", + "3 MISC \n", + "0 MISC \n", + "... ... ... ... ... ... \n", + "1738 PER \n", + "1737 MISC \n", + "1736 LOC \n", + "1735 PER \n", + "1734 LOC \n", "\n", - " time_started time_stopped time_elapsed \n", - "0 \n", - "3 \n", - "5 \n", - "13 \n", - "19 \n", - ".. ... ... ... \n", - "21 \n", - "23 \n", - "24 \n", - "25 \n", - "27 \n", + " time_started time_stopped time_elapsed \n", + "30 \n", + "21 \n", + "5 \n", + "3 \n", + "0 \n", + "... ... ... ... 
\n", + "1738 \n", + "1737 \n", + "1736 \n", + "1735 \n", + "1734 \n", "\n", "[11590 rows x 12 columns]" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -4102,13 +3956,13 @@ "source": [ "# Reformat for output\n", "dev_and_test_results = all_results[all_results[\"fold\"].isin([\"dev\", \"test\"])]\n", - "in_gold_to_write, not_in_gold_to_write = util.csv_prep(dev_and_test_results, \"num_models\")\n", + "in_gold_to_write, not_in_gold_to_write = cleaning.analysis.csv_prep(dev_and_test_results, \"count\")\n", "in_gold_to_write" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -4132,7 +3986,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " model_span\n", @@ -4150,7 +4004,7 @@ " \n", " \n", " \n", - " 51\n", + " 29\n", " 17\n", " dev\n", " 2\n", @@ -4167,11 +4021,11 @@ " \n", " \n", " \n", - " 18\n", + " 25\n", " 17\n", " dev\n", " 6\n", - " [262, 267): 'Rotor'\n", + " [567, 572): 'Rotor'\n", " PER\n", " \n", " \n", @@ -4184,7 +4038,7 @@ " \n", " \n", " \n", - " 19\n", + " 20\n", " 17\n", " dev\n", " 6\n", @@ -4201,11 +4055,11 @@ " \n", " \n", " \n", - " 20\n", + " 16\n", " 17\n", " dev\n", " 6\n", - " [567, 572): 'Rotor'\n", + " [262, 267): 'Rotor'\n", " PER\n", " \n", " \n", @@ -4218,7 +4072,7 @@ " \n", " \n", " \n", - " 101\n", + " 142\n", " 17\n", " dev\n", " 11\n", @@ -4252,11 +4106,11 @@ " ...\n", " \n", " \n", - " 43\n", + " 1708\n", " 1\n", " test\n", " 228\n", - " [40, 43): 'SIX'\n", + " [771, 784): 'De Graafschap'\n", " ORG\n", " \n", " \n", @@ -4269,11 +4123,11 @@ " \n", " \n", " \n", - " 45\n", + " 1690\n", " 1\n", " test\n", " 228\n", - " [831, 845): 'Super Peasants'\n", + " [269, 287): 'Brazilian defender'\n", " MISC\n", " \n", " \n", @@ -4286,12 +4140,12 @@ " \n", " \n", " \n", - " 49\n", + " 1679\n", " 1\n", " test\n", " 228\n", - " [801, 811): 'Doetinchem'\n", - " MISC\n", + " [40, 43): 'SIX'\n", + " ORG\n", " \n", " \n", " \n", @@ -4303,7 +4157,7 @@ " \n", " \n", " \n", - " 30\n", + " 1724\n", " 1\n", " test\n", " 230\n", @@ -4320,7 +4174,7 @@ " \n", " \n", " \n", - " 31\n", + " 1727\n", " 1\n", " test\n", " 230\n", @@ -4342,49 +4196,49 @@ "" ], "text/plain": [ - " num_models fold doc_offset model_span \\\n", - "51 17 dev 2 [760, 765): 'Leeds' \n", - "18 17 dev 6 [262, 267): 'Rotor' \n", - "19 17 dev 6 [399, 404): 'Rotor' \n", - "20 17 dev 6 [567, 572): 'Rotor' \n", - "101 17 dev 11 [1961, 1975): 'Czech Republic' \n", - ".. ... ... ... ... \n", - "43 1 test 228 [40, 43): 'SIX' \n", - "45 1 test 228 [831, 845): 'Super Peasants' \n", - "49 1 test 228 [801, 811): 'Doetinchem' \n", - "30 1 test 230 [19, 29): 'ENGLISHMAN' \n", - "31 1 test 230 [19, 38): 'ENGLISHMAN CHARLTON' \n", + " count fold doc_offset model_span \\\n", + "29 17 dev 2 [760, 765): 'Leeds' \n", + "25 17 dev 6 [567, 572): 'Rotor' \n", + "20 17 dev 6 [399, 404): 'Rotor' \n", + "16 17 dev 6 [262, 267): 'Rotor' \n", + "142 17 dev 11 [1961, 1975): 'Czech Republic' \n", + "... ... ... ... ... \n", + "1708 1 test 228 [771, 784): 'De Graafschap' \n", + "1690 1 test 228 [269, 287): 'Brazilian defender' \n", + "1679 1 test 228 [40, 43): 'SIX' \n", + "1724 1 test 230 [19, 29): 'ENGLISHMAN' \n", + "1727 1 test 230 [19, 38): 'ENGLISHMAN CHARLTON' \n", "\n", - " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", - "51 LOC \n", - "18 PER \n", - "19 PER \n", - "20 PER \n", - "101 LOC \n", - ".. ... ... ... ... ... 
\n", - "43 ORG \n", - "45 MISC \n", - "49 MISC \n", - "30 LOC \n", - "31 PER \n", + " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", + "29 LOC \n", + "25 PER \n", + "20 PER \n", + "16 PER \n", + "142 LOC \n", + "... ... ... ... ... ... \n", + "1708 ORG \n", + "1690 MISC \n", + "1679 ORG \n", + "1724 LOC \n", + "1727 PER \n", "\n", - " correct_ent_type notes time_started time_stopped time_elapsed \n", - "51 \n", - "18 \n", - "19 \n", - "20 \n", - "101 \n", - ".. ... ... ... ... ... \n", - "43 \n", - "45 \n", - "49 \n", - "30 \n", - "31 \n", + " correct_ent_type notes time_started time_stopped time_elapsed \n", + "29 \n", + "25 \n", + "20 \n", + "16 \n", + "142 \n", + "... ... ... ... ... ... \n", + "1708 \n", + "1690 \n", + "1679 \n", + "1724 \n", + "1727 \n", "\n", "[4366 rows x 14 columns]" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -4395,7 +4249,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -4405,7 +4259,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -4429,7 +4283,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " corpus_span\n", @@ -4445,7 +4299,7 @@ " \n", " \n", " \n", - " 3\n", + " 1486\n", " 0\n", " train\n", " 6\n", @@ -4460,12 +4314,12 @@ " \n", " \n", " \n", - " 4\n", + " 1358\n", " 0\n", " train\n", " 24\n", - " [161, 169): 'Africans'\n", - " MISC\n", + " [384, 388): 'FLNC'\n", + " ORG\n", " \n", " \n", " \n", @@ -4475,12 +4329,12 @@ " \n", " \n", " \n", - " 7\n", + " 1355\n", " 0\n", " train\n", " 24\n", - " [384, 388): 'FLNC'\n", - " ORG\n", + " [161, 169): 'Africans'\n", + " MISC\n", " \n", " \n", " \n", @@ -4490,7 +4344,7 @@ " \n", " \n", " \n", - " 4\n", + " 1965\n", " 0\n", " train\n", " 25\n", @@ -4505,7 +4359,7 @@ " \n", " \n", " \n", - " 13\n", + " 1383\n", " 0\n", " train\n", " 28\n", @@ -4535,12 +4389,12 @@ " ...\n", " \n", " \n", - " 2\n", + " 4132\n", " 17\n", " train\n", " 945\n", - " [72, 79): 'English'\n", - " MISC\n", + " [130, 137): 'Preston'\n", + " ORG\n", " \n", " \n", " \n", @@ -4550,7 +4404,7 @@ " \n", " \n", " \n", - " 3\n", + " 4131\n", " 17\n", " train\n", " 945\n", @@ -4565,12 +4419,12 @@ " \n", " \n", " \n", - " 4\n", + " 4130\n", " 17\n", " train\n", " 945\n", - " [130, 137): 'Preston'\n", - " ORG\n", + " [72, 79): 'English'\n", + " MISC\n", " \n", " \n", " \n", @@ -4580,12 +4434,12 @@ " \n", " \n", " \n", - " 5\n", + " 4129\n", " 17\n", " train\n", " 945\n", - " [155, 162): 'Swansea'\n", - " ORG\n", + " [43, 49): 'LONDON'\n", + " LOC\n", " \n", " \n", " \n", @@ -4595,12 +4449,12 @@ " \n", " \n", " \n", - " 6\n", + " 4128\n", " 17\n", " train\n", " 945\n", - " [165, 172): 'Lincoln'\n", - " ORG\n", + " [19, 26): 'ENGLISH'\n", + " MISC\n", " \n", " \n", " \n", @@ -4615,44 +4469,44 @@ "" ], "text/plain": [ - " num_models fold doc_offset corpus_span \\\n", - "3 0 train 6 [121, 137): 'Toronto Dominion' \n", - "4 0 train 24 [161, 169): 'Africans' \n", - "7 0 train 24 [384, 388): 'FLNC' \n", - "4 0 train 25 [141, 151): 'mid-Norway' \n", - "13 0 train 28 [1133, 1135): 'EU' \n", - ".. ... ... ... ... 
\n", - "2 17 train 945 [72, 79): 'English' \n", - "3 17 train 945 [119, 127): 'Plymouth' \n", - "4 17 train 945 [130, 137): 'Preston' \n", - "5 17 train 945 [155, 162): 'Swansea' \n", - "6 17 train 945 [165, 172): 'Lincoln' \n", + " count fold doc_offset corpus_span \\\n", + "1486 0 train 6 [121, 137): 'Toronto Dominion' \n", + "1358 0 train 24 [384, 388): 'FLNC' \n", + "1355 0 train 24 [161, 169): 'Africans' \n", + "1965 0 train 25 [141, 151): 'mid-Norway' \n", + "1383 0 train 28 [1133, 1135): 'EU' \n", + "... ... ... ... ... \n", + "4132 17 train 945 [130, 137): 'Preston' \n", + "4131 17 train 945 [119, 127): 'Plymouth' \n", + "4130 17 train 945 [72, 79): 'English' \n", + "4129 17 train 945 [43, 49): 'LONDON' \n", + "4128 17 train 945 [19, 26): 'ENGLISH' \n", "\n", - " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", - "3 PER \n", - "4 MISC \n", - "7 ORG \n", - "4 MISC \n", - "13 ORG \n", - ".. ... ... ... ... ... \n", - "2 MISC \n", - "3 ORG \n", - "4 ORG \n", - "5 ORG \n", - "6 ORG \n", + " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", + "1486 PER \n", + "1358 ORG \n", + "1355 MISC \n", + "1965 MISC \n", + "1383 ORG \n", + "... ... ... ... ... ... \n", + "4132 ORG \n", + "4131 ORG \n", + "4130 MISC \n", + "4129 LOC \n", + "4128 MISC \n", "\n", - " time_started time_stopped time_elapsed \n", - "3 \n", - "4 \n", - "7 \n", - "4 \n", - "13 \n", - ".. ... ... ... \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", + " time_started time_stopped time_elapsed \n", + "1486 \n", + "1358 \n", + "1355 \n", + "1965 \n", + "1383 \n", + "... ... ... ... \n", + "4132 \n", + "4131 \n", + "4130 \n", + "4129 \n", + "4128 \n", "\n", "[23499 rows x 12 columns]" ] @@ -4665,13 +4519,13 @@ "source": [ "# Repeat for the contents of the original training set\n", "train_results = all_results[all_results[\"fold\"] == \"train\"]\n", - "in_gold_to_write, not_in_gold_to_write = util.csv_prep(train_results, \"num_models\")\n", + "in_gold_to_write, not_in_gold_to_write = cleaning.analysis.csv_prep(train_results, \"count\")\n", "in_gold_to_write" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -4695,7 +4549,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " model_span\n", @@ -4713,7 +4567,7 @@ " \n", " \n", " \n", - " 8\n", + " 1738\n", " 17\n", " train\n", " 3\n", @@ -4730,7 +4584,7 @@ " \n", " \n", " \n", - " 13\n", + " 1485\n", " 17\n", " train\n", " 6\n", @@ -4747,7 +4601,7 @@ " \n", " \n", " \n", - " 10\n", + " 1964\n", " 17\n", " train\n", " 25\n", @@ -4764,11 +4618,11 @@ " \n", " \n", " \n", - " 67\n", + " 2022\n", " 17\n", " train\n", " 29\n", - " [454, 468): 'Phil Mickelson'\n", + " [762, 774): 'Mark O'Meara'\n", " PER\n", " \n", " \n", @@ -4781,11 +4635,11 @@ " \n", " \n", " \n", - " 68\n", + " 1996\n", " 17\n", " train\n", " 29\n", - " [762, 774): 'Mark O'Meara'\n", + " [454, 468): 'Phil Mickelson'\n", " PER\n", " \n", " \n", @@ -4815,12 +4669,12 @@ " ...\n", " \n", " \n", - " 42\n", + " 4416\n", " 1\n", " train\n", " 943\n", - " [25, 41): 'SAN MARINO GRAND'\n", - " LOC\n", + " [25, 46): 'SAN MARINO GRAND PRIX'\n", + " PER\n", " \n", " \n", " \n", @@ -4832,12 +4686,12 @@ " \n", " \n", " \n", - " 60\n", + " 4461\n", " 1\n", " train\n", " 944\n", - " [11, 15): 'GOLF'\n", - " LOC\n", + " [25, 32): 'MASTERS'\n", + " MISC\n", " \n", " \n", " \n", @@ -4849,7 +4703,7 @@ " \n", " \n", " \n", - " 62\n", + " 4462\n", " 1\n", " train\n", " 944\n", @@ -4866,7 +4720,7 @@ 
" \n", " \n", " \n", - " 63\n", + " 4463\n", " 1\n", " train\n", " 944\n", @@ -4883,12 +4737,12 @@ " \n", " \n", " \n", - " 64\n", + " 4458\n", " 1\n", " train\n", " 944\n", - " [25, 32): 'MASTERS'\n", - " MISC\n", + " [11, 15): 'GOLF'\n", + " LOC\n", " \n", " \n", " \n", @@ -4901,50 +4755,50 @@ " \n", " \n", "\n", - "
-    "<p>5345 rows × 14 columns</p>\n",
\n", + "
+    "<p>5347 rows × 14 columns</p>\n",
\n", "" ], "text/plain": [ - " num_models fold doc_offset model_span \\\n", - "8 17 train 3 [0, 10): '-DOCSTART-' \n", - "13 17 train 6 [121, 137): 'Toronto Dominion' \n", - "10 17 train 25 [141, 151): 'mid-Norway' \n", - "67 17 train 29 [454, 468): 'Phil Mickelson' \n", - "68 17 train 29 [762, 774): 'Mark O'Meara' \n", - ".. ... ... ... ... \n", - "42 1 train 943 [25, 41): 'SAN MARINO GRAND' \n", - "60 1 train 944 [11, 15): 'GOLF' \n", - "62 1 train 944 [25, 32): 'MASTERS' \n", - "63 1 train 944 [17, 32): 'BRITISH MASTERS' \n", - "64 1 train 944 [25, 32): 'MASTERS' \n", + " count fold doc_offset model_span \\\n", + "1738 17 train 3 [0, 10): '-DOCSTART-' \n", + "1485 17 train 6 [121, 137): 'Toronto Dominion' \n", + "1964 17 train 25 [141, 151): 'mid-Norway' \n", + "2022 17 train 29 [762, 774): 'Mark O'Meara' \n", + "1996 17 train 29 [454, 468): 'Phil Mickelson' \n", + "... ... ... ... ... \n", + "4416 1 train 943 [25, 46): 'SAN MARINO GRAND PRIX' \n", + "4461 1 train 944 [25, 32): 'MASTERS' \n", + "4462 1 train 944 [25, 32): 'MASTERS' \n", + "4463 1 train 944 [17, 32): 'BRITISH MASTERS' \n", + "4458 1 train 944 [11, 15): 'GOLF' \n", "\n", - " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", - "8 LOC \n", - "13 LOC \n", - "10 LOC \n", - "67 PER \n", - "68 PER \n", - ".. ... ... ... ... ... \n", - "42 LOC \n", - "60 LOC \n", - "62 PER \n", - "63 LOC \n", - "64 MISC \n", + " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", + "1738 LOC \n", + "1485 LOC \n", + "1964 LOC \n", + "2022 PER \n", + "1996 PER \n", + "... ... ... ... ... ... \n", + "4416 PER \n", + "4461 MISC \n", + "4462 PER \n", + "4463 LOC \n", + "4458 LOC \n", "\n", - " correct_ent_type notes time_started time_stopped time_elapsed \n", - "8 \n", - "13 \n", - "10 \n", - "67 \n", - "68 \n", - ".. ... ... ... ... ... \n", - "42 \n", - "60 \n", - "62 \n", - "63 \n", - "64 \n", + " correct_ent_type notes time_started time_stopped time_elapsed \n", + "1738 \n", + "1485 \n", + "1964 \n", + "2022 \n", + "1996 \n", + "... ... ... ... ... ... \n", + "4416 \n", + "4461 \n", + "4462 \n", + "4463 \n", + "4458 \n", "\n", - "[5345 rows x 14 columns]" + "[5347 rows x 14 columns]" ] }, "execution_count": 28, @@ -4958,7 +4812,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -4990,7 +4844,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.8.10" } }, "nbformat": 4,