From eeec4ae6049c3ba7d37008489642b090db4f410f Mon Sep 17 00:00:00 2001 From: ZachEichen Date: Wed, 30 Jun 2021 11:50:18 -0400 Subject: [PATCH] ran and updated CoNLL4 notebook --- tutorials/corpus/CoNLL_4.ipynb | 3094 +++++++++++++++----------------- 1 file changed, 1474 insertions(+), 1620 deletions(-) diff --git a/tutorials/corpus/CoNLL_4.ipynb b/tutorials/corpus/CoNLL_4.ipynb index 3a4fcc1b..625d6f86 100644 --- a/tutorials/corpus/CoNLL_4.ipynb +++ b/tutorials/corpus/CoNLL_4.ipynb @@ -56,9 +56,8 @@ " \"from the directory containing this notebook, or use a Python \"\n", " \"environment on which you have used `pip` to install the package.\")\n", "\n", - "# Code shared among notebooks is kept in util.py, in this directory.\n", - "import util\n", - "\n", + "from text_extensions_for_pandas import cleaning\n", + " \n", "# BERT Configuration\n", "# Keep this in sync with `CoNLL_3.ipynb`.\n", "#bert_model_name = \"bert-base-uncased\"\n", @@ -156,57 +155,18 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2021-04-08 19:07:45,448\tINFO services.py:1174 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/plain": [ - "{'node_ip_address': '192.168.0.238',\n", - " 'raylet_ip_address': '192.168.0.238',\n", - " 'redis_address': '192.168.0.238:6379',\n", - " 'object_store_address': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089/sockets/plasma_store',\n", - " 'raylet_socket_name': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089/sockets/raylet',\n", - " 'webui_url': '127.0.0.1:8265',\n", - " 'session_dir': '/tmp/ray/session_2021-04-08_19-07-44_920647_24089',\n", - " 'metrics_export_port': 63726,\n", - " 'node_id': 'd3d3ce9b64423f35ff87532e918e52feb8744a9104c7b830498fd8c6'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Use Ray to make things faster\n", - "import ray\n", - "if ray.is_initialized():\n", - " ray.shutdown()\n", - "ray.init()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'train'...\n" + "preprocessing fold train\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "623678efe9214ad7a48b6736aaafda82", + "model_id": "2a902c1e0d31464493eb8627e4a8c334", "version_major": 2, "version_minor": 0 }, @@ -217,17 +177,24 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). 
Running this sequence through the model will result in indexing errors\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'dev'...\n" + "preprocessing fold dev\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "26b82e15bcac40ca96f13cc5a8f08dc9", + "model_id": "1cde38752f204098980b845f6995e3b8", "version_major": 2, "version_minor": 0 }, @@ -242,13 +209,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing fold 'test'...\n" + "preprocessing fold test\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e79c2d5063054dd5951690b86ce63d2b", + "model_id": "d241a32cd56f41cf9409268d4ff3b702", "version_major": 2, "version_minor": 0 }, @@ -261,199 +228,8 @@ } ], "source": [ - "# Retokenize with the BERT tokenizer and optinally regenerate embeddings.\n", - "\n", - "actor_pool = ray.util.actor_pool.ActorPool([\n", - " util.BertActor.remote(bert_model_name, token_class_dtype, \n", - " compute_embeddings=_REGENERATE_EMBEDDINGS)\n", - " for i in range(multiprocessing.cpu_count())\n", - "])\n", - "\n", - "bert_toks_by_fold = {}\n", - "for fold_name in corpus_raw.keys():\n", - " print(f\"Processing fold '{fold_name}'...\")\n", - " raw = corpus_raw[fold_name]\n", - " for tokens_df in raw:\n", - " actor_pool.submit(lambda a, v: a.process_doc.remote(v), tokens_df)\n", - " bert_toks_by_fold[fold_name] = tp.jupyter.run_with_progress_bar(\n", - " len(raw), lambda i: actor_pool.get_next())\n", - "\n", - " \n", - "# The actors will stay active until their associated Python objects\n", - "# go out of scope and are garbage-collected.\n", - "del actor_pool\n", - "gc.collect(0)\n", - " \n", - "bert_data = bert_toks_by_fold" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
folddoc_numtoken_idinput_idtoken_type_idattention_maskspecial_tokens_maskent_iobent_typetoken_classtoken_class_idembedding
0train0010101TrueO<NA>O0[ -0.098505184, -0.4050192, 0.7428884...
1train0111801FalseO<NA>O0[ -0.057021223, -0.48112097, 0.989868...
2train0214101FalseO<NA>O0[ -0.04824195, -0.25330004, 1.167191...
3train03924401FalseO<NA>O0[ -0.26682988, -0.31008753, 1.007472...
4train04927201FalseO<NA>O0[ -0.22296889, -0.21308492, 0.9331016...
\n", - "
" - ], - "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 0 0 101 0 1 \n", - "1 train 0 1 118 0 1 \n", - "2 train 0 2 141 0 1 \n", - "3 train 0 3 9244 0 1 \n", - "4 train 0 4 9272 0 1 \n", - "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "\n", - " embedding \n", - "0 [ -0.098505184, -0.4050192, 0.7428884... \n", - "1 [ -0.057021223, -0.48112097, 0.989868... \n", - "2 [ -0.04824195, -0.25330004, 1.167191... \n", - "3 [ -0.26682988, -0.31008753, 1.007472... \n", - "4 [ -0.22296889, -0.21308492, 0.9331016... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create a single dataframe of annotated tokens for the entire corpus\n", - "if _REGENERATE_EMBEDDINGS:\n", - " corpus_df = tp.io.conll.combine_folds(bert_data)\n", - " # We can't currently serialize span columns that cover multiple documents (see issue 73),\n", - " # so the Feather file won't contain them. Drop these columns for consistency when\n", - " # we regenerate the embeddings here.\n", - " cols_to_drop = [c for c in corpus_df.columns if \"span\" in c]\n", - " corpus_df.drop(columns=cols_to_drop, inplace=True)\n", - "else:\n", - " # Use embeddings computed in CoNLL_3.ipynb\n", - " _EMBEDDINGS_FILE = \"outputs/corpus.feather\"\n", - " if not os.path.exists(_EMBEDDINGS_FILE):\n", - " raise ValueError(f\"Precomputed embeddings not found at {_EMBEDDINGS_FILE}. \"\n", - " f\"Please rerun CoNLL_3.ipynb to regenerate this file, or \"\n", - " f\"set _REGENERATE_EMBEDDINGS to True in the previous cell.\")\n", - " corpus_df = pd.read_feather(\"outputs/corpus.feather\")\n", - "\n", - "corpus_df.head()" + "# Retokenize with the BERT tokenizer and regenerate embeddings.\n", + "corpus_df,token_class_dtype, int_to_label, label_to_int = cleaning.preprocess.preprocess_documents(corpus_raw,'ent_type',True,carry_cols=['line_num'],iob_col='ent_iob')" ] }, { @@ -467,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -573,7 +349,7 @@ "[1393 rows x 2 columns]" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -683,7 +459,7 @@ "1213 test 51" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -726,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -753,15 +529,19 @@ " fold\n", " doc_num\n", " token_id\n", + " span\n", " input_id\n", " token_type_id\n", " attention_mask\n", " special_tokens_mask\n", + " raw_span\n", + " line_num\n", + " raw_span_id\n", " ent_iob\n", " ent_type\n", + " embedding\n", " token_class\n", " token_class_id\n", - " embedding\n", " \n", " \n", " \n", @@ -770,75 +550,95 @@ " train\n", " 0\n", " 0\n", + " [0, 0): ''\n", " 101\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.098505184, -0.4050192, 0.7428884...\n", " O\n", " 0\n", - " [ -0.098505184, -0.4050192, 0.7428884...\n", " \n", " \n", " 1\n", " train\n", " 0\n", " 1\n", + " [0, 1): '-'\n", " 118\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ 
-0.057021223, -0.48112097, 0.989868...\n", " O\n", " 0\n", - " [ -0.057021223, -0.48112097, 0.989868...\n", " \n", " \n", " 2\n", " train\n", " 0\n", " 2\n", + " [1, 2): 'D'\n", " 141\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.04824195, -0.25330004, 1.167191...\n", " O\n", " 0\n", - " [ -0.04824195, -0.25330004, 1.167191...\n", " \n", " \n", " 3\n", " train\n", " 0\n", " 3\n", + " [2, 4): 'OC'\n", " 9244\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.26682988, -0.31008753, 1.007472...\n", " O\n", " 0\n", - " [ -0.26682988, -0.31008753, 1.007472...\n", " \n", " \n", " 4\n", " train\n", " 0\n", " 4\n", + " [4, 6): 'ST'\n", " 9272\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 0.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.22296889, -0.21308492, 0.9331016...\n", " O\n", " 0\n", - " [ -0.22296889, -0.21308492, 0.9331016...\n", " \n", " \n", " ...\n", @@ -854,131 +654,181 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 371472\n", " test\n", " 230\n", " 314\n", + " [1386, 1393): 'brother'\n", " 1711\n", " 0\n", " 1\n", " False\n", + " [1386, 1393): 'brother'\n", + " 50345.0\n", + " 267.0\n", " O\n", " <NA>\n", + " [ -0.028172785, -0.08062388, 0.9804888...\n", " O\n", " 0\n", - " [ -0.028172785, -0.08062388, 0.9804888...\n", " \n", " \n", " 371473\n", " test\n", " 230\n", " 315\n", + " [1393, 1394): ','\n", " 117\n", " 0\n", " 1\n", " False\n", + " [1393, 1394): ','\n", + " 50346.0\n", + " 268.0\n", " O\n", " <NA>\n", + " [ 0.11817408, -0.07008513, 0.865484...\n", " O\n", " 0\n", - " [ 0.11817408, -0.07008513, 0.865484...\n", " \n", " \n", " 371474\n", " test\n", " 230\n", " 316\n", + " [1395, 1400): 'Bobby'\n", " 5545\n", " 0\n", " 1\n", " False\n", + " [1395, 1400): 'Bobby'\n", + " 50347.0\n", + " 269.0\n", " B\n", " PER\n", - " B-PER\n", - " 4\n", " [ -0.35689482, 0.31400457, 1.573853...\n", + " B-PER\n", + " 3\n", " \n", " \n", " 371475\n", " test\n", " 230\n", " 317\n", + " [1400, 1401): '.'\n", " 119\n", " 0\n", " 1\n", " False\n", + " [1400, 1401): '.'\n", + " 50348.0\n", + " 270.0\n", " O\n", " <NA>\n", + " [ -0.18957126, -0.24581163, 0.66257...\n", " O\n", " 0\n", - " [ -0.18957126, -0.24581163, 0.66257...\n", " \n", " \n", " 371476\n", " test\n", " 230\n", " 318\n", + " [0, 0): ''\n", " 102\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.44689128, -0.31665266, 0.779688...\n", " O\n", " 0\n", - " [ -0.44689128, -0.31665266, 0.779688...\n", " \n", " \n", "\n", - "

371477 rows × 12 columns

\n", + "

371477 rows × 16 columns

\n", "" ], "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 0 0 101 0 1 \n", - "1 train 0 1 118 0 1 \n", - "2 train 0 2 141 0 1 \n", - "3 train 0 3 9244 0 1 \n", - "4 train 0 4 9272 0 1 \n", - "... ... ... ... ... ... ... \n", - "371472 test 230 314 1711 0 1 \n", - "371473 test 230 315 117 0 1 \n", - "371474 test 230 316 5545 0 1 \n", - "371475 test 230 317 119 0 1 \n", - "371476 test 230 318 102 0 1 \n", + " fold doc_num token_id span input_id \\\n", + "0 train 0 0 [0, 0): '' 101 \n", + "1 train 0 1 [0, 1): '-' 118 \n", + "2 train 0 2 [1, 2): 'D' 141 \n", + "3 train 0 3 [2, 4): 'OC' 9244 \n", + "4 train 0 4 [4, 6): 'ST' 9272 \n", + "... ... ... ... ... ... \n", + "371472 test 230 314 [1386, 1393): 'brother' 1711 \n", + "371473 test 230 315 [1393, 1394): ',' 117 \n", + "371474 test 230 316 [1395, 1400): 'Bobby' 5545 \n", + "371475 test 230 317 [1400, 1401): '.' 119 \n", + "371476 test 230 318 [0, 0): '' 102 \n", + "\n", + " token_type_id attention_mask special_tokens_mask \\\n", + "0 0 1 True \n", + "1 0 1 False \n", + "2 0 1 False \n", + "3 0 1 False \n", + "4 0 1 False \n", + "... ... ... ... \n", + "371472 0 1 False \n", + "371473 0 1 False \n", + "371474 0 1 False \n", + "371475 0 1 False \n", + "371476 0 1 True \n", "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "... ... ... ... ... ... \n", - "371472 False O O 0 \n", - "371473 False O O 0 \n", - "371474 False B PER B-PER 4 \n", - "371475 False O O 0 \n", - "371476 True O O 0 \n", + " raw_span line_num raw_span_id ent_iob ent_type \\\n", + "0 NaN NaN NaN O \n", + "1 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "2 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "3 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "4 [0, 10): '-DOCSTART-' 0.0 0.0 O \n", + "... ... ... ... ... ... \n", + "371472 [1386, 1393): 'brother' 50345.0 267.0 O \n", + "371473 [1393, 1394): ',' 50346.0 268.0 O \n", + "371474 [1395, 1400): 'Bobby' 50347.0 269.0 B PER \n", + "371475 [1400, 1401): '.' 50348.0 270.0 O \n", + "371476 NaN NaN NaN O \n", "\n", - " embedding \n", - "0 [ -0.098505184, -0.4050192, 0.7428884... \n", - "1 [ -0.057021223, -0.48112097, 0.989868... \n", - "2 [ -0.04824195, -0.25330004, 1.167191... \n", - "3 [ -0.26682988, -0.31008753, 1.007472... \n", - "4 [ -0.22296889, -0.21308492, 0.9331016... \n", - "... ... \n", - "371472 [ -0.028172785, -0.08062388, 0.9804888... \n", - "371473 [ 0.11817408, -0.07008513, 0.865484... \n", - "371474 [ -0.35689482, 0.31400457, 1.573853... \n", - "371475 [ -0.18957126, -0.24581163, 0.66257... \n", - "371476 [ -0.44689128, -0.31665266, 0.779688... \n", + " embedding token_class \\\n", + "0 [ -0.098505184, -0.4050192, 0.7428884... O \n", + "1 [ -0.057021223, -0.48112097, 0.989868... O \n", + "2 [ -0.04824195, -0.25330004, 1.167191... O \n", + "3 [ -0.26682988, -0.31008753, 1.007472... O \n", + "4 [ -0.22296889, -0.21308492, 0.9331016... O \n", + "... ... ... \n", + "371472 [ -0.028172785, -0.08062388, 0.9804888... O \n", + "371473 [ 0.11817408, -0.07008513, 0.865484... O \n", + "371474 [ -0.35689482, 0.31400457, 1.573853... B-PER \n", + "371475 [ -0.18957126, -0.24581163, 0.66257... O \n", + "371476 [ -0.44689128, -0.31665266, 0.779688... O \n", "\n", - "[371477 rows x 12 columns]" + " token_class_id \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "371472 0 \n", + "371473 0 \n", + "371474 3 \n", + "371475 0 \n", + "371476 0 \n", + "\n", + "[371477 rows x 16 columns]" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +842,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1019,15 +869,19 @@ " fold\n", " doc_num\n", " token_id\n", + " span\n", " input_id\n", " token_type_id\n", " attention_mask\n", " special_tokens_mask\n", + " raw_span\n", + " line_num\n", + " raw_span_id\n", " ent_iob\n", " ent_type\n", + " embedding\n", " token_class\n", " token_class_id\n", - " embedding\n", " \n", " \n", " \n", @@ -1036,75 +890,95 @@ " train\n", " 12\n", " 0\n", + " [0, 0): ''\n", " 101\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.101977676, -0.42442498, 0.8440171...\n", " O\n", " 0\n", - " [ -0.101977676, -0.42442498, 0.8440171...\n", " \n", " \n", " 1\n", " train\n", " 12\n", " 1\n", + " [0, 1): '-'\n", " 118\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.09124618, -0.47710702, 1.120292...\n", " O\n", " 0\n", - " [ -0.09124618, -0.47710702, 1.120292...\n", " \n", " \n", " 2\n", " train\n", " 12\n", " 2\n", + " [1, 2): 'D'\n", " 141\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.1695277, -0.27063507, 1.209566...\n", " O\n", " 0\n", - " [ -0.1695277, -0.27063507, 1.209566...\n", " \n", " \n", " 3\n", " train\n", " 12\n", " 3\n", + " [2, 4): 'OC'\n", " 9244\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.27648172, -0.3675844, 1.092024...\n", " O\n", " 0\n", - " [ -0.27648172, -0.3675844, 1.092024...\n", " \n", " \n", " 4\n", " train\n", " 12\n", " 4\n", + " [4, 6): 'ST'\n", " 9272\n", " 0\n", " 1\n", " False\n", + " [0, 10): '-DOCSTART-'\n", + " 2664.0\n", + " 0.0\n", " O\n", " <NA>\n", + " [ -0.24050614, -0.24247544, 1.07511...\n", " O\n", " 0\n", - " [ -0.24050614, -0.24247544, 1.07511...\n", " \n", " \n", " ...\n", @@ -1120,131 +994,181 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 45059\n", " test\n", " 225\n", " 75\n", + " [208, 213): 'fight'\n", " 2147\n", " 0\n", " 1\n", " False\n", + " [208, 213): 'fight'\n", + " 49418.0\n", + " 29.0\n", " O\n", " <NA>\n", + " [ -0.09621397, -0.48016888, 0.510937...\n", " O\n", " 0\n", - " [ -0.09621397, -0.48016888, 0.510937...\n", " \n", " \n", " 45060\n", " test\n", " 225\n", " 76\n", + " [214, 216): 'on'\n", " 1113\n", " 0\n", " 1\n", " False\n", + " [214, 216): 'on'\n", + " 49419.0\n", + " 30.0\n", " O\n", " <NA>\n", + " [ -0.0858628, -0.2341724, 0.832928...\n", " O\n", " 0\n", - " [ -0.0858628, -0.2341724, 0.832928...\n", " \n", " \n", " 45061\n", " test\n", " 225\n", " 77\n", + " [217, 225): 'Saturday'\n", " 4306\n", " 0\n", " 1\n", " False\n", + " [217, 225): 'Saturday'\n", + " 49420.0\n", + " 31.0\n", " O\n", " <NA>\n", + " [ -0.012238501, -0.4282664, 0.619483...\n", " O\n", " 0\n", - " [ -0.012238501, -0.4282664, 0.619483...\n", " \n", " \n", " 45062\n", " test\n", " 225\n", " 78\n", + " [225, 226): '.'\n", " 119\n", " 0\n", " 1\n", " False\n", + " [225, 226): '.'\n", + " 49421.0\n", + " 32.0\n", " O\n", " <NA>\n", + " [ -0.042955935, -0.36315423, 0.660203...\n", " O\n", " 0\n", - " [ -0.042955935, -0.36315423, 0.660203...\n", " \n", " \n", " 45063\n", " 
test\n", " 225\n", " 79\n", + " [0, 0): ''\n", " 102\n", " 0\n", " 1\n", " True\n", + " NaN\n", + " NaN\n", + " NaN\n", " O\n", " <NA>\n", + " [ -0.9504192, 0.012983555, 0.7374987...\n", " O\n", " 0\n", - " [ -0.9504192, 0.012983555, 0.7374987...\n", " \n", " \n", "\n", - "

45064 rows × 12 columns

\n", + "

45064 rows × 16 columns

\n", "" ], "text/plain": [ - " fold doc_num token_id input_id token_type_id attention_mask \\\n", - "0 train 12 0 101 0 1 \n", - "1 train 12 1 118 0 1 \n", - "2 train 12 2 141 0 1 \n", - "3 train 12 3 9244 0 1 \n", - "4 train 12 4 9272 0 1 \n", - "... ... ... ... ... ... ... \n", - "45059 test 225 75 2147 0 1 \n", - "45060 test 225 76 1113 0 1 \n", - "45061 test 225 77 4306 0 1 \n", - "45062 test 225 78 119 0 1 \n", - "45063 test 225 79 102 0 1 \n", + " fold doc_num token_id span input_id \\\n", + "0 train 12 0 [0, 0): '' 101 \n", + "1 train 12 1 [0, 1): '-' 118 \n", + "2 train 12 2 [1, 2): 'D' 141 \n", + "3 train 12 3 [2, 4): 'OC' 9244 \n", + "4 train 12 4 [4, 6): 'ST' 9272 \n", + "... ... ... ... ... ... \n", + "45059 test 225 75 [208, 213): 'fight' 2147 \n", + "45060 test 225 76 [214, 216): 'on' 1113 \n", + "45061 test 225 77 [217, 225): 'Saturday' 4306 \n", + "45062 test 225 78 [225, 226): '.' 119 \n", + "45063 test 225 79 [0, 0): '' 102 \n", + "\n", + " token_type_id attention_mask special_tokens_mask \\\n", + "0 0 1 True \n", + "1 0 1 False \n", + "2 0 1 False \n", + "3 0 1 False \n", + "4 0 1 False \n", + "... ... ... ... \n", + "45059 0 1 False \n", + "45060 0 1 False \n", + "45061 0 1 False \n", + "45062 0 1 False \n", + "45063 0 1 True \n", "\n", - " special_tokens_mask ent_iob ent_type token_class token_class_id \\\n", - "0 True O O 0 \n", - "1 False O O 0 \n", - "2 False O O 0 \n", - "3 False O O 0 \n", - "4 False O O 0 \n", - "... ... ... ... ... ... \n", - "45059 False O O 0 \n", - "45060 False O O 0 \n", - "45061 False O O 0 \n", - "45062 False O O 0 \n", - "45063 True O O 0 \n", + " raw_span line_num raw_span_id ent_iob ent_type \\\n", + "0 NaN NaN NaN O \n", + "1 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "2 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "3 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "4 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n", + "... ... ... ... ... ... \n", + "45059 [208, 213): 'fight' 49418.0 29.0 O \n", + "45060 [214, 216): 'on' 49419.0 30.0 O \n", + "45061 [217, 225): 'Saturday' 49420.0 31.0 O \n", + "45062 [225, 226): '.' 49421.0 32.0 O \n", + "45063 NaN NaN NaN O \n", "\n", - " embedding \n", - "0 [ -0.101977676, -0.42442498, 0.8440171... \n", - "1 [ -0.09124618, -0.47710702, 1.120292... \n", - "2 [ -0.1695277, -0.27063507, 1.209566... \n", - "3 [ -0.27648172, -0.3675844, 1.092024... \n", - "4 [ -0.24050614, -0.24247544, 1.07511... \n", - "... ... \n", - "45059 [ -0.09621397, -0.48016888, 0.510937... \n", - "45060 [ -0.0858628, -0.2341724, 0.832928... \n", - "45061 [ -0.012238501, -0.4282664, 0.619483... \n", - "45062 [ -0.042955935, -0.36315423, 0.660203... \n", - "45063 [ -0.9504192, 0.012983555, 0.7374987... \n", + " embedding token_class \\\n", + "0 [ -0.101977676, -0.42442498, 0.8440171... O \n", + "1 [ -0.09124618, -0.47710702, 1.120292... O \n", + "2 [ -0.1695277, -0.27063507, 1.209566... O \n", + "3 [ -0.27648172, -0.3675844, 1.092024... O \n", + "4 [ -0.24050614, -0.24247544, 1.07511... O \n", + "... ... ... \n", + "45059 [ -0.09621397, -0.48016888, 0.510937... O \n", + "45060 [ -0.0858628, -0.2341724, 0.832928... O \n", + "45061 [ -0.012238501, -0.4282664, 0.619483... O \n", + "45062 [ -0.042955935, -0.36315423, 0.660203... O \n", + "45063 [ -0.9504192, 0.012983555, 0.7374987... O \n", "\n", - "[45064 rows x 12 columns]" + " token_class_id \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... 
\n", + "45059 0 \n", + "45060 0 \n", + "45061 0 \n", + "45062 0 \n", + "45063 0 \n", + "\n", + "[45064 rows x 16 columns]" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1264,9 +1188,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-06-30 00:53:27,285\tINFO services.py:1272 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1288,22 +1219,22 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=643865.\n", + 
"\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n", "Model names after loading or training: 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64_2, 64_3, 64_4, 128_1, 128_2, 128_3, 128_4, 256_1, 256_2, 256_3, 256_4\n" ] @@ -1311,15 +1242,16 @@ ], "source": [ "import importlib\n", - "util = importlib.reload(util)\n", "import sklearn.linear_model\n", + "import ray\n", + "ray.init()\n", "\n", - "# Wrap util.train_reduced_model in a Ray task\n", + "# Wrap train_reduced_model in a Ray task\n", "@ray.remote\n", "def train_reduced_model_task(\n", " x_values: np.ndarray, y_values: np.ndarray, n_components: int,\n", " seed: int, max_iter: int = 10000) -> sklearn.base.BaseEstimator:\n", - " return util.train_reduced_model(x_values, y_values, n_components, seed, max_iter)\n", + " return cleaning.ensemble.train_reduced_model(x_values, y_values, n_components, seed, max_iter)\n", "\n", "# Ray task that trains a model using the entire embedding\n", "@ray.remote\n", @@ -1370,7 +1302,6 @@ " names_list.append(model_name)\n", " futures_list.append(train_reduced_model_task.remote(X_id, Y_id, \n", " num_dims, seed))\n", - " #models[model_name] = util.train_reduced_model(X, Y, num_dims, seed)\n", " \n", " # Block until all training tasks have completed and fetch the resulting models.\n", " models_list = ray.get(futures_list)\n", @@ -1400,7 +1331,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -1430,13 +1361,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "affbf755344e45cfb595819ca5a00614", + "model_id": "db850b403c7f4056b57362ca627295f0", "version_major": 2, "version_minor": 0 }, @@ -1446,6 +1377,86 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanent_typefolddoc_num
0[11, 16): 'Saudi'MISCtrain12
1[59, 65): 'MANAMA'LOCtrain12
2[86, 91): 'Saudi'MISCtrain12
3[259, 264): 'Saudi'MISCtrain12
0[55, 65): 'MONTGOMERY'LOCtrain20
\n", + "
" + ], + "text/plain": [ + " span ent_type fold doc_num\n", + "0 [11, 16): 'Saudi' MISC train 12\n", + "1 [59, 65): 'MANAMA' LOC train 12\n", + "2 [86, 91): 'Saudi' MISC train 12\n", + "3 [259, 264): 'Saudi' MISC train 12\n", + "0 [55, 65): 'MONTGOMERY' LOC train 20" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1465,46 +1476,219 @@ " todo = [(name, model) for name, model in models.items()]\n", " results = tp.jupyter.run_with_progress_bar(\n", " len(todo),\n", - " lambda i: util.analyze_model(test_df, int_to_label, todo[i][1], \n", - " bert_data, corpus_raw, expand_matches=True),\n", + " lambda i: cleaning.infer_and_extract_entities_iob(test_df,corpus_raw, int_to_label, todo[i][1]),\n", " \"model\"\n", " )\n", " return {t[0]: result for t, result in zip(todo, results)}\n", "\n", - "evals = eval_models(models, test_inputs_df)" + "evals = eval_models(models, test_inputs_df)\n", + "# display one of the results\n", + "evals[list(evals.keys())[0]].head()" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precisionrecallf1-scoredims
768_10.9471490.9388390.942976768
32_10.9240750.8637420.89289032
32_20.9247550.8753550.89937732
32_30.9250280.8660650.89457632
32_40.9329490.8761290.90364732
64_10.9400860.9029680.92115364
64_20.9383210.9029680.92030564
64_30.9368080.8952260.91554564
64_40.9408280.9027100.92137564
128_10.9444010.9249030.934550128
128_20.9475770.9236130.935442128
128_30.9432120.9215480.932254128
128_40.9409910.9218060.931300128
256_10.9492010.9354840.942293256
256_20.9433960.9290320.936159256
256_30.9454780.9308390.938101256
256_40.9450550.9321290.938547256
\n", + "
" + ], + "text/plain": [ + " precision recall f1-score dims\n", + "768_1 0.947149 0.938839 0.942976 768\n", + "32_1 0.924075 0.863742 0.892890 32\n", + "32_2 0.924755 0.875355 0.899377 32\n", + "32_3 0.925028 0.866065 0.894576 32\n", + "32_4 0.932949 0.876129 0.903647 32\n", + "64_1 0.940086 0.902968 0.921153 64\n", + "64_2 0.938321 0.902968 0.920305 64\n", + "64_3 0.936808 0.895226 0.915545 64\n", + "64_4 0.940828 0.902710 0.921375 64\n", + "128_1 0.944401 0.924903 0.934550 128\n", + "128_2 0.947577 0.923613 0.935442 128\n", + "128_3 0.943212 0.921548 0.932254 128\n", + "128_4 0.940991 0.921806 0.931300 128\n", + "256_1 0.949201 0.935484 0.942293 256\n", + "256_2 0.943396 0.929032 0.936159 256\n", + "256_3 0.945478 0.930839 0.938101 256\n", + "256_4 0.945055 0.932129 0.938547 256" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Summarize how each of the models does on the test set.\n", + "gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw,evals[list(evals.keys())[0]],label_col = 'ent_type')\n", "def make_summary_df(evals_df: pd.DataFrame) -> pd.DataFrame:\n", - " global_scores = [r[\"global_scores\"] for r in evals_df.values()]\n", - " return pd.DataFrame({\n", - " \"name\": list(evals_df.keys()),\n", - " \"dims\": pd.Series([n.split(\"_\")[0] for n in evals_df.keys()]).astype(int),\n", - " \"num_true_positives\": [r[\"num_true_positives\"] for r in global_scores],\n", - " \"num_entities\": [r[\"num_entities\"] for r in global_scores],\n", - " \"num_extracted\": [r[\"num_extracted\"] for r in global_scores],\n", - " \"precision\": [r[\"precision\"] for r in global_scores],\n", - " \"recall\": [r[\"recall\"] for r in global_scores],\n", - " \"F1\": [r[\"F1\"] for r in global_scores]\n", - " }).sort_values(\"dims\")\n", + " gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw, evals['256_4'], label_col = 'ent_type')\n", + " summary_df= cleaning.analysis.create_f1_report_ensemble_iob(evals,gold_elts)\n", + " summary_df['dims'] = [int(name.split('_')[0]) for name in evals.keys()]\n", + " return summary_df\n", "\n", - "summary_df = make_summary_df(evals)" + "summary_df = make_summary_df(evals)\n", + "summary_df" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAASAAAAEKCAYAAACytIjQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXbElEQVR4nO3dfZQX1X3H8ffHBcUYIygkR8EHNCiSGMVs1FRPNEaFeBKfalqpbdTaUFs1xsQ0cuJRQ9qjRvOktUZM0BqthhiK1NjgEyap9YFFjIiKIj7AaiIJPlRFlOXbP+YuDOvu8lt2Z+/ubz+vc36HmXtnZu/srB/vzPzmjiICM7McNsvdADMbuBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZVBZAkqZLelnSYx3US9LlkpZIelTSvqW6kyQ9nT4nVdVGM8uryh7QdcDETuo/C4xJn8nAVQCStgUuAPYH9gMukDSswnaaWSaVBVBE/AZY2ckiRwPXR+EBYKik7YEJwJ0RsTIiXgHupPMgM7N+alDGnz0SWFaaX57KOip/D0mTKXpPbLXVVh8fO3ZsNS01s07Nnz//jxExoqvr5QygbouIacA0gMbGxmhqasrcIrOBSdLzm7JezrtgzcCOpflRqayjcjOrMzkDaDbwxXQ37ADgtYh4CZgDHCFpWLr4fEQqM7M6U9kpmKSbgEOA4ZKWU9zZGgwQET8CbgeOBJYAbwGnpLqVkr4NzEubmhoRnV3MNrN+qrIAiohJG6kP4PQO6qYD06tol5n1Hf4mtJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlk2lASRpoqTFkpZIOred+p0l3S3pUUn3ShpVqmuR9Ej6zK6ynWaWx6CqNiypAbgSOBxYDsyTNDsiHi8tdhlwfUT8u6RDgYuAv0l1qyJin6raZ2b5VdkD2g9YEhFLI+Id4Gbg6DbLjAPuSdNz26k3szpWZQCNBJaV5pensrLfAcel6WOBrSVtl+aHSGqS9ICkY9r7AZImp2WaVqxY0YNNN7PekPsi9DnAwZIWAAcDzUBLqts5IhqBvwJ+IGm3titHxLSIaIyIxhEjRvRao82sZ1R2DYgiTHYszY9KZetExIukHpCk9wN/HhGvprrm9O9SSfcC44FnKmyvmfWyKgNoHjBG0miK4DmBojezjqThwMqIWAtMAaan8mHAWxGxOi1zIPCdCttqZiWzFjRz6ZzFvPjqKnYYuiVfn7AHx4xvewWl+yoLoIhYI+kMYA7QAEyPiEWSpgJNETEbOAS4SFIAvwFOT6vvCVwtaS3FaeLFbe6eWY166w/J6sesBc1MmbmQVe8WV0OaX13FlJkLAXr8b0cR0aMbzKWxsTGamppyN6NPafuHBLDl4AYuOm4vh5B16MCL76H51VXvKR85dEvuO/fQdteRND9ds+2S3BehrUKXzlm8QfgArHq3hUvnLM7UIusPXmwnfDor7w4HUB3rzT8kqx87DN2yS+Xd4QCqY+/bvKFL5WYAX5+wB1sO3vBvZMvBDXx9wh49/rOqvAtmmb31TkuXys1g/YXmfn0XbCA7b9ZCbnpwGS0RNEhM2n9H/vmYvXq9HR3dXqiP2w5WpWPGj+yVGxUOoB523qyF3PDAC+vmWyLWzecIIbO+zNeAethNDy7rUrnZQOYA6mEtHXyvqqPyKo3s4K5FR+Vmvc0BVMd6826G2abwNaA61pt3M8w2hQOozvXW3QyzTeFTsB427H2Du1RuNpA5gHrYBZ//CIMbtEHZ4AZxwec/kqlFZn2XT8F6mK+7mNXOAVQBX3cxq41PwcwsG/eASvrKM1xmA4UDKPEzXGa9z6dgyY2l8Kml3My6zwGUeOgKs97nADKzbBxAyVYdDFPaUbmZdZ8DKDl23/a/t9NRuZl1nwMomfvkii6Vm1n3OYASv8LGrPc5gJLefBeSmRUcQIlHDzTrff4mdOKn2M16nwOoxE+xm/Uun4KZWTYOIDPLZkCdgs1a0OxrPGZ9yIAJoFkLmpkycyGr3m0BoPnVVUyZuRDAIWSWyYA5Bbt0zuJ14dNq1bstXDpncaYWmVmlASRpoqTFkpZIOred+p0l3S3pUUn3ShpVqjtJ0tPpc1J32+JvOpv1PZUFkKQG4Ergs8A4YJKkcW0Wuwy4PiI+BkwFLkrrbgtcAOwP7AdcIGlYd9rjbzqb9T1V9oD2A5ZExNKIeAe4GTi6zTLjgHvS9NxS/QTgzohYGRGvAHcCE7vTmE+PHdGlcjOrXpUBNBJYVppfnsrKfgccl6aPBbaWtF2N6yJpsqQmSU0rVnT+1Lqfdjfre3JfhD4HOFjSAuBgoBlo6XyV9SJiWkQ0RkTjiBGd92R8Dcis76kpgCQdJOmUND1C0ugaVmsGdizNj0pl60TEixFxXESMB76Zyl6tZd2u8jUgs75nowEk6QLgG8CUVDQYuKGGbc8DxkgaLWlz4ARgdpttD5fU2oYpwPQ0PQc4QtKwdPH5iFS2yXwNyKzvqaUHdCxwFPAmFL0WYOuNrRQRa4AzKILjCWBGRCySNFXSUWmxQ4DFkp4CPgT8S1p3JfBtihCbB0xNZZvM14DM+p5avgn9TkSEpACQtFWtG4+I24Hb25SdX5q+Bbilg3Wns75H1G3NHVzr6ajczKpXSw9ohqSrgaGSvgTcBVxTbbN6XoPUpXIzq16nPSBJAn4GjAVeB/YAzo+IO3uhbT2qJdp/xWBH5WZWvU4DKJ163R4Re1F8GbDfGjl0y3ZPt0b6LphZNrWcgj0s6ROVt6RiHvPZrO+p5SL0/sCJkp6nuBMmis7RxyptWQ/zmM9mfU8tATSh8lb0Eo/5bNa3bPQULCKeB4YCn0+foanMzKxbavkm9FnAjcAH0+cGSWdW3TAzq3+1nIKdCuwfEW8CSLoEuB+4osqGmVn9q+UumNjwCfWWVGZm1i219ICuBR6U9J9p/hjgJ5W1yMwGjI0GUER8T9K9wEGp6JSIWFBpq8xsQNhoAEk6AFgUEQ+n+Q9I2j8iHqy8dWZW12q5BnQV8EZp/o1UZmbWLTVdhI5Y/8RmRKxlAL3Q0MyqU0sALZX0ZUmD0+csYGnVDTOz+ldLAJ0G/BnFmMzNFM+GTa6yUWY2MNRyF+xlivGczcx6VIc9IElfkjQmTUvSdEmvpdco79t7TTSzetXZKdhZwHNpehKwN7Ar8FXgh9U2y8wGgs4CaE1EvJumP0fxDvc/RcRdQM0D05uZdaSzAForaXtJQ4DPUAxG38rjmJpZt3V2Efp8oAloAGZHxCIASQfj2/Bm1gM6DKCIuE3SzsDWEfFKqaoJ+MvKW2ZmdW9jb8VYA7zSpuzNSltkZgNGLV9ENDOrhAPIzLLZpACSNLanG2JmA8+m9oDu6NFWmN
mA1OFFaEmXd1RF8ZoeM7Nu6ewu2CnA14DV7dRNqqY5ZjaQdBZA84DHIuJ/21ZIurCyFpnZgNFZAB0PvN1eRUSMrqY5ZjaQdHYR+v0R8VavtcTMBpzOAmhW64SkX1TfFDMbaDoLoPLbT3fdlI1LmihpsaQlks5tp34nSXMlLUgDnR2ZyneRtErSI+nzo035+WbWt3V2DSg6mK6JpAbgSuBwYDkwT9LsiHi8tNh5wIyIuErSOOB2YJdU90xE7NPVn2tm/UdnAbS3pNcpekJbpmnSfETEBzay7f2AJRGxFEDSzcDRQDmAAmjdzjbAi11sv5n1Y50Nx9HQzW2PBJaV5pdTvFGj7ELgDklnUoyyeFipbrSkBcDrwHkR8du2P0DSZNIbOnbaaaduNtfMelvuh1EnAddFxCjgSOCnkjYDXgJ2iojxFGNQ/4ek9/S4ImJaRDRGROOIESN6teFm1n1VBlAzsGNpflQqKzsVmAEQEfcDQ4DhEbE6Iv6UyucDzwC7V9hWM8ugygCaB4yRNFrS5hTvFpvdZpkXKMabRtKeFAG0QtKIdBEbSbsCY/AwsGZ1p7J3vEfEGklnAHMoxpWeHhGLJE0FmiJiNsWzZtdIOpvigvTJERGSPgVMlfQusBY4LSJWVtVWM8tDEV2+w94nNTY2RlNTU+5mmA1IkuZHRGNX18t9EdrMBjAHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZVNpAEmaKGmxpCWSzm2nfidJcyUtkPSopCNLdVPSeoslTaiynWaWx6CqNiypAbgSOBxYDsyTNDsiHi8tdh4wIyKukjQOuB3YJU2fAHwE2AG4S9LuEdFSVXvNrPdV2QPaD1gSEUsj4h3gZuDoNssE8IE0vQ3wYpo+Grg5IlZHxLPAkrQ9M6sjVQbQSGBZaX55Kiu7EPhrScspej9ndmFdJE2W1CSpacWKFT3VbjPrJbkvQk8CrouIUcCRwE8l1dymiJgWEY0R0ThixIjKGmlm1ajsGhDQDOxYmh+VyspOBSYCRMT9koYAw2tc18z6uSp7QPOAMZJGS9qc4qLy7DbLvAB8BkDSnsAQYEVa7gRJW0gaDYwBHqqwrWaWQWU9oIhYI+kMYA7QAEyPiEWSpgJNETEb+BpwjaSzKS5InxwRASySNAN4HFgDnO47YGb1R8V/7/1fY2NjNDU15W6G2YAkaX5ENHZ1vdwXoc1sAHMAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsHEBmlo0DyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8um0gCSNFHSYklLJJ3bTv33JT2SPk9JerVU11Kqm11lO80sj0FVbVhSA3AlcDiwHJgnaXZEPN66TEScXVr+TGB8aROrImKfqtpnZvlV2QPaD1gSEUsj4h3gZuDoTpafBNxUYXvMrI+prAcEjASWleaXA/u3t6CknYHRwD2l4iGSmoA1wMURMaud9SYDk9PsG5IWA8OBP3a79X2T961/Ggj7tvOmrFxlAHXFCcAtEdFSKts5Ipol7QrcI2lhRDxTXikipgHTymWSmiKisfom9z7vW//kfetYladgzcCOpflRqaw9J9Dm9CsimtO/S4F72fD6kJnVgSoDaB4wRtJoSZtThMx77mZJGgsMA+4vlQ2TtEWaHg4cCDzedl0z698qOwWLiDWSzgDmAA3A9IhYJGkq0BQRrWF0AnBzRERp9T2BqyWtpQjJi8t3zzZi2sYX6be8b/2T960D2vC/ezOz3uNvQptZNg4gM8umbgJoY4999HWSdpQ0V9LjkhZJOiuVbyvpTklPp3+HpXJJujzt76OS9s27BxsnqUHSAkm3pfnRkh5M+/CzdLMCSVuk+SWpfpesDa+BpKGSbpH0pKQnJH2yXo6dpLPT3+Rjkm6SNKSnjl1dBFDpsY/PAuOASZLG5W1Vl60BvhYR44ADgNPTPpwL3B0RY4C70zwU+zomfSYDV/V+k7vsLOCJ0vwlwPcj4sPAK8CpqfxU4JVU/v20XF/3Q+BXETEW2JtiP/v9sZM0Evgy0BgRH6W4oXQCPXXsIqLff4BPAnNK81OAKbnb1c19upXiObrFwPapbHtgcZq+GphUWn7dcn3xQ/E9sLuBQ4HbAFF8g3ZQ22NIcef0k2l6UFpOufehk33bBni2bRvr4dix/omGbdOxuA2Y0FPHri56QLT/2MfITG3pttRtHQ88CHwoIl5KVb8HPpSm+9s+/wD4J2Btmt8OeDUi1qT5cvvX7Vuqfy0t31eNBlYA16ZTzB9L2oo6OHZRfCH4MuAF4CWKYzGfHjp29RJAdUPS+4FfAF+JiNfLdVH8b6XffW9C0ueAlyNifu62VGQQsC9wVUSMB95k/ekW0K+P3TCKh8hHAzsAWwETe2r79RJAXXnso8+SNJgifG6MiJmp+A+Stk/12wMvp/L+tM8HAkdJeo5iVIRDKa6ZDJXU+mXYcvvX7Vuq3wb4U282uIuWA8sj4sE0fwtFINXDsTsMeDYiVkTEu8BMiuPZI8euXgKopsc++jJJAn4CPBER3ytVzQZOStMnUVwbai3/YrqjcgDwWqm736dExJSIGBURu1Acm3si4kRgLnB8WqztvrXu8/Fp+T7be4iI3wPLJO2Rij5D8ehQvz92FKdeB0h6X/obbd23njl2uS9y9eDFsiOBp4BngG/mbs8mtP8gii76o8Aj6XMkxfnz3cDTwF3Atml5Udz5ewZYSHGXIvt+1LCfhwC3peldgYeAJcDPgS1S+ZA0vyTV75q73TXs1z5AUzp+syieb6yLYwd8C3gSeAz4KbBFTx07P4phZtnUyymYmfVDDiAzy8YBZGbZOIDMLBsHkJll4wDqJySFpO+W5s+RdGEPbfs6ScdvfMlu/5wvpCfF57Yp30XSqvQYwxOSHpJ0cqn+qBwjHEjaQdItvf1zB5K+8lYM27jVwHGSLoqIPvOKF0mDYv0zQRtzKvCliPifduqeieIxBlS8CWWmJEXEtVEM39vrXyyNiBdZ/2U7q4B7QP3HGorxd89uW9G2ByPpjfTvIZJ+LelWSUslXSzpxNTDWChpt9JmDpPUpOIV2Z9L6zdIulTSvDRuzd+XtvtbFa/Mfs9Y3ZImpe0/JumSVHY+xZctfyLp0s52NIo3oXyVYhgIJJ0s6V9L+3qVpAfSPh0iaXrqOV1XasMRku6X9LCkn6dn7JD0nKRvpfKFKl6KgKSDtf5V4AskbZ16Zo+l+iGSrk3rLJD06VLbZkr6lYpxf75T+
t1dl34HCyW957iZe0D9zZXAo61/5DXam2KQ/5XAUuDHEbGfigHPzgS+kpbbheJttrsBcyV9GPgixWMCn1DxlpL7JN2Rlt8X+GhEPFv+YZJ2oBgD5uMU48TcIemYiJgq6VDgnIhoqqHdDwNjO6gbRjEExFEUPaMDgb+jeP33PhTPZp0HHBYRb0r6BkWgTU3r/zEi9pX0j8A5ad1zgNMj4r4UVm+3+ZmnUzxTulcKrTsk7Z7q9qEYvWA1sFjSFcAHgZFRjKGDpKE17POA4x5QPxLF0/HXk3oGNZoXES9FxGqKr/63BshCitBpNSMi1kbE0xRBNRY4guKZpUcohgbZjmIQLYCH2oZP8gng3igeXlwD3Ah8qgvtbaVO6v4riq/wLwT+EBELI2ItsCjt0wEUA9Pdl9p+Ehu+ubP1Qd/5rP8d3Ad8T9KXgaHtnFYeBNwAEBFPAs8DrQF0d0S8FhFvU/QId6b4He4q6QpJE4HXsfdwD6j/+QFF7+DaUtka0v9MJG0GbF6qW12aXluaX8uGx7/tMzlBEQJnRsSccoWkQyiGnKjSeDYcPbGsvA9t928Q0ALcGRGTNrJ+S1qeiLhY0i8pnr+7T9IE3tsL6ki5DS0UA3W9ImlvisG7TgP+AvjbGrc3YLgH1M9ExEpgBuuHwAR4juKUB4rTksGbsOkvSNosXRfalWKUvjnAP6gYJgRJu6sYaKszDwEHSxquYqjcScCvu9IQFQOyXQZc0cV9aPUAcGA6jUTSVqXTpY5+5m6pJ3UJxegKbU//fgucmJbdHdiJ4nfU0faGA5tFxC8oTgf79LjPubgH1D99FzijNH8NcKuk3wG/YtN6Jy9QhMcHgNMi4m1JP6Y4RXlYkihG/Tums41ExEvplvlcih7ULyPi1s7WSXaTtIDiaer/Ay6PiOs2YT+IiBUqbuPflK5dQRECT3Wy2lfSheXWU7n/phhGtdW/AVdJWkjR4zw5IlYXv5Z2jaQYIbH1f/JTNmVf6p2fhjezbHwKZmbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNv8PJWDR/pTDR+kAAAAASUVORK5CYII=\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAASAAAAEGCAYAAADFdkirAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAYQ0lEQVR4nO3df5RdZX3v8feHZIKjgIMkZZGEmwBiaGzB0DFCpU2uWhK6bPihbUlZFaxXrldRrE1uybILNV0s6g22/ijXVargpbqkiGmM2DpgDNbSipkwhBBxQoxaMqF1LAy2OkJ+fO8f+znJyZCZnJk5+zxnzvm81pqVvZ+99znfPSf55Nl7n/1sRQRmZjkcl7sAM2tfDiAzy8YBZGbZOIDMLBsHkJllMz13AfUyc+bMmD9/fu4yzNrS1q1bfxwRs8a7XcsE0Pz58+nt7c1dhllbkvTDiWznQzAzy8YBZGbZOIDMLBsHkJll4wAys2xa5iqYmdXPhr4B1vX0s3domNldnaxetoDLFs2p+/s4gMzsCBv6BlizfjvD+w4AMDA0zJr12wHqHkI+BDOzI6zr6T8UPhXD+w6wrqe/7u/lHlCLa1RX2lrH3qHhcbVPhntALazSlR4YGiY43JXe0DeQuzRrYrO7OsfVPhkOoBbWyK60tY7VyxbQ2THtiLbOjmmsXrag7u/lQ7AW1siutLWOyiG6r4LZpMzu6mTgKGFTRlfaWstli+Y05FyhD8FaWCO70mYT4R5QC2tkV9psIhxALa5RXWmzifAhmJllU2oASVouqV/SLkk3HGX5PEmbJD0q6QFJc0csP0nSHkl/WWadZpZHaQEkaRpwK3AJsBBYKWnhiNVuAe6MiHOBtcDNI5b/KfCPZdVoZnmV2QNaDOyKiN0R8TxwF3DpiHUWAl9P05url0v6FeBU4L4SazSzjMoMoDnAk1Xze1JbtW3AFWn6cuBESadIOg74CLBqrDeQdK2kXkm9g4ODdSrbzBol90noVcASSX3AEmAAOAC8E/j7iNgz1sYRcVtEdEdE96xZ434iiJllVuZl+AHg9Kr5uantkIjYS+oBSToBeFNEDEm6EPg1Se8ETgBmSPqviHjBiWwzm7rKDKAtwNmSzqAIniuB36teQdJM4OmIOAisAW4HiIirqta5Buh2+EyMh+OwZlbaIVhE7AeuA3qAx4G7I2KHpLWSVqTVlgL9knZSnHC+qax62tGGvgFW37PtiOE4Vt+zzcNxWNNQROSuoS66u7vDT0Y90qK19/HMz/a9oP3kF3fQd+PFGSqyViVpa0R0j3e73CehrURHC5+x2s0azQFkZtk4gFpYV2fHuNrNGs0B1MI+uOKVdBynI9o6jhMfXPHKTBWZHcnDcbQwjwdkzc4B1OI8HpA1MwdQi/MXEa2ZOYBaWCMfsWs2ET4J3cL8XDBrdu4BlaBZDnv8XDBrdu4B1VkzPQ65kY/YNZsIB1CdNdNhj58LZs3Oh2B11kyHPf4ekDU7B1Cddb2446g3e3a9OM/tD/4ekDUzH4LV2Wijm7TIqCdmdeUAqrNnh48+1MVo7WbtzAFUZ77yZFY7B1Cd+cqTWe18ErrOfOXJrHYOoBL4ypNZbXwIZmbZOIDMLBsfgpWgWW5GNWt2DqA68xg8ZrXzIVidNdPNqGbNzgFUZ810M6pZs3MA1dlLR3nm1mjtZu3MAVRn0vjazdqZA6jOhkZ57vpo7WbtzAFUZ74Z1ax2pQaQpOWS+iXtknTDUZbPk7RJ0qOSHpA0t6r9YUmPSNoh6R1l1llPvhnVrHalfQ9I0jTgVuA3gD3AFkkbI+I7VavdAtwZEf9P0uuAm4HfB54CLoyI5ySdADyWtt1bVr314ptRzWpX5hcRFwO7ImI3gKS7gEuB6gBaCLwvTW8GNgBExPNV6xzPFDtU9M2oZrUp8x/2HODJqvk9qa3aNuCKNH05cKKkUwAknS7p0fQaHz5a70fStZJ6JfUODg7WfQfMrFy5exargCWS+oAlwABwACAinoyIc4GXA1dLOnXkxhFxW0R0R0T3rFmzGlm3mdVBmQE0AJxeNT83tR0SEXsj4oqIWAS8P7UNjVwHeAz4tRJrNbMMygygLcDZks6QNAO4EthYvYKkmZIqNawBbk/tcyV1pumTgYsA30xl1mJKC6CI2A9cB/QAjwN3R8QOSWslrUirLQX6Je0ETgVuSu2/CDwkaRvwDeCWiNheVq1mloeiRR5Y1d3dHb29vbnLMGtLkrZGRPd4t8t9EtrM2pgDyMyycQCZWTYOIDPLxgFkZtk4gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCybmgJI0kWS3pqmZ0k6o9yyzKwdHDOAJH0A+GOKJ5cCdACfLbMoM2sPtfSALgdWAD+
FQ89qP7HMosysPUyvYZ3nIyIkBYCkl5RcUzYb+gZY19PP3qFhZnd1snrZAi5bNCd3WWYtq5YAulvSXwFdkt4O/AHw1+WW1Xgb+gZYfc829h0oHlU9MDTM6nu2ATiEzEoy5iGYJAF/C9wDfBFYANwYEZ9oQG0N9aEv7zgUPhX7DgQf+vKOTBWZtb4xe0Dp0OvvI+KXgfsbVFMWz/xs37jazWzyajkJ/bCkV5deiZm1nVrOAb0GuErSDymuhImic3RuqZU1WFdnB0PDL+ztdHV2ZKjGrD3U0gNaBpwFvA74LeCN6c9jkrRcUr+kXZJuOMryeZI2SXpU0gOS5qb2V0n6F0k70rLfrX2XJuaN5502rnYzm7xjBlBE/BDoogid3wK6UtuYJE0DbgUuARYCKyUtHLHaLcCdqTe1Frg5tf8MeEtEvBJYDnxUUlctOzRRm787OK52M5u8Wr4JfT3wOeAX0s9nJb27htdeDOyKiN0R8TxwF3DpiHUWAl9P05sryyNiZ0Q8kab3Aj8CZtXwnhO2d2h4XO1mNnm1HIK9DXhNRNwYETcCFwBvr2G7OcCTVfN7Ulu1bcAVafpy4ERJp1SvIGkxMAP43sg3kHStpF5JvYODk+upzO7qHFe7mU1eLQEk4EDV/IHUVg+rgCWS+oAlwED1e0k6Dfgb4K0RcXDkxhFxW0R0R0T3rFmT6yCtXraAzo5pR7R1dkxj9bIFk3pdMxtdLVfB7gAekvR3af4y4NM1bDcAnF41Pze1HZIOr64AkHQC8KaIGErzJwFfAd4fEd+q4f0mpfJtZ9+KYdY4iohjrySdD1yUZr8ZEX01bDMd2Am8niJ4tgC/FxE7qtaZCTwdEQcl3QQciIgbJc0A/gH4ckR8tJYd6e7ujt7e3lpWNbM6k7Q1IrrHu90xe0CSLgB2RMTDaf4kSa+JiIfG2i4i9ku6DugBpgG3R8QOSWuB3ojYCCwFbk43uv4j8K60+e8Avw6cIuma1HZNRDwy3h00s+Z1zB5QOj9zfqQVJR1HESDnN6C+mrkHZJbPRHtANZ2EjqqUSieDazl3ZGY2ploCaLek90jqSD/XA7vLLszMWl8tAfQO4FcpTiQPUNwbdm2ZRZlZezjmoVRE/Ai4sgG1mFmbGbUHJOntks5O05J0u6Rn082hTXUC2symprEOwa4HfpCmVwLnAWcC7wM+Vm5ZZtYOxgqg/RFRGSDnjRR3rf9HRHwNaNmB6c2sccYKoIOSTpP0IopvM3+tapnv0DSzSRvrJPSNQC/Ft5g3Vm6hkLQEX4Y3szoYNYAi4l5J84ATI+KZqkW9QOkjFObg54KZNdaxnoqxH3hmRNtPS60okw19A6xZv53hfcVoIANDw6xZvx3wc8HMylLLFxHbwrqe/kPhUzG87wDrevozVWTW+hxAiYdkNWu8CQWQpHPqXUhuHpLVrPEm2gO6r65VNAEPyWrWeKOehJb08dEWUTymp6V4SFazxhvrKthbgT8CnjvKspXllJPXZYvmOHDMGmisANoCPBYR/zxygaQPllaRmbWNsQLozcDPj7YgIs4opxwzaydjnYQ+ISJ+1rBKzKztjBVAGyoTkr5Yfilm1m7GCqDqp5+eWXYhZtZ+xgqgGGXazKwuxjoJfZ6kn1D0hDrTNGk+IuKk0qszs5Y21nAc00ZbZmZWD74Z1cyycQCZWTYOIDPLxgFkZtk4gMwsm1IDSNJySf2Sdkm64SjL50nalJ62+oCkuVXLvippSNK9ZdZoZvmUFkCSpgG3ApcAC4GVkhaOWO0WigcengusBW6uWrYO+P2y6jOz/MrsAS0GdkXE7oh4HrgLuHTEOguBr6fpzdXLI2IT8J8l1mdmmZUZQHOAJ6vm96S2atuAK9L05cCJkk6p9Q0kXSupV1Lv4ODgpIo1s8bLfRJ6FbBEUh+wBBgADoy9yWERcVtEdEdE96xZs8qq0cxKMuaDCSdpADi9an5uajskIvaSekCSTgDeFBFDJdZkZk2kzB7QFuBsSWdImgFcCWysXkHSTEmVGtYAt5dYj5k1mdICKD3W+TqgB3gcuDsidkhaK2lFWm0p0C9pJ3AqcFNle0nfBL4AvF7SHknLyqrVzPJQRGsM9dPd3R29vb25yzBrS5K2RkT3eLfLfRLazNqYA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCwbB5CZZeMAMrNsyhwTuuls6BtgXU8/e4eGmd3VyeplC7hs0cgHdZhZo7RNAG3oG2DN+u0M7yseujEwNMya9dsBHEJmmbTNIdi6nv5D4VMxvO8A63r6M1VkZm0TQHuHhsfVbmbla5sAmt3VOa52Mytf2wTQ6mUL6OyYdkRbZ8c0Vi9bkKkiM2ubk9CVE82+CmbWPNomgKAIIQeOWfNom0MwM2s+DiAzy8YBZGbZOIDMLBsHkJll4wAys2wcQGaWTakBJGm5pH5JuyTdcJTl8yRtkvSopAckza1adrWkJ9LP1WXWaWZ5lBZAkqYBtwKXAAuBlZIWjljtFuDOiDgXWAvcnLZ9GfAB4DXAYuADkk4uq1Yzy6PMb0IvBnZFxG4ASXcBlwLfqVpnIfC+NL0Z2JCmlwH3R8TTadv7geXA5ydTkAckM2suZR6CzQGerJrfk9qqbQOuSNOXAydKOqXGbZF0raReSb2Dg4NjFlMZkGxgaJjg8IBkG/oGxrVTZlY/uU9CrwKWSOoDlgADwIGxNzksIm6LiO6I6J41a9aY63pAMrPmU+Yh2ABwetX83NR2SETsJfWAJJ0AvCkihiQNAEtHbPvAZIrxgGRmzafMHtAW4GxJZ0iaAVwJbKxeQdJMSZUa1gC3p+ke4GJJJ6eTzxentgnzgGRmzae0AIqI/cB1FMHxOHB3ROyQtFbSirTaUqBf0k7gVOCmtO3TwJ9ShNgWYG3lhPREeUAys+ajiMhdQ110d3dHb2/vmOv4KphZOSRtjYju8W7nAcnMLJvcV8HMrI05gMwsGweQmWXjADKzbBxAZpaNA8jMsnEAmVk2DiAzy8YBZGbZOIDMLBsHkJll01b3gvlmVLPm0jYBVBmStTIqYmVIVsAhZJZJ2xyCeUhWs+bTNgHkIVnNmk/bBJCHZDVrPm0TQB6S1az5tM1J6MqJZl8FM2sebdMDMrPm0zY9IF+GN2s+bdMD8mV4s+bTNgHky/BmzadtAsiX4c2aT9sEkC/DmzWftjkJ7cvwZs2nbQII/GRUs2bTNodgZtZ8HEBmlo0DyMyycQCZWTYOIDPLRhGRu4a6kDQI/BCYCfw4czll8b5NTe2wb/MiYtZ4N26ZAKqQ1BsR3bnrKIP3bWryvo3Oh2Bmlo0DyMyyacUAui13ASXyvk1N3rdRtNw5IDObOlqxB2RmU4QDyMyyaZkAkrRcUr+kXZJuyF3PeEk6XdJmSd+RtEPS9an9ZZLul/RE+vPk1C5JH0/7+6ik8/PuwbFJmiapT9K9af4MSQ+lffhbSTNS+/FpfldaPj9r4TWQ1CXpHknflfS4pAtb5bOT9Ifp7+
Rjkj4v6UX1+uxaIoAkTQNuBS4BFgIrJS3MW9W47Qf+KCIWAhcA70r7cAOwKSLOBjaleSj29ez0cy3wycaXPG7XA49XzX8Y+IuIeDnwDPC21P424JnU/hdpvWb3MeCrEXEOcB7Ffk75z07SHOA9QHdE/BIwDbiSen12ETHlf4ALgZ6q+TXAmtx1TXKfvgT8BtAPnJbaTgP60/RfASur1j+0XjP+AHMp/hG+DrgXEMU3aKeP/AyBHuDCND09rafc+zDGvr0U+P7IGlvhswPmAE8CL0ufxb3Asnp9di3RA+LwL6liT2qbklK3dRHwEHBqRDyVFv0bcGqanmr7/FHgfwMH0/wpwFBE7E/z1fUf2re0/Nm0frM6AxgE7kiHmJ+S9BJa4LOLiAHgFuBfgacoPout1Omza5UAahmSTgC+CLw3In5SvSyK/1am3PcmJL0R+FFEbM1dS0mmA+cDn4yIRcBPOXy4BUzpz+5k4FKKkJ0NvARYXq/Xb5UAGgBOr5qfm9qmFEkdFOHzuYhYn5r/XdJpaflpwI9S+1Ta59cCKyT9ALiL4jDsY0CXpMqwwNX1H9q3tPylwH80suBx2gPsiYiH0vw9FIHUCp/dG4DvR8RgROwD1lN8nnX57FolgLYAZ6cz8zMoTpJtzFzTuEgS8Gng8Yj486pFG4Gr0/TVFOeGKu1vSVdULgCereruN5WIWBMRcyNiPsVn8/WIuArYDLw5rTZy3yr7/Oa0ftP2HiLi34AnJVUesfJ64Du0wGdHceh1gaQXp7+jlX2rz2eX+yRXHU+W/SawE/ge8P7c9Uyg/osouuiPAo+kn9+kOH7eBDwBfA14WVpfFFf+vgdsp7hKkX0/atjPpcC9afpM4NvALuALwPGp/UVpfldafmbuumvYr1cBvenz2wCc3CqfHfAh4LvAY8DfAMfX67PzrRhmlk2rHIKZ2RTkADKzbBxAZpaNA8jMsnEAmVk2DqApQlJI+kjV/CpJH6zTa39G0puPveak3+e3053im0e0z5c0nG5jeFzStyVdU7V8RY4RDiTNlnRPo9+3nUw/9irWJJ4DrpB0c0Q0zSNeJE2Pw/cEHcvbgLdHxD8dZdn3oriNAUlnAuslKSLuiIiNZPhiaUTs5fCX7awE7gFNHfspxt/9w5ELRvZgJP1X+nOppG9I+pKk3ZL+TNJVqYexXdJZVS/zBkm9kname7cq4/esk7QljVvzP6te95uSNlJ8K3ZkPSvT6z8m6cOp7UaKL1t+WtK6sXY0InYD76MYBgJJ10j6y6p9/aSkb6V9Wirp9tRz+kxVDRdL+hdJD0v6QrrHDkk/kPSh1L5d0jmpfYmkR9JPn6QTU8/ssbT8RZLuSNv0SfrvVbWtl/RVFeP+/J+q391n0u9gu6QXfG7mHtBUcyvwaOUveY3OA34ReBrYDXwqIharGPDs3cB703rzgcXAWcBmSS8H3kJxm8CrJR0PPCjpvrT++cAvRcT3q99M0myKMWB+hWKcmPskXRYRayW9DlgVEb011P0wcM4oy06mGAJiBUXP6LXA/wC2SHoVxb1ZfwK8ISJ+KumPKQJtbdr+xxFxvqR3AqvStquAd0XEgymsfj7iPd9FcU/pL6fQuk/SK9KyV1GMXvAc0C/pE8AvAHOiGEMHSV017HPbcQ9oConi7vg7ST2DGm2JiKci4jmKr/5XAmQ7RehU3B0RByPiCYqgOge4mOKepUcohgY5hWIQLYBvjwyf5NXAA1HcvLgf+Bzw6+Oot0JjLPtyFF/h3w78e0Rsj4iDwI60TxdQDEz3YKr9amBe1faVG323cvh38CDw55LeA3Qd5bDyIuCzABHxXYqn8FYCaFNEPBsRP6foEc6j+B2eKekTkpYDP8FewD2gqeejFL2DO6ra9pP+M5F0HDCjatlzVdMHq+YPcuTnP/KenKAIgXdHRE/1AklLKYacKNMijhw9sVr1Pozcv+nAAeD+iFh5jO0PpPWJiD+T9BWK++8elLSMF/aCRlNdwwGKgbqekXQexeBd7wB+B/iDGl+vbbgHNMVExNPA3RweAhPgBxSHPFAclnRM4KV/W9Jx6bzQmRSj9PUA/0vFMCFIeoWKgbbG8m1giaSZKobKXQl8YzyFqBiQ7RbgE+Pch4pvAa9Nh5FIeknV4dJo73lW6kl9mGJ0hZGHf98ErkrrvgL4bxS/o9FebyZwXER8keJwsKnHfc7FPaCp6SPAdVXzfw18SdI24KtMrHfyrxThcRLwjoj4uaRPURyiPCxJFKP+XTbWi0TEU+mS+WaKHtRXIuJLY22TnCWpj+Ju6v8EPh4Rn5nAfhARgyou438+nbuCIgR2jrHZe9OJ5cqh3D9QDKNa8X+BT0raTtHjvCYinit+LUc1h2KExMp/8msmsi+tznfDm1k2PgQzs2wcQGaWjQPIzLJxAJlZNg4gM8vGAWRm2TiAzCyb/w/cTs4P76EQxgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -1518,13 +1702,12 @@ "source": [ "# Plot the tradeoff between dimensionality and F1 score\n", "x = summary_df[\"dims\"]\n", - "y = summary_df[\"F1\"]\n", + "y = summary_df[\"f1-score\"]\n", "\n", "plt.figure(figsize=(4,4))\n", "plt.scatter(x, y)\n", "#plt.yscale(\"log\")\n", "#plt.xscale(\"log\")\n", - "plt.ylim([0.75, 1.0])\n", "plt.xlabel(\"Number of Dimensions\")\n", "plt.ylabel(\"F1 Score\")\n", "\n", @@ -1544,23 +1727,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6dcf56c7d5c942c48750c96a6d1edcf2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1582,149 +1751,65 @@ " \n", " \n", " \n", - " doc_num\n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " 768_1\n", - " 32_1\n", - " 32_2\n", - " 32_3\n", - " ...\n", - " 64_4\n", - " 128_1\n", - " 128_2\n", - " 128_3\n", - " 128_4\n", - " 256_1\n", - " 256_2\n", - " 256_3\n", - " 256_4\n", - " num_models\n", + " in_gold\n", + " count\n", + " models\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 1\n", - " 0\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 2\n", - " 0\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " True\n", - " True\n", - " True\n", - " False\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " 14\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 3\n", - " 0\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", - " True\n", - " True\n", - " True\n", - " False\n", - " False\n", - " ...\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", - " True\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", - " 4\n", - " 0\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", - " True\n", - " False\n", - " True\n", - " False\n", - " True\n", - " ...\n", - " False\n", - " True\n", - " True\n", - " False\n", - " False\n", - " False\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " False\n", - " False\n", - " 9\n", + " 17\n", + " [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64...\n", " \n", " \n", " ...\n", @@ -1735,202 +1820,105 @@ " ...\n", " ...\n", " ...\n", 
- " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", - " 13\n", - " 139\n", - " test\n", - " 225\n", - " [45, 48): 'IBF'\n", + " 374\n", + " dev\n", + " 149\n", + " [81, 93): 'Major League'\n", " MISC\n", - " False\n", - " False\n", - " False\n", - " True\n", - " False\n", - " ...\n", - " False\n", " True\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 2\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 14\n", - " 139\n", - " test\n", - " 225\n", - " [11, 17): 'BOXING'\n", - " LOC\n", - " False\n", - " False\n", - " False\n", - " False\n", + " 246\n", + " dev\n", + " 120\n", + " [63, 70): 'English'\n", + " MISC\n", " True\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 15\n", - " 139\n", - " test\n", - " 225\n", - " [86, 104): 'German Axel Schulz'\n", + " 78\n", + " dev\n", + " 64\n", + " [2571, 2575): 'AIDS'\n", " MISC\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 1\n", + " True\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 16\n", - " 139\n", - " test\n", - " 225\n", - " [19, 25): 'SCHULZ'\n", - " LOC\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " 3\n", + " dev\n", + " 21\n", + " [86, 90): 'UEFA'\n", + " ORG\n", " True\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", - " 17\n", - " 139\n", - " test\n", - " 225\n", - " [145, 158): 'International'\n", + " 0\n", + " dev\n", + " 21\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " ...\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " True\n", - " 1\n", + " 0\n", + " [GOLD]\n", " \n", " \n", "\n", - "

4927 rows × 24 columns

\n", + "

4928 rows × 7 columns

\n", "" ], "text/plain": [ - " doc_num fold doc_offset span ent_type \\\n", - "0 0 train 12 [11, 16): 'Saudi' MISC \n", - "1 0 train 12 [59, 65): 'MANAMA' LOC \n", - "2 0 train 12 [86, 91): 'Saudi' MISC \n", - "3 0 train 12 [259, 264): 'Saudi' MISC \n", - "4 0 train 12 [403, 412): 'One-month' MISC \n", - ".. ... ... ... ... ... \n", - "13 139 test 225 [45, 48): 'IBF' MISC \n", - "14 139 test 225 [11, 17): 'BOXING' LOC \n", - "15 139 test 225 [86, 104): 'German Axel Schulz' MISC \n", - "16 139 test 225 [19, 25): 'SCHULZ' LOC \n", - "17 139 test 225 [145, 158): 'International' ORG \n", - "\n", - " gold 768_1 32_1 32_2 32_3 ... 64_4 128_1 128_2 128_3 128_4 \\\n", - "0 True True True True True ... True True True True True \n", - "1 True True True True True ... True True True True True \n", - "2 True True True True False ... True True True True True \n", - "3 True True True False False ... True True True True True \n", - "4 True False True False True ... False True True False False \n", - ".. ... ... ... ... ... ... ... ... ... ... ... \n", - "13 False False False True False ... False True False False False \n", - "14 False False False False True ... False False False False False \n", - "15 False False False False False ... False False False False False \n", - "16 False False False False False ... False False False False False \n", - "17 False False False False False ... False False False False False \n", + " fold doc_num span ent_type in_gold count \\\n", + "4927 train 907 [590, 598): 'Gorleben' LOC True 17 \n", + "4925 train 907 [63, 67): 'BONN' LOC True 17 \n", + "4924 train 907 [11, 17): 'German' MISC True 17 \n", + "4923 train 896 [523, 528): 'China' LOC True 17 \n", + "4922 train 896 [512, 518): 'Mexico' LOC True 17 \n", + "... ... ... ... ... ... ... \n", + "374 dev 149 [81, 93): 'Major League' MISC True 0 \n", + "246 dev 120 [63, 70): 'English' MISC True 0 \n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True 0 \n", + "3 dev 21 [86, 90): 'UEFA' ORG True 0 \n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True 0 \n", "\n", - " 256_1 256_2 256_3 256_4 num_models \n", - "0 True True True True 17 \n", - "1 True True True True 17 \n", - "2 True True True True 14 \n", - "3 True True True True 13 \n", - "4 False True False False 9 \n", - ".. ... ... ... ... ... \n", - "13 False False False False 2 \n", - "14 False False False False 1 \n", - "15 False False False False 1 \n", - "16 False False False True 1 \n", - "17 False False False True 1 \n", + " models \n", + "4927 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4925 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4924 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4923 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "4922 [GOLD, 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64... \n", + "... ... 
\n", + "374 [GOLD] \n", + "246 [GOLD] \n", + "78 [GOLD] \n", + "3 [GOLD] \n", + "0 [GOLD] \n", "\n", - "[4927 rows x 24 columns]" + "[4928 rows x 7 columns]" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "full_results = util.merge_model_results(evals)\n", + "full_results = cleaning.flag_suspicious_labels(evals,'ent_type','ent_type',label_name='ent_type',gold_feats=gold_elts,align_over_cols=['fold','doc_num','span'],keep_cols=[],split_doc=False)\n", "full_results" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1955,58 +1943,58 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", " \n", " \n", - " 1\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", " 17\n", " \n", " \n", - " 2\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " 14\n", + " 17\n", " \n", " \n", - " 3\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", " \n", " \n", - " 4\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " 9\n", + " 17\n", " \n", " \n", " ...\n", @@ -2018,99 +2006,86 @@ " ...\n", " \n", " \n", - " 13\n", - " test\n", - " 225\n", - " [45, 48): 'IBF'\n", + " 374\n", + " dev\n", + " 149\n", + " [81, 93): 'Major League'\n", " MISC\n", - " False\n", - " 2\n", + " True\n", + " 0\n", " \n", " \n", - " 14\n", - " test\n", - " 225\n", - " [11, 17): 'BOXING'\n", - " LOC\n", - " False\n", - " 1\n", + " 246\n", + " dev\n", + " 120\n", + " [63, 70): 'English'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", - " 15\n", - " test\n", - " 225\n", - " [86, 104): 'German Axel Schulz'\n", + " 78\n", + " dev\n", + " 64\n", + " [2571, 2575): 'AIDS'\n", " MISC\n", - " False\n", - " 1\n", + " True\n", + " 0\n", " \n", " \n", - " 16\n", - " test\n", - " 225\n", - " [19, 25): 'SCHULZ'\n", - " LOC\n", - " False\n", - " 1\n", + " 3\n", + " dev\n", + " 21\n", + " [86, 90): 'UEFA'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", - " 17\n", - " test\n", - " 225\n", - " [145, 158): 'International'\n", + " 0\n", + " dev\n", + " 21\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", - " False\n", - " 1\n", + " True\n", + " 0\n", " \n", " \n", "\n", - "

4927 rows × 6 columns

\n", + "

4928 rows × 6 columns

\n", "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 train 12 [11, 16): 'Saudi' MISC True \n", - "1 train 12 [59, 65): 'MANAMA' LOC True \n", - "2 train 12 [86, 91): 'Saudi' MISC True \n", - "3 train 12 [259, 264): 'Saudi' MISC True \n", - "4 train 12 [403, 412): 'One-month' MISC True \n", - ".. ... ... ... ... ... \n", - "13 test 225 [45, 48): 'IBF' MISC False \n", - "14 test 225 [11, 17): 'BOXING' LOC False \n", - "15 test 225 [86, 104): 'German Axel Schulz' MISC False \n", - "16 test 225 [19, 25): 'SCHULZ' LOC False \n", - "17 test 225 [145, 158): 'International' ORG False \n", - "\n", - " num_models \n", - "0 17 \n", - "1 17 \n", - "2 14 \n", - "3 13 \n", - "4 9 \n", - ".. ... \n", - "13 2 \n", - "14 1 \n", - "15 1 \n", - "16 1 \n", - "17 1 \n", + " fold doc_num span ent_type in_gold count\n", + "4927 train 907 [590, 598): 'Gorleben' LOC True 17\n", + "4925 train 907 [63, 67): 'BONN' LOC True 17\n", + "4924 train 907 [11, 17): 'German' MISC True 17\n", + "4923 train 896 [523, 528): 'China' LOC True 17\n", + "4922 train 896 [512, 518): 'Mexico' LOC True 17\n", + "... ... ... ... ... ... ...\n", + "374 dev 149 [81, 93): 'Major League' MISC True 0\n", + "246 dev 120 [63, 70): 'English' MISC True 0\n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True 0\n", + "3 dev 21 [86, 90): 'UEFA' ORG True 0\n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True 0\n", "\n", - "[4927 rows x 6 columns]" + "[4928 rows x 6 columns]" ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop Boolean columns for now\n", - "results = full_results[[\"fold\", \"doc_offset\", \"span\", \"ent_type\", \"gold\", \"num_models\"]]\n", + "results = full_results[[\"fold\", \"doc_num\", \"span\", \"ent_type\", \"in_gold\", \"count\"]]\n", "results" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -2134,10 +2109,10 @@ " \n", " \n", " \n", - " count\n", + " num_ents\n", " \n", " \n", - " num_models\n", + " count\n", " \n", " \n", " \n", @@ -2188,11 +2163,11 @@ " \n", " \n", " 11\n", - " 42\n", + " 41\n", " \n", " \n", " 12\n", - " 47\n", + " 48\n", " \n", " \n", " 13\n", @@ -2208,53 +2183,53 @@ " \n", " \n", " 16\n", - " 247\n", + " 248\n", " \n", " \n", " 17\n", - " 2941\n", + " 2940\n", " \n", " \n", "\n", "" ], "text/plain": [ - " count\n", - "num_models \n", - "0 115\n", - "1 31\n", - "2 23\n", - "3 20\n", - "4 17\n", - "5 18\n", - "6 23\n", - "7 23\n", - "8 19\n", - "9 29\n", - "10 28\n", - "11 42\n", - "12 47\n", - "13 62\n", - "14 75\n", - "15 115\n", - "16 247\n", - "17 2941" + " num_ents\n", + "count \n", + "0 115\n", + "1 31\n", + "2 23\n", + "3 20\n", + "4 17\n", + "5 18\n", + "6 23\n", + "7 23\n", + "8 19\n", + "9 29\n", + "10 28\n", + "11 41\n", + "12 48\n", + "13 62\n", + "14 75\n", + "15 115\n", + "16 248\n", + "17 2940" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "(results[results[\"gold\"] == True][[\"num_models\", \"span\"]]\n", - " .groupby(\"num_models\").count()\n", - " .rename(columns={\"span\": \"count\"}))" + "(results[results[\"in_gold\"] == True][[\"count\", \"span\"]]\n", + " .groupby(\"count\").count()\n", + " .rename(columns={\"span\": \"num_ents\"}))" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2278,10 +2253,10 @@ " \n", " \n", " \n", - " count\n", + " num_ents\n", " \n", " \n", - " num_models\n", + " 
count\n", " \n", " \n", " \n", @@ -2292,7 +2267,7 @@ " \n", " \n", " 2\n", - " 173\n", + " 174\n", " \n", " \n", " 3\n", @@ -2304,11 +2279,11 @@ " \n", " \n", " 5\n", - " 51\n", + " 52\n", " \n", " \n", " 6\n", - " 27\n", + " 26\n", " \n", " \n", " 7\n", @@ -2320,19 +2295,19 @@ " \n", " \n", " 9\n", - " 18\n", + " 17\n", " \n", " \n", " 10\n", - " 11\n", + " 12\n", " \n", " \n", " 11\n", - " 10\n", + " 9\n", " \n", " \n", " 12\n", - " 8\n", + " 9\n", " \n", " \n", " 13\n", @@ -2359,41 +2334,41 @@ "" ], "text/plain": [ - " count\n", - "num_models \n", - "1 468\n", - "2 173\n", - "3 94\n", - "4 61\n", - "5 51\n", - "6 27\n", - "7 36\n", - "8 16\n", - "9 18\n", - "10 11\n", - "11 10\n", - "12 8\n", - "13 8\n", - "14 11\n", - "15 14\n", - "16 15\n", - "17 31" + " num_ents\n", + "count \n", + "1 468\n", + "2 174\n", + "3 94\n", + "4 61\n", + "5 52\n", + "6 26\n", + "7 36\n", + "8 16\n", + "9 17\n", + "10 12\n", + "11 9\n", + "12 9\n", + "13 8\n", + "14 11\n", + "15 14\n", + "16 15\n", + "17 31" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "(results[results[\"gold\"] == False][[\"num_models\", \"span\"]]\n", - " .groupby(\"num_models\").count()\n", - " .rename(columns={\"span\": \"count\"}))" + "(results[results[\"in_gold\"] == False][[\"count\", \"span\"]]\n", + " .groupby(\"count\").count()\n", + " .rename(columns={\"span\": \"num_ents\"}))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -2418,34 +2393,34 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 3\n", " dev\n", " 21\n", - " [25, 39): 'STANDARD LIEGE'\n", + " [86, 90): 'UEFA'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 0\n", " dev\n", " 21\n", - " [86, 90): 'UEFA'\n", + " [25, 39): 'STANDARD LIEGE'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 18\n", + " 78\n", " dev\n", " 64\n", " [2571, 2575): 'AIDS'\n", @@ -2454,7 +2429,7 @@ " 0\n", " \n", " \n", - " 2\n", + " 246\n", " dev\n", " 120\n", " [63, 70): 'English'\n", @@ -2463,7 +2438,7 @@ " 0\n", " \n", " \n", - " 2\n", + " 374\n", " dev\n", " 149\n", " [81, 93): 'Major League'\n", @@ -2472,25 +2447,25 @@ " 0\n", " \n", " \n", - " 19\n", + " 498\n", " dev\n", " 182\n", - " [662, 670): 'division'\n", - " MISC\n", + " [2173, 2177): 'Ruch'\n", + " ORG\n", " True\n", " 0\n", " \n", " \n", - " 46\n", + " 462\n", " dev\n", " 182\n", - " [2173, 2177): 'Ruch'\n", - " ORG\n", + " [662, 670): 'division'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 6\n", + " 512\n", " dev\n", " 203\n", " [879, 881): '90'\n", @@ -2499,106 +2474,106 @@ " 0\n", " \n", " \n", - " 5\n", + " 622\n", " dev\n", " 214\n", - " [187, 202): 'Michael Collins'\n", + " [1689, 1705): 'Schindler's List'\n", " MISC\n", " True\n", " 0\n", " \n", " \n", - " 7\n", + " 621\n", " dev\n", " 214\n", - " [285, 305): 'Venice Film Festival'\n", - " MISC\n", + " [1643, 1648): 'Oscar'\n", + " PER\n", " True\n", " 0\n", " \n", " \n", - " 33\n", + " 583\n", " dev\n", " 214\n", - " [1643, 1648): 'Oscar'\n", - " PER\n", + " [285, 305): 'Venice Film Festival'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 34\n", + " 569\n", " dev\n", " 214\n", - " [1689, 1705): 'Schindler's List'\n", + " [187, 202): 'Michael Collins'\n", " MISC\n", " True\n", " 0\n", " \n", " \n", - " 1\n", + " 802\n", " test\n", " 15\n", - " [32, 43): 'WEST 
INDIES'\n", - " LOC\n", + " [44, 56): 'WORLD SERIES'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 801\n", " test\n", " 15\n", - " [44, 56): 'WORLD SERIES'\n", - " MISC\n", + " [32, 43): 'WEST INDIES'\n", + " LOC\n", " True\n", " 0\n", " \n", " \n", - " 0\n", + " 942\n", " test\n", " 21\n", - " [22, 38): 'WORLD GRAND PRIX'\n", - " MISC\n", + " [719, 725): 'Wijaya'\n", + " PER\n", " True\n", " 0\n", " \n", " \n", - " 34\n", + " 896\n", " test\n", " 21\n", - " [719, 725): 'Wijaya'\n", - " PER\n", + " [22, 38): 'WORLD GRAND PRIX'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 2\n", + " 1057\n", " test\n", " 23\n", - " [94, 109): 'National Hockey'\n", + " [1117, 1127): 'NY RANGERS'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 3\n", + " 1052\n", " test\n", " 23\n", - " [110, 116): 'League'\n", + " [1106, 1113): 'TORONTO'\n", " ORG\n", " True\n", " 0\n", " \n", " \n", - " 10\n", + " 1025\n", " test\n", " 23\n", - " [427, 435): 'ATLANTIC'\n", - " LOC\n", + " [673, 689): 'CENTRAL DIVISION'\n", + " MISC\n", " True\n", " 0\n", " \n", " \n", - " 16\n", + " 1016\n", " test\n", " 23\n", " [599, 611): 'NY ISLANDERS'\n", @@ -2611,59 +2586,59 @@ "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True \n", - "2 dev 21 [86, 90): 'UEFA' ORG True \n", - "18 dev 64 [2571, 2575): 'AIDS' MISC True \n", - "2 dev 120 [63, 70): 'English' MISC True \n", - "2 dev 149 [81, 93): 'Major League' MISC True \n", - "19 dev 182 [662, 670): 'division' MISC True \n", - "46 dev 182 [2173, 2177): 'Ruch' ORG True \n", - "6 dev 203 [879, 881): '90' LOC True \n", - "5 dev 214 [187, 202): 'Michael Collins' MISC True \n", - "7 dev 214 [285, 305): 'Venice Film Festival' MISC True \n", - "33 dev 214 [1643, 1648): 'Oscar' PER True \n", - "34 dev 214 [1689, 1705): 'Schindler's List' MISC True \n", - "1 test 15 [32, 43): 'WEST INDIES' LOC True \n", - "2 test 15 [44, 56): 'WORLD SERIES' MISC True \n", - "0 test 21 [22, 38): 'WORLD GRAND PRIX' MISC True \n", - "34 test 21 [719, 725): 'Wijaya' PER True \n", - "2 test 23 [94, 109): 'National Hockey' ORG True \n", - "3 test 23 [110, 116): 'League' ORG True \n", - "10 test 23 [427, 435): 'ATLANTIC' LOC True \n", - "16 test 23 [599, 611): 'NY ISLANDERS' ORG True \n", + " fold doc_num span ent_type in_gold \\\n", + "3 dev 21 [86, 90): 'UEFA' ORG True \n", + "0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True \n", + "78 dev 64 [2571, 2575): 'AIDS' MISC True \n", + "246 dev 120 [63, 70): 'English' MISC True \n", + "374 dev 149 [81, 93): 'Major League' MISC True \n", + "498 dev 182 [2173, 2177): 'Ruch' ORG True \n", + "462 dev 182 [662, 670): 'division' MISC True \n", + "512 dev 203 [879, 881): '90' LOC True \n", + "622 dev 214 [1689, 1705): 'Schindler's List' MISC True \n", + "621 dev 214 [1643, 1648): 'Oscar' PER True \n", + "583 dev 214 [285, 305): 'Venice Film Festival' MISC True \n", + "569 dev 214 [187, 202): 'Michael Collins' MISC True \n", + "802 test 15 [44, 56): 'WORLD SERIES' MISC True \n", + "801 test 15 [32, 43): 'WEST INDIES' LOC True \n", + "942 test 21 [719, 725): 'Wijaya' PER True \n", + "896 test 21 [22, 38): 'WORLD GRAND PRIX' MISC True \n", + "1057 test 23 [1117, 1127): 'NY RANGERS' ORG True \n", + "1052 test 23 [1106, 1113): 'TORONTO' ORG True \n", + "1025 test 23 [673, 689): 'CENTRAL DIVISION' MISC True \n", + "1016 test 23 [599, 611): 'NY ISLANDERS' ORG True \n", "\n", - " num_models \n", - "0 0 \n", - "2 0 \n", - "18 0 \n", - "2 0 \n", - "2 0 \n", - "19 0 \n", - "46 0 \n", - "6 0 \n", - "5 0 
\n", - "7 0 \n", - "33 0 \n", - "34 0 \n", - "1 0 \n", - "2 0 \n", - "0 0 \n", - "34 0 \n", - "2 0 \n", - "3 0 \n", - "10 0 \n", - "16 0 " + " count \n", + "3 0 \n", + "0 0 \n", + "78 0 \n", + "246 0 \n", + "374 0 \n", + "498 0 \n", + "462 0 \n", + "512 0 \n", + "622 0 \n", + "621 0 \n", + "583 0 \n", + "569 0 \n", + "802 0 \n", + "801 0 \n", + "942 0 \n", + "896 0 \n", + "1057 0 \n", + "1052 0 \n", + "1025 0 \n", + "1016 0 " ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pull out some hard-to-find examples, sorting by document to make labeling easier\n", - "hard_to_get = results[results[\"gold\"]].sort_values([\"num_models\", \"fold\", \"doc_offset\"]).head(20)\n", + "hard_to_get = results[results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"]).head(20)\n", "hard_to_get" ] }, @@ -2676,7 +2651,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -2701,16 +2676,16 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 35\n", + " 373\n", " dev\n", " 149\n", " [81, 102): 'Major League Baseball'\n", @@ -2719,7 +2694,7 @@ " 17\n", " \n", " \n", - " 52\n", + " 570\n", " dev\n", " 214\n", " [187, 202): 'Michael Collins'\n", @@ -2728,7 +2703,7 @@ " 17\n", " \n", " \n", - " 47\n", + " 983\n", " test\n", " 23\n", " [94, 116): 'National Hockey League'\n", @@ -2737,25 +2712,25 @@ " 17\n", " \n", " \n", - " 43\n", + " 1110\n", " test\n", " 25\n", - " [823, 835): 'Philadelphia'\n", - " ORG\n", + " [856, 864): 'NFC East'\n", + " MISC\n", " False\n", " 17\n", " \n", " \n", - " 44\n", + " 1109\n", " test\n", " 25\n", - " [856, 864): 'NFC East'\n", - " MISC\n", + " [823, 835): 'Philadelphia'\n", + " ORG\n", " False\n", " 17\n", " \n", " \n", - " 25\n", + " 1184\n", " test\n", " 41\n", " [674, 688): 'Sporting Gijon'\n", @@ -2764,7 +2739,7 @@ " 17\n", " \n", " \n", - " 8\n", + " 1323\n", " test\n", " 114\n", " [51, 61): 'sales-USDA'\n", @@ -2773,25 +2748,25 @@ " 17\n", " \n", " \n", - " 13\n", + " 1367\n", " test\n", " 118\n", - " [535, 550): 'mid-Mississippi'\n", + " [776, 791): 'mid-Mississippi'\n", " LOC\n", " False\n", " 17\n", " \n", " \n", - " 15\n", + " 1362\n", " test\n", " 118\n", - " [776, 791): 'mid-Mississippi'\n", + " [535, 550): 'mid-Mississippi'\n", " LOC\n", " False\n", " 17\n", " \n", " \n", - " 53\n", + " 1509\n", " test\n", " 178\n", " [1787, 1800): 'Uruguay Round'\n", @@ -2800,25 +2775,25 @@ " 17\n", " \n", " \n", - " 31\n", + " 1560\n", " test\n", " 180\n", - " [259, 263): 'BILO'\n", + " [588, 592): 'BILO'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 32\n", + " 1558\n", " test\n", " 180\n", - " [286, 293): 'Malysia'\n", + " [579, 583): 'TOPS'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 34\n", + " 1550\n", " test\n", " 180\n", " [395, 399): 'BILO'\n", @@ -2827,25 +2802,25 @@ " 17\n", " \n", " \n", - " 36\n", + " 1544\n", " test\n", " 180\n", - " [579, 583): 'TOPS'\n", + " [286, 293): 'Malysia'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 37\n", + " 1542\n", " test\n", " 180\n", - " [588, 592): 'BILO'\n", + " [259, 263): 'BILO'\n", " ORG\n", " False\n", " 17\n", " \n", " \n", - " 96\n", + " 1649\n", " test\n", " 207\n", " [1041, 1047): 'Oxford'\n", @@ -2854,7 +2829,7 @@ " 17\n", " \n", " \n", - " 25\n", + " 1786\n", " test\n", " 219\n", " [368, 381): 'Koo Jeon Woon'\n", @@ -2863,25 +2838,25 
@@ " 17\n", " \n", " \n", - " 52\n", + " 1807\n", " test\n", " 222\n", - " [92, 114): 'National Hockey League'\n", + " [218, 225): 'EASTERN'\n", " MISC\n", " False\n", " 17\n", " \n", " \n", - " 53\n", + " 1805\n", " test\n", " 222\n", - " [218, 225): 'EASTERN'\n", + " [92, 114): 'National Hockey League'\n", " MISC\n", " False\n", " 17\n", " \n", " \n", - " 230\n", + " 2054\n", " train\n", " 48\n", " [885, 899): 'Sjeng Schalken'\n", @@ -2894,59 +2869,59 @@ "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "35 dev 149 [81, 102): 'Major League Baseball' MISC False \n", - "52 dev 214 [187, 202): 'Michael Collins' PER False \n", - "47 test 23 [94, 116): 'National Hockey League' MISC False \n", - "43 test 25 [823, 835): 'Philadelphia' ORG False \n", - "44 test 25 [856, 864): 'NFC East' MISC False \n", - "25 test 41 [674, 688): 'Sporting Gijon' ORG False \n", - "8 test 114 [51, 61): 'sales-USDA' ORG False \n", - "13 test 118 [535, 550): 'mid-Mississippi' LOC False \n", - "15 test 118 [776, 791): 'mid-Mississippi' LOC False \n", - "53 test 178 [1787, 1800): 'Uruguay Round' MISC False \n", - "31 test 180 [259, 263): 'BILO' ORG False \n", - "32 test 180 [286, 293): 'Malysia' ORG False \n", - "34 test 180 [395, 399): 'BILO' ORG False \n", - "36 test 180 [579, 583): 'TOPS' ORG False \n", - "37 test 180 [588, 592): 'BILO' ORG False \n", - "96 test 207 [1041, 1047): 'Oxford' ORG False \n", - "25 test 219 [368, 381): 'Koo Jeon Woon' PER False \n", - "52 test 222 [92, 114): 'National Hockey League' MISC False \n", - "53 test 222 [218, 225): 'EASTERN' MISC False \n", - "230 train 48 [885, 899): 'Sjeng Schalken' ORG False \n", + " fold doc_num span ent_type in_gold \\\n", + "373 dev 149 [81, 102): 'Major League Baseball' MISC False \n", + "570 dev 214 [187, 202): 'Michael Collins' PER False \n", + "983 test 23 [94, 116): 'National Hockey League' MISC False \n", + "1110 test 25 [856, 864): 'NFC East' MISC False \n", + "1109 test 25 [823, 835): 'Philadelphia' ORG False \n", + "1184 test 41 [674, 688): 'Sporting Gijon' ORG False \n", + "1323 test 114 [51, 61): 'sales-USDA' ORG False \n", + "1367 test 118 [776, 791): 'mid-Mississippi' LOC False \n", + "1362 test 118 [535, 550): 'mid-Mississippi' LOC False \n", + "1509 test 178 [1787, 1800): 'Uruguay Round' MISC False \n", + "1560 test 180 [588, 592): 'BILO' ORG False \n", + "1558 test 180 [579, 583): 'TOPS' ORG False \n", + "1550 test 180 [395, 399): 'BILO' ORG False \n", + "1544 test 180 [286, 293): 'Malysia' ORG False \n", + "1542 test 180 [259, 263): 'BILO' ORG False \n", + "1649 test 207 [1041, 1047): 'Oxford' ORG False \n", + "1786 test 219 [368, 381): 'Koo Jeon Woon' PER False \n", + "1807 test 222 [218, 225): 'EASTERN' MISC False \n", + "1805 test 222 [92, 114): 'National Hockey League' MISC False \n", + "2054 train 48 [885, 899): 'Sjeng Schalken' ORG False \n", "\n", - " num_models \n", - "35 17 \n", - "52 17 \n", - "47 17 \n", - "43 17 \n", - "44 17 \n", - "25 17 \n", - "8 17 \n", - "13 17 \n", - "15 17 \n", - "53 17 \n", - "31 17 \n", - "32 17 \n", - "34 17 \n", - "36 17 \n", - "37 17 \n", - "96 17 \n", - "25 17 \n", - "52 17 \n", - "53 17 \n", - "230 17 " + " count \n", + "373 17 \n", + "570 17 \n", + "983 17 \n", + "1110 17 \n", + "1109 17 \n", + "1184 17 \n", + "1323 17 \n", + "1367 17 \n", + "1362 17 \n", + "1509 17 \n", + "1560 17 \n", + "1558 17 \n", + "1550 17 \n", + "1544 17 \n", + "1542 17 \n", + "1649 17 \n", + "1786 17 \n", + "1807 17 \n", + "1805 17 \n", + "2054 17 " ] }, - "execution_count": 21, + "execution_count": 
19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hardest results not in the gold standard for models to avoid\n", - "hard_to_avoid = results[~results[\"gold\"]].sort_values([\"num_models\", \"fold\", \"doc_offset\"], ascending=[False, True, True]).head(20)\n", + "hard_to_avoid = results[~results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"], ascending=[False, True, True]).head(20)\n", "hard_to_avoid" ] }, @@ -2969,7 +2944,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2994,29 +2969,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training 
model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aa0c4f31711d4ebebabc172823af60c4", + "model_id": "01da45dd78b4496da2463cbd219c07a9", "version_major": 2, "version_minor": 0 }, @@ -3027,20 +3002,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3ebe24b3d5b34dc5838c6d42fdaabe9e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3064,29 +3025,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with 
n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be8b3f55fef141a9a6c003cc8e7bfc4c", + "model_id": "cb3412eb0b714433a507d2610a885409", "version_major": 2, "version_minor": 0 }, @@ -3097,20 +3058,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "707100b8c15846c0a46a2dced9644de9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=140, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3134,29 +3081,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and 
seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c4f19b309a724640a23d0f261b7c40cf", + "model_id": "f4df45a4607b4dcba42ea31c2a6cd13e", "version_major": 2, "version_minor": 0 }, @@ -3167,20 +3114,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5b8a9fdfc1e247d69b46e7d736509cfd", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3201,51 +3134,37 @@ "Training model '128_3' (#3 at 128 dimensions) with seed 839748\n", "Training model '128_4' (#4 at 128 dimensions) with seed 450385\n", "Training model '256_1' (#1 at 256 dimensions) with seed 781567\n", - "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", - "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", - "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=526478.\n", - 
"\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=822761.\n", - "Trained 17 models.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1f077611cbc14f9ba4007f7d94edd4ce", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…" - ] - }, - "metadata": {}, - "output_type": "display_data" + "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", + "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", + "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "Trained 17 models.\n" + ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8e3d4523d19a405db777e4fb25d481aa", + "model_id": "b9e94c8d6456418284358fa0264359f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" + "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…" ] }, "metadata": {}, @@ -3274,29 +3193,29 @@ "Training model '256_2' (#2 at 256 
dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "db4060ae0b2044c3b63224e5810ae14e", + "model_id": "40d1bd81bc0c4a52a93b9c72baa2e8e7", 
"version_major": 2, "version_minor": 0 }, @@ -3307,20 +3226,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c1ed240bfef348aaa605df2175954a79", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3344,29 +3249,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=839748.\n", + 
"\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3f7afaeaec744aa2a09ae158f43b363e", + "model_id": "0c4b799c0fc34aa19593b7496b7dafa5", "version_major": 2, "version_minor": 0 }, @@ -3377,20 +3282,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d5d6db92f1fd4f49bf1fed41df60d803", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3414,29 +3305,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=402414.\n", + 
"\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be3f25acbfbb4a778af688e75c761cfe", + "model_id": "3c72920296f44cdca60a5b22951c409d", "version_major": 2, "version_minor": 0 }, @@ -3447,20 +3338,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "58bc95e025ae454eb7d5e8e230c36628", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3484,29 +3361,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=256 and seed=402414.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=256 and seed=822761.\n", + 
"\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0a548fe09cb64771b3e30662428db940", + "model_id": "9ce8252670bb4302bd9d3a021474493a", "version_major": 2, "version_minor": 0 }, @@ -3517,20 +3394,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8a17fea4c2234d018b60c8a36612248f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3554,29 +3417,29 @@ "Training model '256_2' (#2 at 256 dimensions) with seed 643865\n", "Training model '256_3' (#3 at 256 dimensions) with seed 402414\n", "Training model '256_4' (#4 at 256 dimensions) with seed 822761\n", - "\u001b[2m\u001b[36m(pid=24222)\u001b[0m Training model with n_components=128 and seed=450385.\n", - "\u001b[2m\u001b[36m(pid=24226)\u001b[0m Training model with n_components=256 and seed=643865.\n", - "\u001b[2m\u001b[36m(pid=24230)\u001b[0m Training model with n_components=128 and seed=513226.\n", - "\u001b[2m\u001b[36m(pid=24223)\u001b[0m Training model with n_components=128 and seed=839748.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=32 and seed=773956.\n", - "\u001b[2m\u001b[36m(pid=24225)\u001b[0m Training model with n_components=32 and seed=654571.\n", - "\u001b[2m\u001b[36m(pid=24235)\u001b[0m Training model with n_components=128 and seed=128113.\n", - "\u001b[2m\u001b[36m(pid=24227)\u001b[0m Training model with n_components=256 and seed=781567.\n", - "\u001b[2m\u001b[36m(pid=24231)\u001b[0m Training model with n_components=64 and seed=526478.\n", - "\u001b[2m\u001b[36m(pid=24228)\u001b[0m Training model with n_components=256 and seed=402414.\n", - 
"\u001b[2m\u001b[36m(pid=24234)\u001b[0m Training model with n_components=64 and seed=975622.\n", - "\u001b[2m\u001b[36m(pid=24233)\u001b[0m Training model with n_components=64 and seed=94177.\n", - "\u001b[2m\u001b[36m(pid=24232)\u001b[0m Training model with n_components=64 and seed=201469.\n", - "\u001b[2m\u001b[36m(pid=24236)\u001b[0m Training model with n_components=32 and seed=89250.\n", - "\u001b[2m\u001b[36m(pid=24229)\u001b[0m Training model with n_components=32 and seed=438878.\n", - "\u001b[2m\u001b[36m(pid=24224)\u001b[0m Training model with n_components=256 and seed=822761.\n", + "\u001b[2m\u001b[36m(pid=1424)\u001b[0m Training model with n_components=128 and seed=128113.\n", + "\u001b[2m\u001b[36m(pid=1427)\u001b[0m Training model with n_components=128 and seed=513226.\n", + "\u001b[2m\u001b[36m(pid=1421)\u001b[0m Training model with n_components=64 and seed=526478.\n", + "\u001b[2m\u001b[36m(pid=1428)\u001b[0m Training model with n_components=128 and seed=839748.\n", + "\u001b[2m\u001b[36m(pid=1422)\u001b[0m Training model with n_components=32 and seed=773956.\n", + "\u001b[2m\u001b[36m(pid=1431)\u001b[0m Training model with n_components=256 and seed=781567.\n", + "\u001b[2m\u001b[36m(pid=1430)\u001b[0m Training model with n_components=256 and seed=402414.\n", + "\u001b[2m\u001b[36m(pid=1432)\u001b[0m Training model with n_components=256 and seed=643865.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=32 and seed=89250.\n", + "\u001b[2m\u001b[36m(pid=1426)\u001b[0m Training model with n_components=64 and seed=201469.\n", + "\u001b[2m\u001b[36m(pid=1419)\u001b[0m Training model with n_components=32 and seed=438878.\n", + "\u001b[2m\u001b[36m(pid=1425)\u001b[0m Training model with n_components=64 and seed=975622.\n", + "\u001b[2m\u001b[36m(pid=1423)\u001b[0m Training model with n_components=64 and seed=94177.\n", + "\u001b[2m\u001b[36m(pid=1420)\u001b[0m Training model with n_components=32 and seed=654571.\n", + "\u001b[2m\u001b[36m(pid=1429)\u001b[0m Training model with n_components=128 and seed=450385.\n", + "\u001b[2m\u001b[36m(pid=1434)\u001b[0m Training model with n_components=256 and seed=822761.\n", "Trained 17 models.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bdd6e0d863bc457ba79e180359026d1a", + "model_id": "d30ee772e8f94e4b8d5dd8626169bb69", "version_major": 2, "version_minor": 0 }, @@ -3587,20 +3450,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fc9817ab93d24828a877d8e324ec2e48", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=139, style=ProgressStyle(desc…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -3626,9 +3475,14 @@ " _models = maybe_train_models(_train_inputs_df, fold_ix)\n", " _evals = eval_models(_models, _test_inputs_df)\n", " _summary_df = make_summary_df(_evals)\n", - " _full_results = util.merge_model_results(_evals)\n", - " _results = _full_results[[\"fold\", \"doc_offset\", \"span\", \n", - " \"ent_type\", \"gold\", \"num_models\"]]\n", + " _gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw,_evals[list(evals.keys())[0]])\n", + " _full_results = cleaning.flag_suspicious_labels(_evals,'ent_type','ent_type',\n", + " label_name='ent_type',\n", + " gold_feats=_gold_elts,\n", + " 
align_over_cols=['fold','doc_num','span'],\n", + " keep_cols=[],split_doc=False)\n", + " _results = _full_results[[\"fold\", \"doc_num\", \"span\", \n", + " \"ent_type\", \"in_gold\", \"count\"]]\n", " return {\n", " \"models\": _models,\n", " \"summary_df\": _summary_df,\n", @@ -3655,7 +3509,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -3680,58 +3534,58 @@ " \n", " \n", " fold\n", - " doc_offset\n", + " doc_num\n", " span\n", " ent_type\n", - " gold\n", - " num_models\n", + " in_gold\n", + " count\n", " \n", " \n", " \n", " \n", - " 0\n", + " 4927\n", " train\n", - " 12\n", - " [11, 16): 'Saudi'\n", - " MISC\n", + " 907\n", + " [590, 598): 'Gorleben'\n", + " LOC\n", " True\n", " 17\n", " \n", " \n", - " 1\n", + " 4925\n", " train\n", - " 12\n", - " [59, 65): 'MANAMA'\n", + " 907\n", + " [63, 67): 'BONN'\n", " LOC\n", " True\n", " 17\n", " \n", " \n", - " 2\n", + " 4924\n", " train\n", - " 12\n", - " [86, 91): 'Saudi'\n", + " 907\n", + " [11, 17): 'German'\n", " MISC\n", " True\n", - " 14\n", + " 17\n", " \n", " \n", - " 3\n", + " 4923\n", " train\n", - " 12\n", - " [259, 264): 'Saudi'\n", - " MISC\n", + " 896\n", + " [523, 528): 'China'\n", + " LOC\n", " True\n", - " 13\n", + " 17\n", " \n", " \n", - " 4\n", + " 4922\n", " train\n", - " 12\n", - " [403, 412): 'One-month'\n", - " MISC\n", + " 896\n", + " [512, 518): 'Mexico'\n", + " LOC\n", " True\n", - " 9\n", + " 17\n", " \n", " \n", " ...\n", @@ -3743,86 +3597,86 @@ " ...\n", " \n", " \n", - " 13\n", - " test\n", - " 216\n", - " [20, 29): 'SHEFFIELD'\n", - " PER\n", - " False\n", - " 12\n", + " 271\n", + " dev\n", + " 93\n", + " [469, 481): 'JAKARTA POST'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", - " 14\n", - " test\n", - " 216\n", - " [127, 143): 'Sheffield Shield'\n", + " 183\n", + " dev\n", + " 76\n", + " [1285, 1312): 'Chicago Purchasing Managers'\n", " ORG\n", - " False\n", - " 3\n", + " True\n", + " 0\n", " \n", " \n", - " 15\n", - " test\n", - " 216\n", - " [166, 174): 'Tasmania'\n", - " LOC\n", - " False\n", - " 14\n", + " 126\n", + " dev\n", + " 49\n", + " [1920, 1925): 'Tajik'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", - " 16\n", - " test\n", - " 216\n", - " [179, 187): 'Victoria'\n", - " LOC\n", - " False\n", - " 14\n", + " 25\n", + " dev\n", + " 15\n", + " [109, 133): 'National Football League'\n", + " ORG\n", + " True\n", + " 0\n", " \n", " \n", " 17\n", - " test\n", - " 216\n", - " [20, 29): 'SHEFFIELD'\n", - " LOC\n", - " False\n", - " 1\n", + " dev\n", + " 15\n", + " [15, 40): 'AMERICAN FOOTBALL-RANDALL'\n", + " MISC\n", + " True\n", + " 0\n", " \n", " \n", "\n", - "
-    "<p>44800 rows × 6 columns</p>\n",
\n", + "
+    "<p>44802 rows × 6 columns</p>\n",
\n", "" ], "text/plain": [ - " fold doc_offset span ent_type gold \\\n", - "0 train 12 [11, 16): 'Saudi' MISC True \n", - "1 train 12 [59, 65): 'MANAMA' LOC True \n", - "2 train 12 [86, 91): 'Saudi' MISC True \n", - "3 train 12 [259, 264): 'Saudi' MISC True \n", - "4 train 12 [403, 412): 'One-month' MISC True \n", - ".. ... ... ... ... ... \n", - "13 test 216 [20, 29): 'SHEFFIELD' PER False \n", - "14 test 216 [127, 143): 'Sheffield Shield' ORG False \n", - "15 test 216 [166, 174): 'Tasmania' LOC False \n", - "16 test 216 [179, 187): 'Victoria' LOC False \n", - "17 test 216 [20, 29): 'SHEFFIELD' LOC False \n", + " fold doc_num span ent_type \\\n", + "4927 train 907 [590, 598): 'Gorleben' LOC \n", + "4925 train 907 [63, 67): 'BONN' LOC \n", + "4924 train 907 [11, 17): 'German' MISC \n", + "4923 train 896 [523, 528): 'China' LOC \n", + "4922 train 896 [512, 518): 'Mexico' LOC \n", + "... ... ... ... ... \n", + "271 dev 93 [469, 481): 'JAKARTA POST' ORG \n", + "183 dev 76 [1285, 1312): 'Chicago Purchasing Managers' ORG \n", + "126 dev 49 [1920, 1925): 'Tajik' MISC \n", + "25 dev 15 [109, 133): 'National Football League' ORG \n", + "17 dev 15 [15, 40): 'AMERICAN FOOTBALL-RANDALL' MISC \n", "\n", - " num_models \n", - "0 17 \n", - "1 17 \n", - "2 14 \n", - "3 13 \n", - "4 9 \n", - ".. ... \n", - "13 12 \n", - "14 3 \n", - "15 14 \n", - "16 14 \n", - "17 1 \n", + " in_gold count \n", + "4927 True 17 \n", + "4925 True 17 \n", + "4924 True 17 \n", + "4923 True 17 \n", + "4922 True 17 \n", + "... ... ... \n", + "271 True 0 \n", + "183 True 0 \n", + "126 True 0 \n", + "25 True 0 \n", + "17 True 0 \n", "\n", - "[44800 rows x 6 columns]" + "[44802 rows x 6 columns]" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -3842,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -3866,7 +3720,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " corpus_span\n", @@ -3882,12 +3736,12 @@ " \n", " \n", " \n", - " 0\n", + " 30\n", " 0\n", " dev\n", " 2\n", - " [25, 30): 'ASHES'\n", - " MISC\n", + " [760, 765): 'Leeds'\n", + " ORG\n", " \n", " \n", " \n", @@ -3897,12 +3751,12 @@ " \n", " \n", " \n", - " 3\n", + " 21\n", " 0\n", " dev\n", " 2\n", - " [87, 92): 'Ashes'\n", - " MISC\n", + " [614, 634): 'Duke of Norfolk's XI'\n", + " ORG\n", " \n", " \n", " \n", @@ -3927,12 +3781,12 @@ " \n", " \n", " \n", - " 13\n", + " 3\n", " 0\n", " dev\n", " 2\n", - " [614, 634): 'Duke of Norfolk's XI'\n", - " ORG\n", + " [87, 92): 'Ashes'\n", + " MISC\n", " \n", " \n", " \n", @@ -3942,12 +3796,12 @@ " \n", " \n", " \n", - " 19\n", + " 0\n", " 0\n", " dev\n", " 2\n", - " [760, 765): 'Leeds'\n", - " ORG\n", + " [25, 30): 'ASHES'\n", + " MISC\n", " \n", " \n", " \n", @@ -3972,12 +3826,12 @@ " ...\n", " \n", " \n", - " 21\n", + " 1738\n", " 17\n", " test\n", " 230\n", - " [1108, 1115): 'Germany'\n", - " LOC\n", + " [230, 238): 'Charlton'\n", + " PER\n", " \n", " \n", " \n", @@ -3987,12 +3841,12 @@ " \n", " \n", " \n", - " 23\n", + " 1737\n", " 17\n", " test\n", " 230\n", - " [1153, 1160): 'England'\n", - " LOC\n", + " [177, 187): 'Englishman'\n", + " MISC\n", " \n", " \n", " \n", @@ -4002,12 +3856,12 @@ " \n", " \n", " \n", - " 24\n", + " 1736\n", " 17\n", " test\n", " 230\n", - " [1213, 1225): 'Leeds United'\n", - " ORG\n", + " [135, 142): 'Ireland'\n", + " LOC\n", " \n", " \n", " \n", @@ -4017,12 +3871,12 @@ " \n", " \n", " \n", - " 25\n", + " 1735\n", " 17\n", " 
test\n", " 230\n", - " [1252, 1259): 'England'\n", - " LOC\n", + " [87, 100): 'Jack Charlton'\n", + " PER\n", " \n", " \n", " \n", @@ -4032,12 +3886,12 @@ " \n", " \n", " \n", - " 27\n", + " 1734\n", " 17\n", " test\n", " 230\n", - " [1395, 1400): 'Bobby'\n", - " PER\n", + " [69, 75): 'DUBLIN'\n", + " LOC\n", " \n", " \n", " \n", @@ -4052,49 +3906,49 @@ "" ], "text/plain": [ - " num_models fold doc_offset corpus_span \\\n", - "0 0 dev 2 [25, 30): 'ASHES' \n", - "3 0 dev 2 [87, 92): 'Ashes' \n", - "5 0 dev 2 [189, 218): 'Test and County Cricket Board' \n", - "13 0 dev 2 [614, 634): 'Duke of Norfolk's XI' \n", - "19 0 dev 2 [760, 765): 'Leeds' \n", - ".. ... ... ... ... \n", - "21 17 test 230 [1108, 1115): 'Germany' \n", - "23 17 test 230 [1153, 1160): 'England' \n", - "24 17 test 230 [1213, 1225): 'Leeds United' \n", - "25 17 test 230 [1252, 1259): 'England' \n", - "27 17 test 230 [1395, 1400): 'Bobby' \n", + " count fold doc_offset corpus_span \\\n", + "30 0 dev 2 [760, 765): 'Leeds' \n", + "21 0 dev 2 [614, 634): 'Duke of Norfolk's XI' \n", + "5 0 dev 2 [189, 218): 'Test and County Cricket Board' \n", + "3 0 dev 2 [87, 92): 'Ashes' \n", + "0 0 dev 2 [25, 30): 'ASHES' \n", + "... ... ... ... ... \n", + "1738 17 test 230 [230, 238): 'Charlton' \n", + "1737 17 test 230 [177, 187): 'Englishman' \n", + "1736 17 test 230 [135, 142): 'Ireland' \n", + "1735 17 test 230 [87, 100): 'Jack Charlton' \n", + "1734 17 test 230 [69, 75): 'DUBLIN' \n", "\n", - " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", - "0 MISC \n", - "3 MISC \n", - "5 ORG \n", - "13 ORG \n", - "19 ORG \n", - ".. ... ... ... ... ... \n", - "21 LOC \n", - "23 LOC \n", - "24 ORG \n", - "25 LOC \n", - "27 PER \n", + " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", + "30 ORG \n", + "21 ORG \n", + "5 ORG \n", + "3 MISC \n", + "0 MISC \n", + "... ... ... ... ... ... \n", + "1738 PER \n", + "1737 MISC \n", + "1736 LOC \n", + "1735 PER \n", + "1734 LOC \n", "\n", - " time_started time_stopped time_elapsed \n", - "0 \n", - "3 \n", - "5 \n", - "13 \n", - "19 \n", - ".. ... ... ... \n", - "21 \n", - "23 \n", - "24 \n", - "25 \n", - "27 \n", + " time_started time_stopped time_elapsed \n", + "30 \n", + "21 \n", + "5 \n", + "3 \n", + "0 \n", + "... ... ... ... 
\n", + "1738 \n", + "1737 \n", + "1736 \n", + "1735 \n", + "1734 \n", "\n", "[11590 rows x 12 columns]" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -4102,13 +3956,13 @@ "source": [ "# Reformat for output\n", "dev_and_test_results = all_results[all_results[\"fold\"].isin([\"dev\", \"test\"])]\n", - "in_gold_to_write, not_in_gold_to_write = util.csv_prep(dev_and_test_results, \"num_models\")\n", + "in_gold_to_write, not_in_gold_to_write = cleaning.analysis.csv_prep(dev_and_test_results, \"count\")\n", "in_gold_to_write" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -4132,7 +3986,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " model_span\n", @@ -4150,7 +4004,7 @@ " \n", " \n", " \n", - " 51\n", + " 29\n", " 17\n", " dev\n", " 2\n", @@ -4167,11 +4021,11 @@ " \n", " \n", " \n", - " 18\n", + " 25\n", " 17\n", " dev\n", " 6\n", - " [262, 267): 'Rotor'\n", + " [567, 572): 'Rotor'\n", " PER\n", " \n", " \n", @@ -4184,7 +4038,7 @@ " \n", " \n", " \n", - " 19\n", + " 20\n", " 17\n", " dev\n", " 6\n", @@ -4201,11 +4055,11 @@ " \n", " \n", " \n", - " 20\n", + " 16\n", " 17\n", " dev\n", " 6\n", - " [567, 572): 'Rotor'\n", + " [262, 267): 'Rotor'\n", " PER\n", " \n", " \n", @@ -4218,7 +4072,7 @@ " \n", " \n", " \n", - " 101\n", + " 142\n", " 17\n", " dev\n", " 11\n", @@ -4252,11 +4106,11 @@ " ...\n", " \n", " \n", - " 43\n", + " 1708\n", " 1\n", " test\n", " 228\n", - " [40, 43): 'SIX'\n", + " [771, 784): 'De Graafschap'\n", " ORG\n", " \n", " \n", @@ -4269,11 +4123,11 @@ " \n", " \n", " \n", - " 45\n", + " 1690\n", " 1\n", " test\n", " 228\n", - " [831, 845): 'Super Peasants'\n", + " [269, 287): 'Brazilian defender'\n", " MISC\n", " \n", " \n", @@ -4286,12 +4140,12 @@ " \n", " \n", " \n", - " 49\n", + " 1679\n", " 1\n", " test\n", " 228\n", - " [801, 811): 'Doetinchem'\n", - " MISC\n", + " [40, 43): 'SIX'\n", + " ORG\n", " \n", " \n", " \n", @@ -4303,7 +4157,7 @@ " \n", " \n", " \n", - " 30\n", + " 1724\n", " 1\n", " test\n", " 230\n", @@ -4320,7 +4174,7 @@ " \n", " \n", " \n", - " 31\n", + " 1727\n", " 1\n", " test\n", " 230\n", @@ -4342,49 +4196,49 @@ "" ], "text/plain": [ - " num_models fold doc_offset model_span \\\n", - "51 17 dev 2 [760, 765): 'Leeds' \n", - "18 17 dev 6 [262, 267): 'Rotor' \n", - "19 17 dev 6 [399, 404): 'Rotor' \n", - "20 17 dev 6 [567, 572): 'Rotor' \n", - "101 17 dev 11 [1961, 1975): 'Czech Republic' \n", - ".. ... ... ... ... \n", - "43 1 test 228 [40, 43): 'SIX' \n", - "45 1 test 228 [831, 845): 'Super Peasants' \n", - "49 1 test 228 [801, 811): 'Doetinchem' \n", - "30 1 test 230 [19, 29): 'ENGLISHMAN' \n", - "31 1 test 230 [19, 38): 'ENGLISHMAN CHARLTON' \n", + " count fold doc_offset model_span \\\n", + "29 17 dev 2 [760, 765): 'Leeds' \n", + "25 17 dev 6 [567, 572): 'Rotor' \n", + "20 17 dev 6 [399, 404): 'Rotor' \n", + "16 17 dev 6 [262, 267): 'Rotor' \n", + "142 17 dev 11 [1961, 1975): 'Czech Republic' \n", + "... ... ... ... ... \n", + "1708 1 test 228 [771, 784): 'De Graafschap' \n", + "1690 1 test 228 [269, 287): 'Brazilian defender' \n", + "1679 1 test 228 [40, 43): 'SIX' \n", + "1724 1 test 230 [19, 29): 'ENGLISHMAN' \n", + "1727 1 test 230 [19, 38): 'ENGLISHMAN CHARLTON' \n", "\n", - " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", - "51 LOC \n", - "18 PER \n", - "19 PER \n", - "20 PER \n", - "101 LOC \n", - ".. ... ... ... ... ... 
\n", - "43 ORG \n", - "45 MISC \n", - "49 MISC \n", - "30 LOC \n", - "31 PER \n", + " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", + "29 LOC \n", + "25 PER \n", + "20 PER \n", + "16 PER \n", + "142 LOC \n", + "... ... ... ... ... ... \n", + "1708 ORG \n", + "1690 MISC \n", + "1679 ORG \n", + "1724 LOC \n", + "1727 PER \n", "\n", - " correct_ent_type notes time_started time_stopped time_elapsed \n", - "51 \n", - "18 \n", - "19 \n", - "20 \n", - "101 \n", - ".. ... ... ... ... ... \n", - "43 \n", - "45 \n", - "49 \n", - "30 \n", - "31 \n", + " correct_ent_type notes time_started time_stopped time_elapsed \n", + "29 \n", + "25 \n", + "20 \n", + "16 \n", + "142 \n", + "... ... ... ... ... ... \n", + "1708 \n", + "1690 \n", + "1679 \n", + "1724 \n", + "1727 \n", "\n", "[4366 rows x 14 columns]" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -4395,7 +4249,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -4405,7 +4259,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -4429,7 +4283,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " corpus_span\n", @@ -4445,7 +4299,7 @@ " \n", " \n", " \n", - " 3\n", + " 1486\n", " 0\n", " train\n", " 6\n", @@ -4460,12 +4314,12 @@ " \n", " \n", " \n", - " 4\n", + " 1358\n", " 0\n", " train\n", " 24\n", - " [161, 169): 'Africans'\n", - " MISC\n", + " [384, 388): 'FLNC'\n", + " ORG\n", " \n", " \n", " \n", @@ -4475,12 +4329,12 @@ " \n", " \n", " \n", - " 7\n", + " 1355\n", " 0\n", " train\n", " 24\n", - " [384, 388): 'FLNC'\n", - " ORG\n", + " [161, 169): 'Africans'\n", + " MISC\n", " \n", " \n", " \n", @@ -4490,7 +4344,7 @@ " \n", " \n", " \n", - " 4\n", + " 1965\n", " 0\n", " train\n", " 25\n", @@ -4505,7 +4359,7 @@ " \n", " \n", " \n", - " 13\n", + " 1383\n", " 0\n", " train\n", " 28\n", @@ -4535,12 +4389,12 @@ " ...\n", " \n", " \n", - " 2\n", + " 4132\n", " 17\n", " train\n", " 945\n", - " [72, 79): 'English'\n", - " MISC\n", + " [130, 137): 'Preston'\n", + " ORG\n", " \n", " \n", " \n", @@ -4550,7 +4404,7 @@ " \n", " \n", " \n", - " 3\n", + " 4131\n", " 17\n", " train\n", " 945\n", @@ -4565,12 +4419,12 @@ " \n", " \n", " \n", - " 4\n", + " 4130\n", " 17\n", " train\n", " 945\n", - " [130, 137): 'Preston'\n", - " ORG\n", + " [72, 79): 'English'\n", + " MISC\n", " \n", " \n", " \n", @@ -4580,12 +4434,12 @@ " \n", " \n", " \n", - " 5\n", + " 4129\n", " 17\n", " train\n", " 945\n", - " [155, 162): 'Swansea'\n", - " ORG\n", + " [43, 49): 'LONDON'\n", + " LOC\n", " \n", " \n", " \n", @@ -4595,12 +4449,12 @@ " \n", " \n", " \n", - " 6\n", + " 4128\n", " 17\n", " train\n", " 945\n", - " [165, 172): 'Lincoln'\n", - " ORG\n", + " [19, 26): 'ENGLISH'\n", + " MISC\n", " \n", " \n", " \n", @@ -4615,44 +4469,44 @@ "" ], "text/plain": [ - " num_models fold doc_offset corpus_span \\\n", - "3 0 train 6 [121, 137): 'Toronto Dominion' \n", - "4 0 train 24 [161, 169): 'Africans' \n", - "7 0 train 24 [384, 388): 'FLNC' \n", - "4 0 train 25 [141, 151): 'mid-Norway' \n", - "13 0 train 28 [1133, 1135): 'EU' \n", - ".. ... ... ... ... 
\n", - "2 17 train 945 [72, 79): 'English' \n", - "3 17 train 945 [119, 127): 'Plymouth' \n", - "4 17 train 945 [130, 137): 'Preston' \n", - "5 17 train 945 [155, 162): 'Swansea' \n", - "6 17 train 945 [165, 172): 'Lincoln' \n", + " count fold doc_offset corpus_span \\\n", + "1486 0 train 6 [121, 137): 'Toronto Dominion' \n", + "1358 0 train 24 [384, 388): 'FLNC' \n", + "1355 0 train 24 [161, 169): 'Africans' \n", + "1965 0 train 25 [141, 151): 'mid-Norway' \n", + "1383 0 train 28 [1133, 1135): 'EU' \n", + "... ... ... ... ... \n", + "4132 17 train 945 [130, 137): 'Preston' \n", + "4131 17 train 945 [119, 127): 'Plymouth' \n", + "4130 17 train 945 [72, 79): 'English' \n", + "4129 17 train 945 [43, 49): 'LONDON' \n", + "4128 17 train 945 [19, 26): 'ENGLISH' \n", "\n", - " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", - "3 PER \n", - "4 MISC \n", - "7 ORG \n", - "4 MISC \n", - "13 ORG \n", - ".. ... ... ... ... ... \n", - "2 MISC \n", - "3 ORG \n", - "4 ORG \n", - "5 ORG \n", - "6 ORG \n", + " corpus_ent_type error_type correct_span correct_ent_type notes \\\n", + "1486 PER \n", + "1358 ORG \n", + "1355 MISC \n", + "1965 MISC \n", + "1383 ORG \n", + "... ... ... ... ... ... \n", + "4132 ORG \n", + "4131 ORG \n", + "4130 MISC \n", + "4129 LOC \n", + "4128 MISC \n", "\n", - " time_started time_stopped time_elapsed \n", - "3 \n", - "4 \n", - "7 \n", - "4 \n", - "13 \n", - ".. ... ... ... \n", - "2 \n", - "3 \n", - "4 \n", - "5 \n", - "6 \n", + " time_started time_stopped time_elapsed \n", + "1486 \n", + "1358 \n", + "1355 \n", + "1965 \n", + "1383 \n", + "... ... ... ... \n", + "4132 \n", + "4131 \n", + "4130 \n", + "4129 \n", + "4128 \n", "\n", "[23499 rows x 12 columns]" ] @@ -4665,13 +4519,13 @@ "source": [ "# Repeat for the contents of the original training set\n", "train_results = all_results[all_results[\"fold\"] == \"train\"]\n", - "in_gold_to_write, not_in_gold_to_write = util.csv_prep(train_results, \"num_models\")\n", + "in_gold_to_write, not_in_gold_to_write = cleaning.analysis.csv_prep(train_results, \"count\")\n", "in_gold_to_write" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -4695,7 +4549,7 @@ " \n", " \n", " \n", - " num_models\n", + " count\n", " fold\n", " doc_offset\n", " model_span\n", @@ -4713,7 +4567,7 @@ " \n", " \n", " \n", - " 8\n", + " 1738\n", " 17\n", " train\n", " 3\n", @@ -4730,7 +4584,7 @@ " \n", " \n", " \n", - " 13\n", + " 1485\n", " 17\n", " train\n", " 6\n", @@ -4747,7 +4601,7 @@ " \n", " \n", " \n", - " 10\n", + " 1964\n", " 17\n", " train\n", " 25\n", @@ -4764,11 +4618,11 @@ " \n", " \n", " \n", - " 67\n", + " 2022\n", " 17\n", " train\n", " 29\n", - " [454, 468): 'Phil Mickelson'\n", + " [762, 774): 'Mark O'Meara'\n", " PER\n", " \n", " \n", @@ -4781,11 +4635,11 @@ " \n", " \n", " \n", - " 68\n", + " 1996\n", " 17\n", " train\n", " 29\n", - " [762, 774): 'Mark O'Meara'\n", + " [454, 468): 'Phil Mickelson'\n", " PER\n", " \n", " \n", @@ -4815,12 +4669,12 @@ " ...\n", " \n", " \n", - " 42\n", + " 4416\n", " 1\n", " train\n", " 943\n", - " [25, 41): 'SAN MARINO GRAND'\n", - " LOC\n", + " [25, 46): 'SAN MARINO GRAND PRIX'\n", + " PER\n", " \n", " \n", " \n", @@ -4832,12 +4686,12 @@ " \n", " \n", " \n", - " 60\n", + " 4461\n", " 1\n", " train\n", " 944\n", - " [11, 15): 'GOLF'\n", - " LOC\n", + " [25, 32): 'MASTERS'\n", + " MISC\n", " \n", " \n", " \n", @@ -4849,7 +4703,7 @@ " \n", " \n", " \n", - " 62\n", + " 4462\n", " 1\n", " train\n", " 944\n", @@ -4866,7 +4720,7 @@ 
" \n", " \n", " \n", - " 63\n", + " 4463\n", " 1\n", " train\n", " 944\n", @@ -4883,12 +4737,12 @@ " \n", " \n", " \n", - " 64\n", + " 4458\n", " 1\n", " train\n", " 944\n", - " [25, 32): 'MASTERS'\n", - " MISC\n", + " [11, 15): 'GOLF'\n", + " LOC\n", " \n", " \n", " \n", @@ -4901,50 +4755,50 @@ " \n", " \n", "\n", - "
-    "<p>5345 rows × 14 columns</p>\n",
\n", + "
+    "<p>5347 rows × 14 columns</p>\n",
\n", "" ], "text/plain": [ - " num_models fold doc_offset model_span \\\n", - "8 17 train 3 [0, 10): '-DOCSTART-' \n", - "13 17 train 6 [121, 137): 'Toronto Dominion' \n", - "10 17 train 25 [141, 151): 'mid-Norway' \n", - "67 17 train 29 [454, 468): 'Phil Mickelson' \n", - "68 17 train 29 [762, 774): 'Mark O'Meara' \n", - ".. ... ... ... ... \n", - "42 1 train 943 [25, 41): 'SAN MARINO GRAND' \n", - "60 1 train 944 [11, 15): 'GOLF' \n", - "62 1 train 944 [25, 32): 'MASTERS' \n", - "63 1 train 944 [17, 32): 'BRITISH MASTERS' \n", - "64 1 train 944 [25, 32): 'MASTERS' \n", + " count fold doc_offset model_span \\\n", + "1738 17 train 3 [0, 10): '-DOCSTART-' \n", + "1485 17 train 6 [121, 137): 'Toronto Dominion' \n", + "1964 17 train 25 [141, 151): 'mid-Norway' \n", + "2022 17 train 29 [762, 774): 'Mark O'Meara' \n", + "1996 17 train 29 [454, 468): 'Phil Mickelson' \n", + "... ... ... ... ... \n", + "4416 1 train 943 [25, 46): 'SAN MARINO GRAND PRIX' \n", + "4461 1 train 944 [25, 32): 'MASTERS' \n", + "4462 1 train 944 [25, 32): 'MASTERS' \n", + "4463 1 train 944 [17, 32): 'BRITISH MASTERS' \n", + "4458 1 train 944 [11, 15): 'GOLF' \n", "\n", - " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", - "8 LOC \n", - "13 LOC \n", - "10 LOC \n", - "67 PER \n", - "68 PER \n", - ".. ... ... ... ... ... \n", - "42 LOC \n", - "60 LOC \n", - "62 PER \n", - "63 LOC \n", - "64 MISC \n", + " model_ent_type error_type corpus_span corpus_ent_type correct_span \\\n", + "1738 LOC \n", + "1485 LOC \n", + "1964 LOC \n", + "2022 PER \n", + "1996 PER \n", + "... ... ... ... ... ... \n", + "4416 PER \n", + "4461 MISC \n", + "4462 PER \n", + "4463 LOC \n", + "4458 LOC \n", "\n", - " correct_ent_type notes time_started time_stopped time_elapsed \n", - "8 \n", - "13 \n", - "10 \n", - "67 \n", - "68 \n", - ".. ... ... ... ... ... \n", - "42 \n", - "60 \n", - "62 \n", - "63 \n", - "64 \n", + " correct_ent_type notes time_started time_stopped time_elapsed \n", + "1738 \n", + "1485 \n", + "1964 \n", + "2022 \n", + "1996 \n", + "... ... ... ... ... ... \n", + "4416 \n", + "4461 \n", + "4462 \n", + "4463 \n", + "4458 \n", "\n", - "[5345 rows x 14 columns]" + "[5347 rows x 14 columns]" ] }, "execution_count": 28, @@ -4958,7 +4812,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -4990,7 +4844,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.8.10" } }, "nbformat": 4,