diff --git a/notebooks/Read_conllu_Files.ipynb b/notebooks/Read_conllu_Files.ipynb
new file mode 100644
index 00000000..0705b2ab
--- /dev/null
+++ b/notebooks/Read_conllu_Files.ipynb
@@ -0,0 +1,2911 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "38b14610-2ac6-43dc-a3ca-2f5533190bd8",
+   "metadata": {},
+   "source": [
+    "\n",
+    " Read_conllu_files.ipynb: Read and parse information from diverse .conllu files, and use integrations with other libraries to apply the data efficiently\n",
+    " \n",
+    "\n",
+    "## Introduction\n",
+    "\n",
+    "This notebook demonstrates how diverse .conllu files can be imported, converted, and worked with using the open source library [Text Extensions for Pandas](https://github.com/CODAIT/text-extensions-for-pandas). This library uses [Pandas](https://pandas.pydata.org/) DataFrames as its primary data storage format and integrates with several different NLP libraries, such as [SpaCy](https://spacy.io) and [Huggingface Transformers](https://huggingface.co/transformers/). \n",
+    "\n",
+    "Here we show how these features can be used together to import the data, select subsets of it, display sentence structure information, and finally retokenize the corpus and train a classifier model on it. \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "32709961-2eae-4a62-a7de-84072c1ab7d5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import feather\n",
+    "import sklearn.pipeline\n",
+    "import sklearn.linear_model\n",
+    "import transformers\n",
+    "\n",
+    "# And of course we need the text_extensions_for_pandas library itself.\n",
+    "try:\n",
+    "    import text_extensions_for_pandas as tp\n",
+    "except ModuleNotFoundError as e:\n",
+    "    # If we're running from within the project source tree and the parent Python\n",
+    "    # environment doesn't have the text_extensions_for_pandas package, use the\n",
+    "    # version in the local source tree.\n",
+    "    if not os.getcwd().endswith(\"notebooks\"):\n",
+    "        raise e\n",
+    "    if \"..\" not in sys.path:\n",
+    "        sys.path.insert(0, \"..\")\n",
+    "    import text_extensions_for_pandas as tp\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a9f78f9f-ec2c-4a85-8b0f-9b114642633e",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Loading files\n",
+    "There are several sub-flavors of .conllu files, including those used in the EWT, OntoNotes, Universal Dependencies, and CoNLL 2009 corpora. Text Extensions is designed to take advantage of the common features of .conllu files while still accepting these varied formats. \n",
+    "\n",
+    "When importing this file type, we\n",
+    "1. Translate the raw words into Token Dtypes\n",
+    "1. Preserve the dependencies between tokens as represented in the `head` and `deprel` columns\n",
+    "1. Capture any conllu metadata written into the file\n",
+    "1. Allow for CoNLL 2009 and OntoNotes style predicate and predicate-argument representations\n",
+    "1. Capture each token's sentence \n",
+    "1. Allow the user to choose how sub-tokens are handled\n",
+    "\n",
+    "\n",
+    "First, though, we must load the datasets we will be using for this demo notebook. \n",
+    "\n",
+    "In the following cell, we use the facilities of Text Extensions for Pandas to download a copy of the [Universal Dependencies EWT data set](https://github.com/UniversalDependencies/UD_English-EWT) and the [Trial section of the CoNLL 2009 dataset](https://ufal.mff.cuni.cz/conll2009-st/trial-data.html). \n",
+    "**Make sure that you adhere to the terms under which they are licensed when using them.** \n",
+    "\n",
+    "Then we read them in and display them in the document. Notice how different the information stored in each dataset is. Note that in this specific example we drop a few columns from each dataset for brevity; remove the `.drop()` calls to show the full set of columns. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ccd19ec4-8480-4f00-9e34-d18f90f210a1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# init file locations, and download data if necessary. \n",
+    "BASE_DIR = 'CoNLL_u_test_inputs/'\n",
+    "FEATHER_FILE = \"conllu_database.feather\"\n",
+    "\n",
+    "ewt_base_url = \"https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-\"\n",
+    "ewt_dev_url = ewt_base_url + 'dev.conllu'\n",
+    "conll_09_test_data_url = 'https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip'\n",
+    "\n",
+    "# allows us to re-start from saved points\n",
+    "corpus_df = None "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2c71e57d-811e-4cd0-84f4-efeefe127697",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# download the files if they have not already been downloaded \n",
+    "conll_09_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, conll_09_test_data_url)\n",
+    "conllu_ewt_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_dev_url)\n",
+    "\n",
+    "# if you already have access to the full CoNLL 2009 dataset, name the file accordingly and uncomment this line \n",
+    "# conll_09_path = BASE_DIR + 'CoNLL2009-ST-evaluation-English.conllu'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a036fb68-3441-4c7f-bc4a-2b23bdf6875d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Conll 09 format .conllu document:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanLEMMAPOSFEATheadDEPRELPREDpredicatepred0argpred1argpred2argpred3argpred4argpred5argpred6argpred7argpred8argpred9argpred10arg
0[0, 3): 'The'theDTNone1NMODNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1[4, 11): 'economy'economyNNNone3NMODNoneA1NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
2[11, 13): ''s''sPOSNone1SUFFIXNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
3[14, 25): 'temperature'temperatureNNNone4SBJtemperature.01A2A1NoneNoneNoneNoneNoneNoneNoneNoneNoneNone
4[26, 30): 'will'willMDNone-1ROOTNoneNoneAM-MODNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " span LEMMA POS FEAT head DEPREL \\\n", + "0 [0, 3): 'The' the DT None 1 NMOD \n", + "1 [4, 11): 'economy' economy NN None 3 NMOD \n", + "2 [11, 13): ''s' 's POS None 1 SUFFIX \n", + "3 [14, 25): 'temperature' temperature NN None 4 SBJ \n", + "4 [26, 30): 'will' will MD None -1 ROOT \n", + "\n", + " PRED predicate pred0arg pred1arg pred2arg pred3arg pred4arg \\\n", + "0 None None None None None None None \n", + "1 None A1 None None None None None \n", + "2 None None None None None None None \n", + "3 temperature.01 A2 A1 None None None None \n", + "4 None None AM-MOD None None None None \n", + "\n", + " pred5arg pred6arg pred7arg pred8arg pred9arg pred10arg \n", + "0 None None None None None None \n", + "1 None None None None None None \n", + "2 None None None None None None \n", + "3 None None None None None None \n", + "4 None None None None None None " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EWT format .conllu document:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanlemmaupostagxpostagfeaturesheaddepreldepsmiscsentence_idparagraph_iddoc_idline_num
0[0, 4): 'From'fromADPINNone2case3:caseNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...4
1[5, 8): 'the'theDETDTDefinite=Def|PronType=Art2det3:detNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...5
2[9, 11): 'AP'APPROPNNNPNumber=Sing3obl4:obl:fromNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...6
3[12, 17): 'comes'comeVERBVBZMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...-1root0:rootNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...7
4[18, 22): 'this'thisDETDTNumber=Sing|PronType=Dem5det6:detNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...8
5[23, 28): 'story'storyNOUNNNNumber=Sing3nsubj4:nsubjNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...9
6[28, 29): ':':PUNCT:None3punct4:punctNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...10
7[30, 39): 'President'PresidentPROPNNNPNumber=Sing11nsubj5:nsubjNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...15
8[40, 44): 'Bush'BushPROPNNNPNumber=Sing7flat1:flatNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...16
9[45, 47): 'on'onADPINNone10case4:caseNoneweblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...weblog-blogspot.com_nominations_20041117172713...17
\n", + "
" + ], + "text/plain": [ + " span lemma upostag xpostag \\\n", + "0 [0, 4): 'From' from ADP IN \n", + "1 [5, 8): 'the' the DET DT \n", + "2 [9, 11): 'AP' AP PROPN NNP \n", + "3 [12, 17): 'comes' come VERB VBZ \n", + "4 [18, 22): 'this' this DET DT \n", + "5 [23, 28): 'story' story NOUN NN \n", + "6 [28, 29): ':' : PUNCT : \n", + "7 [30, 39): 'President' President PROPN NNP \n", + "8 [40, 44): 'Bush' Bush PROPN NNP \n", + "9 [45, 47): 'on' on ADP IN \n", + "\n", + " features head deprel deps \\\n", + "0 None 2 case 3:case \n", + "1 Definite=Def|PronType=Art 2 det 3:det \n", + "2 Number=Sing 3 obl 4:obl:from \n", + "3 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... -1 root 0:root \n", + "4 Number=Sing|PronType=Dem 5 det 6:det \n", + "5 Number=Sing 3 nsubj 4:nsubj \n", + "6 None 3 punct 4:punct \n", + "7 Number=Sing 11 nsubj 5:nsubj \n", + "8 Number=Sing 7 flat 1:flat \n", + "9 None 10 case 4:case \n", + "\n", + " misc sentence_id \\\n", + "0 None weblog-blogspot.com_nominations_20041117172713... \n", + "1 None weblog-blogspot.com_nominations_20041117172713... \n", + "2 None weblog-blogspot.com_nominations_20041117172713... \n", + "3 None weblog-blogspot.com_nominations_20041117172713... \n", + "4 None weblog-blogspot.com_nominations_20041117172713... \n", + "5 None weblog-blogspot.com_nominations_20041117172713... \n", + "6 None weblog-blogspot.com_nominations_20041117172713... \n", + "7 None weblog-blogspot.com_nominations_20041117172713... \n", + "8 None weblog-blogspot.com_nominations_20041117172713... \n", + "9 None weblog-blogspot.com_nominations_20041117172713... \n", + "\n", + " paragraph_id \\\n", + "0 weblog-blogspot.com_nominations_20041117172713... \n", + "1 weblog-blogspot.com_nominations_20041117172713... \n", + "2 weblog-blogspot.com_nominations_20041117172713... \n", + "3 weblog-blogspot.com_nominations_20041117172713... \n", + "4 weblog-blogspot.com_nominations_20041117172713... \n", + "5 weblog-blogspot.com_nominations_20041117172713... \n", + "6 weblog-blogspot.com_nominations_20041117172713... \n", + "7 weblog-blogspot.com_nominations_20041117172713... \n", + "8 weblog-blogspot.com_nominations_20041117172713... \n", + "9 weblog-blogspot.com_nominations_20041117172713... \n", + "\n", + " doc_id line_num \n", + "0 weblog-blogspot.com_nominations_20041117172713... 4 \n", + "1 weblog-blogspot.com_nominations_20041117172713... 5 \n", + "2 weblog-blogspot.com_nominations_20041117172713... 6 \n", + "3 weblog-blogspot.com_nominations_20041117172713... 7 \n", + "4 weblog-blogspot.com_nominations_20041117172713... 8 \n", + "5 weblog-blogspot.com_nominations_20041117172713... 9 \n", + "6 weblog-blogspot.com_nominations_20041117172713... 10 \n", + "7 weblog-blogspot.com_nominations_20041117172713... 15 \n", + "8 weblog-blogspot.com_nominations_20041117172713... 16 \n", + "9 weblog-blogspot.com_nominations_20041117172713... 17 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import two very different documents, both in the conllu file format. 
\n", + "\n", + "# by default we look for EWT style column names, \n", + "# so we have to define a new set for this specific conll09 format\n", + "conll_09_cols = [\"LEMMA\",\"PLEMMA\",'POS','PPOS','FEAT','PFEAT','head','phead','DEPREL','PDEPREL','FILLPRED','PRED']\n", + "\n", + "conll_09_docs = tp.io.conll.conll_u_to_dataframes(conll_09_path,column_names=conll_09_cols)\n", + "#now just filter,and display the document \n", + "conll_09_doc = conll_09_docs[0].drop(columns=[\"PLEMMA\",'PPOS','PFEAT','phead','PDEPREL','FILLPRED','sentence','line_num'])\n", + "print(\"Conll 09 format .conllu document:\")\n", + "display(conll_09_doc.head())\n", + "\n", + "\n", + "#simultaneously, we can import an ewt style document, and display it with the same function\n", + "conll_u_docs = tp.io.conll.conll_u_to_dataframes(conllu_ewt_path)\n", + "#display \n", + "DOC_NUM = 0\n", + "doc_df = conll_u_docs[DOC_NUM]\n", + "# here we drop the sentence argument for brevity.\n", + "print(\"EWT format .conllu document:\")\n", + "doc_df.head(10).drop(columns = [\"sentence\"])" + ] + }, + { + "cell_type": "markdown", + "id": "4a42bb64-5ac6-4270-b764-6faa991347a2", + "metadata": { + "tags": [] + }, + "source": [ + "## Combining documents and saving as a .feather file. \n", + "\n", + "Something that is often useful is to store multiple documents from a corpus in one single dataset. Here we do that, then make an adjustment to keep the `'head'` column of our database pointing at the correct elements.\n", + "\n", + "\n", + "Next we quickly write then reread our document as a `.feather` file in its. manipulated state. Because this is serialized, writing and reading is significantly faster than writing to a raw `.conllu` format. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "153306f1-5b99-459d-8049-fa9a28f7dfee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "size is 25151\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanlemmaupostagheaddeprel
0[0, 4): 'From'fromADP2.0case
1[5, 8): 'the'theDET2.0det
2[9, 11): 'AP'APPROPN3.0obl
3[12, 17): 'comes'comeVERB-1.0root
4[18, 22): 'this'thisDET5.0det
..................
25146[251, 254): 'and'andCCONJ25150.0cc
25147[255, 256): 'a'aDET25150.0det
25148[257, 261): 'very'veryADV25149.0advmod
25149[262, 275): 'knowledgeable'knowledgeableADJ25150.0amod
25150[276, 281): 'staff'staffNOUN25145.0conj
\n", + "

25151 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " span lemma upostag head deprel\n", + "0 [0, 4): 'From' from ADP 2.0 case\n", + "1 [5, 8): 'the' the DET 2.0 det\n", + "2 [9, 11): 'AP' AP PROPN 3.0 obl\n", + "3 [12, 17): 'comes' come VERB -1.0 root\n", + "4 [18, 22): 'this' this DET 5.0 det\n", + "... ... ... ... ... ...\n", + "25146 [251, 254): 'and' and CCONJ 25150.0 cc\n", + "25147 [255, 256): 'a' a DET 25150.0 det\n", + "25148 [257, 261): 'very' very ADV 25149.0 advmod\n", + "25149 [262, 275): 'knowledgeable' knowledgeable ADJ 25150.0 amod\n", + "25150 [276, 281): 'staff' staff NOUN 25145.0 conj\n", + "\n", + "[25151 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# because we are concatenating our dataframes, we need to modify the \"head\" feilds to still point at their desired targets \n", + "df_starts_at =0\n", + "temp = conll_u_docs.copy()\n", + "for df in temp:\n", + " df['head'] = df['head'].apply(lambda i: i +df_starts_at if i!= -1 else -1)\n", + " df_starts_at += df.shape[0]\n", + "\n", + "# Now concatenate all our documents into one big dataframe\n", + "complete_df = temp[0]\n", + "complete_df = complete_df.append(temp[1:], ignore_index=True)\n", + "\n", + "#show the last few rows of the dataframe, select just a few columns for compactness\n", + "print(f\"size is {complete_df.shape[0]}\")\n", + "complete_df[[\"span\",\"lemma\",\"upostag\",\"head\",\"deprel\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6540975b-14a6-4187-98a4-9993f5f2d1cc", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs\n", + "Wall time: 6.91 µs\n", + "File written to CoNLL_u_test_inputs/conllu_database.feather\n" + ] + } + ], + "source": [ + "# one advantage of using pandas dataframes is that we can write and read them signifcantly faster than we could the raw conllu files \n", + "# here we use pyarrow with feather to save and reload our dataframe. \n", + "\n", + "# Currently writing multi document files is not supported, so we will have to use a workaround, \n", + "# by converting sentences from TokenSpanArrays to SpanArrays\n", + "complete_df[\"sentence\"] = tp.SpanArray(complete_df[\"span\"].array.target_text, complete_df[\"sentence\"].array.begin, complete_df[\"sentence\"].array.end)\n", + "\n", + "#finally write to file using feather \n", + "path = BASE_DIR +FEATHER_FILE\n", + "# increase the chunksize slightly, to allow writing in a single block\n", + "# time to show how fast Feather actually is \n", + "%time\n", + "feather.write_dataframe(complete_df, path,chunksize= 65536*8)\n", + "print(f\"File written to {path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ade30665-3298-46ad-ab24-0da753c23067", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2 µs, sys: 1 µs, total: 3 µs\n", + "Wall time: 5.01 µs\n", + "size is 25151\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanlemmaupostagheaddeprel
25146[251, 254): 'and'andCCONJ25150.0cc
25147[255, 256): 'a'aDET25150.0det
25148[257, 261): 'very'veryADV25149.0advmod
25149[262, 275): 'knowledgeable'knowledgeableADJ25150.0amod
25150[276, 281): 'staff'staffNOUN25145.0conj
\n", + "
" + ], + "text/plain": [ + " span lemma upostag head deprel\n", + "25146 [251, 254): 'and' and CCONJ 25150.0 cc\n", + "25147 [255, 256): 'a' a DET 25150.0 det\n", + "25148 [257, 261): 'very' very ADV 25149.0 advmod\n", + "25149 [262, 275): 'knowledgeable' knowledgeable ADJ 25150.0 amod\n", + "25150 [276, 281): 'staff' staff NOUN 25145.0 conj" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now we can read this df and continue operating on it as before. Time the read operation \n", + "%time \n", + "re_read_df = feather.read_dataframe(path)\n", + "print(f\"size is {re_read_df.shape[0]}\")\n", + "# show the same subset of the dataframe as above \n", + "re_read_df.tail()[[\"span\",\"lemma\",\"upostag\",\"head\",\"deprel\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "a290f829-688b-4d82-af90-7e168fa5808d", + "metadata": { + "tags": [] + }, + "source": [ + "## Show sentence parse trees using pandas data manipulation, and SpaCy integrations\n", + "Because of the integrations built into Text extensions, we can use powerful data visualization tools here we're leveraging spaCy's dependency tree visualization tools, to show the parse tree as specified in the raw conllu file. \n", + "\n", + "First, we use Pandas groupby to to quickly select the n'th sentence in the dataset, and store it as its own dataframe and display selected columns \n", + "\n", + "Then we use Spacy to render the parse tree of that specific sentence, as found in the raw data. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f1f41cb8-a833-4e84-ab9b-d0d5a7408e00", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spanlemmaupostagxpostagheaddeprelsentence
2510[979, 982): 'And'andCCONJCC2514.0cc[979, 1014): 'And what do we get for this effo...
2511[983, 987): 'what'whatPRONWP2514.0obj[979, 1014): 'And what do we get for this effo...
2512[988, 990): 'do'doAUXVBP2514.0aux[979, 1014): 'And what do we get for this effo...
2513[991, 993): 'we'wePRONPRP2514.0nsubj[979, 1014): 'And what do we get for this effo...
2514[994, 997): 'get'getVERBVB-1.0root[979, 1014): 'And what do we get for this effo...
2515[998, 1001): 'for'forADPIN2517.0case[979, 1014): 'And what do we get for this effo...
2516[1002, 1006): 'this'thisDETDT2517.0det[979, 1014): 'And what do we get for this effo...
2517[1007, 1013): 'effort'effortNOUNNN2514.0obl[979, 1014): 'And what do we get for this effo...
2518[1013, 1014): '?'?PUNCT.2514.0punct[979, 1014): 'And what do we get for this effo...
\n", + "
" + ], + "text/plain": [ + " span lemma upostag xpostag head deprel \\\n", + "2510 [979, 982): 'And' and CCONJ CC 2514.0 cc \n", + "2511 [983, 987): 'what' what PRON WP 2514.0 obj \n", + "2512 [988, 990): 'do' do AUX VBP 2514.0 aux \n", + "2513 [991, 993): 'we' we PRON PRP 2514.0 nsubj \n", + "2514 [994, 997): 'get' get VERB VB -1.0 root \n", + "2515 [998, 1001): 'for' for ADP IN 2517.0 case \n", + "2516 [1002, 1006): 'this' this DET DT 2517.0 det \n", + "2517 [1007, 1013): 'effort' effort NOUN NN 2514.0 obl \n", + "2518 [1013, 1014): '?' ? PUNCT . 2514.0 punct \n", + "\n", + " sentence \n", + "2510 [979, 1014): 'And what do we get for this effo... \n", + "2511 [979, 1014): 'And what do we get for this effo... \n", + "2512 [979, 1014): 'And what do we get for this effo... \n", + "2513 [979, 1014): 'And what do we get for this effo... \n", + "2514 [979, 1014): 'And what do we get for this effo... \n", + "2515 [979, 1014): 'And what do we get for this effo... \n", + "2516 [979, 1014): 'And what do we get for this effo... \n", + "2517 [979, 1014): 'And what do we get for this effo... \n", + "2518 [979, 1014): 'And what do we get for this effo... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " And\n", + " CCONJ\n", + "\n", + "\n", + "\n", + " what\n", + " PRON\n", + "\n", + "\n", + "\n", + " do\n", + " AUX\n", + "\n", + "\n", + "\n", + " we\n", + " PRON\n", + "\n", + "\n", + "\n", + " get\n", + " VERB\n", + "\n", + "\n", + "\n", + " for\n", + " ADP\n", + "\n", + "\n", + "\n", + " this\n", + " DET\n", + "\n", + "\n", + "\n", + " effort\n", + " NOUN\n", + "\n", + "\n", + "\n", + " ?\n", + " PUNCT\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " cc\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " obj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " aux\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " case\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " obl\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " punct\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "Sentence_num = 110\n", + "\n", + "# use pandas to quickly select the 'n'th sentence in the dataset \n", + "nth_sentence = list(re_read_df.groupby(\"sentence_id\",sort=False))[Sentence_num][1]\n", + "display(nth_sentence[[\"span\",\"lemma\",\"upostag\",\"xpostag\",\"head\",\"deprel\",\"sentence\"]])\n", + "\n", + "# now use spacy integration to rendeer the parse tree\n", + "tp.io.spacy.render_parse_tree(nth_sentence,tag_col=\"upostag\",label_col=\"deprel\",head_col=\"head\")" + ] + }, + { + "cell_type": "markdown", + "id": "0fc00e06-b2fe-47f0-9f0a-20d0b8370807", + "metadata": {}, + "source": [ + "# Train a classifier model\n", + "\n", + "Now use more text extensions integrations, with *transformers* to quickly and easily train a part of speech classifier model using bert embeddings on our data. We loosely follow the same process as is used in the [Model_Training_with_BERT](./Model_Training_with_BERT.ipynb) demo, notebook so check there for a more indepth explanation of each step.\n", + "\n", + "Broadly, what we do is: \n", + "1. Import all the folds of the dataset we're using (Universal dependencies EWT) \n", + "1. 
Create a Pandas Categorical datatype over which to classify\n",
+    "1. Retokenize that dataset into BERT-compatible tokens using Huggingface Transformers\n",
+    "1. Correlate the new tokens with their original counterparts' parts of speech\n",
+    "1. Create the BERT embeddings for each sub-token\n",
+    "1. Convert the part-of-speech tags to our categorical datatype\n",
+    "1. Initialize and train a sklearn model mapping BERT embeddings to parts of speech\n",
+    "1. Use that model to perform inference on our dataset\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "20f20214-62d9-4f7b-bd4f-fcd377148aad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "converted fold: 'test' to list of 316 dataframes\n",
+      "converted fold: 'dev' to list of 318 dataframes\n",
+      "converted fold: 'train' to list of 540 dataframes\n"
+     ]
+    }
+   ],
+   "source": [
+    "# We're going to need the whole EWT dataset for this: download all three folds and parse them in \n",
+    "fold_paths = {\"test\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"test.conllu\"),\n",
+    "              \"dev\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"dev.conllu\"),\n",
+    "              \"train\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"train.conllu\")}\n",
+    "fold_docs = {}\n",
+    "for fold, fold_path in fold_paths.items():\n",
+    "    fold_docs[fold] = tp.io.conll.conll_u_to_dataframes(fold_path)\n",
+    "    print(f\"converted fold: '{fold}' to list of {len(fold_docs[fold])} dataframes\")\n",
+    "    # uncomment to display segments of the extracted folds \n",
+    "    # display(fold_docs[fold][0].head()[['span','lemma','upostag','features','sentence']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cade0734-2845-4b30-8472-59f103f44bd9",
+   "metadata": {},
+   "source": [
+    "### Initialize elements for preprocessing steps\n",
+    "Instantiate a pretrained tokenizer and BERT model from the transformers library, and create a Pandas categorical datatype for the parts of speech"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "2cf1b9d6-b2cb-45fe-b724-529b7bb75d3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bert_model_name = \"dslim/bert-base-NER\"\n",
+    "tokenizer = transformers.BertTokenizerFast.from_pretrained(bert_model_name)\n",
+    "bert = transformers.BertModel.from_pretrained(bert_model_name)\n",
+    "\n",
+    "# we also want a pandas categorical dtype for what we want to predict: part of speech. \n",
+    "# use the combined df, because it has all the elements \n",
+    "upostags_list = list(re_read_df[\"upostag\"].unique())\n",
+    "# upostag_dtype,upostag_list,upostag_dict = tp.io.conll.make_iob_tag_categories(upostags)\n",
+    "upostag_dtype = pd.CategoricalDtype(categories = upostags_list)\n",
+    "upostag_dict = {upostags_list[i]: i for i in range(len(upostags_list))}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f10fdc34-613d-47e1-8287-c33384da69ba",
+   "metadata": {},
+   "source": [
+    "## Preprocess the documents\n",
+    "\n",
+    "Because steps 3-6 can only be done on a document-by-document basis, we wrap them in a single method and then run it over the whole corpus. Note that this step is computationally intensive, so it may take a few minutes to run.\n",
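+    "\n",
+    "If you only want to smoke-test the rest of the notebook, one option (a hypothetical shortcut, not part of the original workflow) is to slice each fold down to a handful of documents and loop over that smaller dictionary instead of `fold_docs` in the next cell:\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical shortcut: keep only the first few documents per fold for a quick dry run\n",
+    "small_fold_docs = {fold: docs[:5] for fold, docs in fold_docs.items()}\n",
+    "```"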
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3fe4b27e-4f8f-4dab-ad42-ca04bcb3234c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing fold test\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a6b4ed4fe82a499584fd2b25dfcb1254", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=316, style=ProgressStyle(desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing fold dev\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7e4d211c2807462799823e5d9ce486d4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=318, style=ProgressStyle(desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing fold train\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fbfd56ed94943bc9c1268fe37a9b278", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=540, style=ProgressStyle(desc…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# make a method to take care of preprocessing steps: 3-6\n", + "def preprocess_document(document, tokenizer,bert):\n", + " # create BERT compatible tokens using our tokenizer\n", + " temp = tp.io.bert.make_bert_tokens(document.loc[0,'span'].target_text, tokenizer)\n", + " # re-correlate our original spans with their bert-compatible equivalents\n", + " spans = tp.TokenSpanArray.align_to_tokens(temp[\"span\"],document[\"span\"])\n", + " \n", + " # now carry over some features from the old spans to the new onesspans_df = spans.as_frame().drop(columns = [\"begin\",\"end\"])\n", + " spans_df = spans.as_frame().drop(columns = ['begin','end','covered_text'])\n", + " spans_df['postag'] = document['upostag']\n", + " printed = 20\n", + " for i,b_tok,e_tok,pos in spans_df.itertuples():\n", + " temp.loc[b_tok:e_tok-1 , [\"postag\",\"raw_span\",'raw_span_id']] = pos,spans[i],i\n", + " \n", + " # now translate from text tags to postag \n", + " temp['postag'].fillna('X',inplace=True) # in our Labels, 'X' is a standin for \"N/A\" so convert N/A's to 'X'\n", + " temp[\"postag_id\"] = temp['postag'].apply(lambda t: int(upostag_dict[str(t)]))\n", + " temp = temp.astype({'postag_id':'int','postag':upostag_dtype})\n", + " return tp.io.bert.add_embeddings(temp, bert)\n", + "\n", + "\n", + "# preprocess the whole corpus: \n", + "bert_docs_by_fold = {}\n", + "for fold in fold_docs.keys():\n", + " docs = fold_docs[fold]\n", + " print(f\"processing fold {fold}\")\n", + " bert_docs_by_fold[fold] = tp.jupyter.run_with_progress_bar(len(docs),lambda i: preprocess_document(docs[i],tokenizer,bert))" + ] + }, + { + "cell_type": "markdown", + "id": "b843badb-33c5-4bd3-a05c-ef2159adfa43", + "metadata": {}, + "source": [ + "## Checkpoint: save preprocessed 
data \n",
+    "\n",
+    "Because the last step was time-intensive, we combine all the documents and save them as a `.feather` file, so that we can restart from here if need be. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ec08895b-98e7-43cc-8138-412893982019",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folddoc_numtoken_idspaninput_idtoken_type_idattention_maskspecial_tokens_maskpostagraw_spanraw_span_idpostag_idembeddingtext
0test00[0, 0): ''10101TrueXNaNNaN14[ -0.37686592, -0.14841378, 0.73980016, ...
1test01[0, 4): 'What'132701FalsePRON[0, 4): 'What'0.011[ -0.23266968, -0.40546328, 0.6171929, ...What
2test02[5, 7): 'if'119101FalseSCONJ[5, 7): 'if'1.013[ -0.8156859, -0.04782569, 0.081484295, ...if
3test03[8, 14): 'Google'798601FalsePROPN[8, 14): 'Google'2.02[ 0.78967804, -0.8511879, -0.48812625, ...Google
4test04[15, 17): 'Mo'1255601FalseVERB[15, 22): 'Morphed'3.03[ -0.25935018, 0.5710723, -0.09106647, ...Mo
\n", + "
" + ], + "text/plain": [ + " fold doc_num token_id span input_id token_type_id \\\n", + "0 test 0 0 [0, 0): '' 101 0 \n", + "1 test 0 1 [0, 4): 'What' 1327 0 \n", + "2 test 0 2 [5, 7): 'if' 1191 0 \n", + "3 test 0 3 [8, 14): 'Google' 7986 0 \n", + "4 test 0 4 [15, 17): 'Mo' 12556 0 \n", + "\n", + " attention_mask special_tokens_mask postag raw_span \\\n", + "0 1 True X NaN \n", + "1 1 False PRON [0, 4): 'What' \n", + "2 1 False SCONJ [5, 7): 'if' \n", + "3 1 False PROPN [8, 14): 'Google' \n", + "4 1 False VERB [15, 22): 'Morphed' \n", + "\n", + " raw_span_id postag_id embedding \\\n", + "0 NaN 14 [ -0.37686592, -0.14841378, 0.73980016, ... \n", + "1 0.0 11 [ -0.23266968, -0.40546328, 0.6171929, ... \n", + "2 1.0 13 [ -0.8156859, -0.04782569, 0.081484295, ... \n", + "3 2.0 2 [ 0.78967804, -0.8511879, -0.48812625, ... \n", + "4 3.0 3 [ -0.25935018, 0.5710723, -0.09106647, ... \n", + "\n", + " text \n", + "0 \n", + "1 What \n", + "2 if \n", + "3 Google \n", + "4 Mo " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# combine folds and save to a feather file, so we don't necessarily need to redo the preprocessing. \n", + "corpus_df = tp.io.conll.combine_folds(bert_docs_by_fold)\n", + "corpus_df[\"text\"] = corpus_df[\"span\"].apply(lambda s: s.covered_text)\n", + "cols_to_drop = [c for c in corpus_df.columns if \"span\" in c]\n", + "corpus_df.drop(columns=cols_to_drop).to_feather(\"outputs/conll_u_corpus.feather\")\n", + "corpus_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "63efdd32-5458-4dcb-91f3-dca97a7d1772", + "metadata": {}, + "outputs": [], + "source": [ + "# re-read feather document if need be: \n", + "if corpus_df is None or corpus_df.size == 0:\n", + " corpus_df = pd.read_feather(\"outputs/conll_u_corpus.feather\")\n", + " corpus_df" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0bfa7afc-6510-4b5f-9ff4-4b184f00e352", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folddoc_numtoken_idspaninput_idtoken_type_idattention_maskspecial_tokens_maskpostagraw_spanraw_span_idpostag_idembeddingtext
64729train00[0, 0): ''10101TrueXNaNNaN14[ -0.41927838, -0.22575253, 0.6648760...
64730train01[0, 2): 'Al'258601FalsePROPN[0, 2): 'Al'0.02[ -0.36961424, -1.0804733, -0.283367...Al
64731train02[2, 3): '-'11801FalsePUNCT[2, 3): '-'1.05[ -0.9178737, -0.94624436, -0.808995...-
64732train03[4, 5): 'Z'16301FalsePROPN[4, 9): 'Zaman'2.02[ -0.90530086, -0.97086835, -1.440879...Z
64733train04[5, 9): 'aman'1985301FalsePROPN[4, 9): 'Zaman'2.02[ -1.1586123, -1.149766, -1.194975...aman
.............................................
307892train539756[3152, 3154): 'my'113901FalsePRON[3152, 3154): 'my'690.011[ -0.06984596, -0.4646067, 0.8547705...my
307893train539757[3155, 3158): 'car'161001FalseNOUN[3155, 3158): 'car'691.04[ 0.14624132, -0.46386197, 0.596684...car
307894train539758[3158, 3159): ')'11401FalsePUNCT[3158, 3159): ')'692.05[ -0.090651065, -0.29592788, 0.597023...)
307895train539759[3159, 3160): '.'11901FalsePUNCT[3159, 3160): '.'693.05[ 0.031023545, -0.27608734, 0.782190....
307896train539760[0, 0): ''10201TrueXNaNNaN14[ -0.5088702, -0.22885968, 0.544944...
\n", + "

243168 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " fold doc_num token_id span input_id \\\n", + "64729 train 0 0 [0, 0): '' 101 \n", + "64730 train 0 1 [0, 2): 'Al' 2586 \n", + "64731 train 0 2 [2, 3): '-' 118 \n", + "64732 train 0 3 [4, 5): 'Z' 163 \n", + "64733 train 0 4 [5, 9): 'aman' 19853 \n", + "... ... ... ... ... ... \n", + "307892 train 539 756 [3152, 3154): 'my' 1139 \n", + "307893 train 539 757 [3155, 3158): 'car' 1610 \n", + "307894 train 539 758 [3158, 3159): ')' 114 \n", + "307895 train 539 759 [3159, 3160): '.' 119 \n", + "307896 train 539 760 [0, 0): '' 102 \n", + "\n", + " token_type_id attention_mask special_tokens_mask postag \\\n", + "64729 0 1 True X \n", + "64730 0 1 False PROPN \n", + "64731 0 1 False PUNCT \n", + "64732 0 1 False PROPN \n", + "64733 0 1 False PROPN \n", + "... ... ... ... ... \n", + "307892 0 1 False PRON \n", + "307893 0 1 False NOUN \n", + "307894 0 1 False PUNCT \n", + "307895 0 1 False PUNCT \n", + "307896 0 1 True X \n", + "\n", + " raw_span raw_span_id postag_id \\\n", + "64729 NaN NaN 14 \n", + "64730 [0, 2): 'Al' 0.0 2 \n", + "64731 [2, 3): '-' 1.0 5 \n", + "64732 [4, 9): 'Zaman' 2.0 2 \n", + "64733 [4, 9): 'Zaman' 2.0 2 \n", + "... ... ... ... \n", + "307892 [3152, 3154): 'my' 690.0 11 \n", + "307893 [3155, 3158): 'car' 691.0 4 \n", + "307894 [3158, 3159): ')' 692.0 5 \n", + "307895 [3159, 3160): '.' 693.0 5 \n", + "307896 NaN NaN 14 \n", + "\n", + " embedding text \n", + "64729 [ -0.41927838, -0.22575253, 0.6648760... \n", + "64730 [ -0.36961424, -1.0804733, -0.283367... Al \n", + "64731 [ -0.9178737, -0.94624436, -0.808995... - \n", + "64732 [ -0.90530086, -0.97086835, -1.440879... Z \n", + "64733 [ -1.1586123, -1.149766, -1.194975... aman \n", + "... ... ... \n", + "307892 [ -0.06984596, -0.4646067, 0.8547705... my \n", + "307893 [ 0.14624132, -0.46386197, 0.596684... car \n", + "307894 [ -0.090651065, -0.29592788, 0.597023... ) \n", + "307895 [ 0.031023545, -0.27608734, 0.782190... . \n", + "307896 [ -0.5088702, -0.22885968, 0.544944... \n", + "\n", + "[243168 rows x 14 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now get ready to train our model: \n", + "train_df = corpus_df[corpus_df[\"fold\"] == \"train\"]\n", + "train_df = train_df.astype({'postag_id':'int'}, copy=False)\n", + "train_df" + ] + }, + { + "cell_type": "markdown", + "id": "ac4fbd6f-eecc-40a1-aa60-bfdd41546af0", + "metadata": {}, + "source": [ + "## Train the model\n", + "Use a sklearn pipeline to train a multinomial regression model ontop of Bert embeddings to predict Part of speech\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "034a61f3-7fe8-4b02-b649-cea67e14ceb3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "/Users/zacharyeichenberger/anaconda3/envs/pd/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 15.6min remaining: 0.0s\n", + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 15.6min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "Pipeline(steps=[('mlogreg',\n", + " LogisticRegression(max_iter=1000, multi_class='multinomial',\n", + " verbose=10))])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now actually train a model, using sklearn \n", + "MULTI_CLASS= \"multinomial\"\n", + "\n", + "# How many iterations to run the BGFS optimizer when fitting logistic\n", + "# regression models. 100 ==> Fast; 10000 ==> Full convergence\n", + "LBGFS_ITERATIONS = 1000\n", + "\n", + "base_pipeline = sklearn.pipeline.Pipeline([\n", + " # Standard scaler. This only makes a difference for certain classes\n", + " # of embeddings.\n", + " #(\"scaler\", sklearn.preprocessing.StandardScaler()),\n", + " (\"mlogreg\", sklearn.linear_model.LogisticRegression(\n", + " multi_class=MULTI_CLASS,\n", + " verbose=10,\n", + " max_iter=LBGFS_ITERATIONS\n", + " ))\n", + "])\n", + "\n", + "X_train = train_df[\"embedding\"].values\n", + "Y_train = train_df[\"postag_id\"]\n", + "base_model = base_pipeline.fit(X_train, Y_train)\n", + "base_model" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "eebcc759-bd0e-497e-8cbe-99708d9da66c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "saved\n" + ] + } + ], + "source": [ + "# pickle model so I don't need to re-fit it every time\n", + "import pickle \n", + "\n", + "load_from_file= False\n", + "pickle_model_file = \"conllu_pos_classifier.pickle\"\n", + "\n", + "if not load_from_file:\n", + " with open(BASE_DIR+ pickle_model_file, 'wb') as file: \n", + " pickle.dump(base_model,file)\n", + " print(\"saved\")\n", + "else: \n", + " with open(BASE_DIR+ pickle_model_file, 'rb') as file: \n", + " base_model = pickle.load(file)\n", + " print(\"loaded\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "ff14187f-ad0b-4fdf-acbe-eadc8acfaffe", + "metadata": {}, + "source": [ + "### Use the model to run inference on the test set of the data " + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "bf936b74-2f15-4180-8778-2ccac83c9ee7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folddoc_numtoken_idspaninput_idtoken_type_idattention_maskspecial_tokens_maskpostagraw_spanraw_span_idpostag_idembeddingtextp_idp_postagraw_output
0test00[0, 0): ''10101TrueXNaNNaN14[ -0.37686592, -0.14841378, 0.7398001...14X[ 3.681475919382054e-11, 8.766155854203454e-1...
1test01[0, 4): 'What'132701FalsePRON[0, 4): 'What'0.011[ -0.23266968, -0.40546328, 0.617192...What5PUNCT[ 4.480117969135689e-05, 0.000492260661933639...
2test02[5, 7): 'if'119101FalseSCONJ[5, 7): 'if'1.013[ -0.8156859, -0.04782569, 0.08148429...if13SCONJ[ 0.00458089489431613, 1.0102614181540655e-0...
3test03[8, 14): 'Google'798601FalsePROPN[8, 14): 'Google'2.02[ 0.78967804, -0.8511879, -0.4881262...Google2PROPN[2.0128060688355368e-13, 4.3664010704307723e-1...
4test04[15, 17): 'Mo'1255601FalseVERB[15, 22): 'Morphed'3.03[ -0.25935018, 0.5710723, -0.0910664...Mo2PROPN[ 0.004772281895284574, 3.990804066047649e-0...
5test05[17, 19): 'rp'1561501FalseVERB[15, 22): 'Morphed'3.03[ -0.3267119, -0.10905984, 0.053087...rp2PROPN[ 4.133346131920443e-14, 3.0715492927999484e-0...
6test06[19, 22): 'hed'896101FalseVERB[15, 22): 'Morphed'3.03[ -0.9018082, -0.16881262, 0.4379902...hed3VERB[ 0.0003547861848056146, 2.0943022199837429e-1...
7test07[23, 27): 'Into'1400001FalseADP[23, 27): 'Into'4.00[ 0.09566124, -0.109931074, -0.1493219...Into0ADP[ 0.98593362749934, 2.223312204453196e-1...
8test08[28, 34): 'Google'798601FalsePROPN[28, 36): 'GoogleOS'5.02[ -1.2022994, -0.29254374, 0.2236384...Google2PROPN[ 8.802423148364236e-22, 9.098724786449631e-2...
9test09[34, 36): 'OS'902501FalsePROPN[28, 36): 'GoogleOS'5.02[ -0.78180003, -0.20742358, -1.288184...OS2PROPN[ 6.859296955972293e-14, 1.3584745823663452e-1...
10test010[36, 37): '?'13601FalsePUNCT[36, 37): '?'6.05[ -0.34068698, -0.4208277, 0.674408...?5PUNCT[ 4.419303709134792e-06, 2.2879619521678678e-0...
11test011[38, 42): 'What'132701FalsePRON[38, 42): 'What'7.011[ -0.39101043, -0.33632284, 0.6353156...What13SCONJ[ 1.089099756150613e-05, 4.301330361203966e-0...
12test012[43, 45): 'if'119101FalseSCONJ[43, 45): 'if'8.013[ -0.68665487, -0.16331403, 0.2546722...if13SCONJ[ 0.00056745829301006, 1.0575813370745938e-0...
13test013[46, 52): 'Google'798601FalsePROPN[46, 52): 'Google'9.02[ 0.57027435, -0.9182296, -0.1871779...Google2PROPN[ 5.256030568823519e-06, 0.000141057983398473...
14test014[53, 61): 'expanded'363101FalseVERB[53, 61): 'expanded'10.03[ -0.48126522, -0.1581611, 0.4039635...expanded3VERB[ 4.050874025777886e-07, 1.9460546573814385e-1...
15test015[62, 64): 'on'111301FalseADP[62, 64): 'on'11.00[ -0.17011818, -0.37733135, 0.745948...on0ADP[ 0.994939087225643, 1.8480376986013427e-0...
16test016[65, 68): 'its'115701FalsePRON[65, 68): 'its'12.011[ -0.34582123, -0.3814539, 0.539305...its11PRON[ 0.14933916780906895, 1.9661816857805298e-0...
17test017[69, 75): 'search'340301FalseNOUN[69, 75): 'search'13.04[ -0.1650713, -0.54526025, 0.648461...search4NOUN[ 5.089421239766719e-07, 5.0023756350866345e-0...
18test018[75, 76): '-'11801FalsePUNCT[75, 76): '-'14.05[ -0.16116095, -0.44251364, 0.7121795...-5PUNCT[ 0.0004927920586926724, 6.359739270183003e-0...
19test019[77, 83): 'engine'239501FalseNOUN[77, 83): 'engine'15.04[ -0.35368297, -0.47415957, 0.4551175...engine4NOUN[ 1.888161183325207e-08, 1.225134189182207e-1...
\n", + "
" + ], + "text/plain": [ + " fold doc_num token_id span input_id token_type_id \\\n", + "0 test 0 0 [0, 0): '' 101 0 \n", + "1 test 0 1 [0, 4): 'What' 1327 0 \n", + "2 test 0 2 [5, 7): 'if' 1191 0 \n", + "3 test 0 3 [8, 14): 'Google' 7986 0 \n", + "4 test 0 4 [15, 17): 'Mo' 12556 0 \n", + "5 test 0 5 [17, 19): 'rp' 15615 0 \n", + "6 test 0 6 [19, 22): 'hed' 8961 0 \n", + "7 test 0 7 [23, 27): 'Into' 14000 0 \n", + "8 test 0 8 [28, 34): 'Google' 7986 0 \n", + "9 test 0 9 [34, 36): 'OS' 9025 0 \n", + "10 test 0 10 [36, 37): '?' 136 0 \n", + "11 test 0 11 [38, 42): 'What' 1327 0 \n", + "12 test 0 12 [43, 45): 'if' 1191 0 \n", + "13 test 0 13 [46, 52): 'Google' 7986 0 \n", + "14 test 0 14 [53, 61): 'expanded' 3631 0 \n", + "15 test 0 15 [62, 64): 'on' 1113 0 \n", + "16 test 0 16 [65, 68): 'its' 1157 0 \n", + "17 test 0 17 [69, 75): 'search' 3403 0 \n", + "18 test 0 18 [75, 76): '-' 118 0 \n", + "19 test 0 19 [77, 83): 'engine' 2395 0 \n", + "\n", + " attention_mask special_tokens_mask postag raw_span \\\n", + "0 1 True X NaN \n", + "1 1 False PRON [0, 4): 'What' \n", + "2 1 False SCONJ [5, 7): 'if' \n", + "3 1 False PROPN [8, 14): 'Google' \n", + "4 1 False VERB [15, 22): 'Morphed' \n", + "5 1 False VERB [15, 22): 'Morphed' \n", + "6 1 False VERB [15, 22): 'Morphed' \n", + "7 1 False ADP [23, 27): 'Into' \n", + "8 1 False PROPN [28, 36): 'GoogleOS' \n", + "9 1 False PROPN [28, 36): 'GoogleOS' \n", + "10 1 False PUNCT [36, 37): '?' \n", + "11 1 False PRON [38, 42): 'What' \n", + "12 1 False SCONJ [43, 45): 'if' \n", + "13 1 False PROPN [46, 52): 'Google' \n", + "14 1 False VERB [53, 61): 'expanded' \n", + "15 1 False ADP [62, 64): 'on' \n", + "16 1 False PRON [65, 68): 'its' \n", + "17 1 False NOUN [69, 75): 'search' \n", + "18 1 False PUNCT [75, 76): '-' \n", + "19 1 False NOUN [77, 83): 'engine' \n", + "\n", + " raw_span_id postag_id embedding \\\n", + "0 NaN 14 [ -0.37686592, -0.14841378, 0.7398001... \n", + "1 0.0 11 [ -0.23266968, -0.40546328, 0.617192... \n", + "2 1.0 13 [ -0.8156859, -0.04782569, 0.08148429... \n", + "3 2.0 2 [ 0.78967804, -0.8511879, -0.4881262... \n", + "4 3.0 3 [ -0.25935018, 0.5710723, -0.0910664... \n", + "5 3.0 3 [ -0.3267119, -0.10905984, 0.053087... \n", + "6 3.0 3 [ -0.9018082, -0.16881262, 0.4379902... \n", + "7 4.0 0 [ 0.09566124, -0.109931074, -0.1493219... \n", + "8 5.0 2 [ -1.2022994, -0.29254374, 0.2236384... \n", + "9 5.0 2 [ -0.78180003, -0.20742358, -1.288184... \n", + "10 6.0 5 [ -0.34068698, -0.4208277, 0.674408... \n", + "11 7.0 11 [ -0.39101043, -0.33632284, 0.6353156... \n", + "12 8.0 13 [ -0.68665487, -0.16331403, 0.2546722... \n", + "13 9.0 2 [ 0.57027435, -0.9182296, -0.1871779... \n", + "14 10.0 3 [ -0.48126522, -0.1581611, 0.4039635... \n", + "15 11.0 0 [ -0.17011818, -0.37733135, 0.745948... \n", + "16 12.0 11 [ -0.34582123, -0.3814539, 0.539305... \n", + "17 13.0 4 [ -0.1650713, -0.54526025, 0.648461... \n", + "18 14.0 5 [ -0.16116095, -0.44251364, 0.7121795... \n", + "19 15.0 4 [ -0.35368297, -0.47415957, 0.4551175... \n", + "\n", + " text p_id p_postag raw_output \n", + "0 14 X [ 3.681475919382054e-11, 8.766155854203454e-1... \n", + "1 What 5 PUNCT [ 4.480117969135689e-05, 0.000492260661933639... \n", + "2 if 13 SCONJ [ 0.00458089489431613, 1.0102614181540655e-0... \n", + "3 Google 2 PROPN [2.0128060688355368e-13, 4.3664010704307723e-1... \n", + "4 Mo 2 PROPN [ 0.004772281895284574, 3.990804066047649e-0... \n", + "5 rp 2 PROPN [ 4.133346131920443e-14, 3.0715492927999484e-0... 
\n", + "6 hed 3 VERB [ 0.0003547861848056146, 2.0943022199837429e-1... \n", + "7 Into 0 ADP [ 0.98593362749934, 2.223312204453196e-1... \n", + "8 Google 2 PROPN [ 8.802423148364236e-22, 9.098724786449631e-2... \n", + "9 OS 2 PROPN [ 6.859296955972293e-14, 1.3584745823663452e-1... \n", + "10 ? 5 PUNCT [ 4.419303709134792e-06, 2.2879619521678678e-0... \n", + "11 What 13 SCONJ [ 1.089099756150613e-05, 4.301330361203966e-0... \n", + "12 if 13 SCONJ [ 0.00056745829301006, 1.0575813370745938e-0... \n", + "13 Google 2 PROPN [ 5.256030568823519e-06, 0.000141057983398473... \n", + "14 expanded 3 VERB [ 4.050874025777886e-07, 1.9460546573814385e-1... \n", + "15 on 0 ADP [ 0.994939087225643, 1.8480376986013427e-0... \n", + "16 its 11 PRON [ 0.14933916780906895, 1.9661816857805298e-0... \n", + "17 search 4 NOUN [ 5.089421239766719e-07, 5.0023756350866345e-0... \n", + "18 - 5 PUNCT [ 0.0004927920586926724, 6.359739270183003e-0... \n", + "19 engine 4 NOUN [ 1.888161183325207e-08, 1.225134189182207e-1... " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def infer_on_df(df: pd.DataFrame, id_to_class_dict, predictor):\n", + " result_df = df.copy()\n", + " raw_outputs = tp.TensorArray(predictor.predict_proba(result_df[\"embedding\"]))\n", + " result_df[\"p_id\"] = np.argmax(raw_outputs, axis=1)\n", + " result_df[\"p_postag\"]= result_df[\"p_id\"].apply(lambda p_id: id_to_class_dict[p_id])\n", + " result_df[\"raw_output\"] = raw_outputs\n", + " return result_df\n", + "\n", + "test_results = infer_on_df(corpus_df[corpus_df[\"fold\"] == \"test\"],upostags_list,base_model)\n", + "test_results.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "a2634300-ea5f-4424-9c65-120143a66444", + "metadata": {}, + "source": [ + "## Now look at the data we've made and aggregate it to calculate F1 scores \n", + "\n", + "First, aggregate by raw surface token, to get the 'real' token predictions from the bert-ified values To do this we multiply the probabilities for each subtoken\n", + "\n", + "Then compare with existing labels " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "b57b7d79-2d5f-450e-8cfd-3f6e3e78b799", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folddoc_numpostagpostag_idp_postag_idpredicted_postag
raw_span
[0, 4): 'What'test0PRON115PUNCT
[5, 7): 'if'test0SCONJ1313SCONJ
[8, 14): 'Google'test0PROPN22PROPN
[15, 22): 'Morphed'test0VERB32PROPN
[23, 27): 'Into'test0ADP00ADP
.....................
[307, 309): 'of'test2ADP00ADP
[310, 313): 'the'test2DET11DET
[314, 319): 'pic's'test2NOUN44NOUN
[319, 320): '.'test2PUNCT55PUNCT
[321, 324): 'One'test2NUM66NUM
\n", + "

200 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " fold doc_num postag postag_id p_postag_id \\\n", + "raw_span \n", + "[0, 4): 'What' test 0 PRON 11 5 \n", + "[5, 7): 'if' test 0 SCONJ 13 13 \n", + "[8, 14): 'Google' test 0 PROPN 2 2 \n", + "[15, 22): 'Morphed' test 0 VERB 3 2 \n", + "[23, 27): 'Into' test 0 ADP 0 0 \n", + "... ... ... ... ... ... \n", + "[307, 309): 'of' test 2 ADP 0 0 \n", + "[310, 313): 'the' test 2 DET 1 1 \n", + "[314, 319): 'pic's' test 2 NOUN 4 4 \n", + "[319, 320): '.' test 2 PUNCT 5 5 \n", + "[321, 324): 'One' test 2 NUM 6 6 \n", + "\n", + " predicted_postag \n", + "raw_span \n", + "[0, 4): 'What' PUNCT \n", + "[5, 7): 'if' SCONJ \n", + "[8, 14): 'Google' PROPN \n", + "[15, 22): 'Morphed' PROPN \n", + "[23, 27): 'Into' ADP \n", + "... ... \n", + "[307, 309): 'of' ADP \n", + "[310, 313): 'the' DET \n", + "[314, 319): 'pic's' NOUN \n", + "[319, 320): '.' PUNCT \n", + "[321, 324): 'One' NUM \n", + "\n", + "[200 rows x 6 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def agg_outputs(series: pd.Series):\n", + " return series.to_numpy().prod(axis=0).argmax()\n", + " \n", + "test_raw_preds = test_results.groupby(\"raw_span\").agg({\"fold\":'first', \"doc_num\": 'first','postag':'first','postag_id':'first','raw_output': agg_outputs}).rename(columns= {'raw_output':'p_postag_id'}).sort_values([\"fold\",\"doc_num\",'raw_span'])\n", + "test_raw_preds['predicted_postag'] = test_raw_preds[\"p_postag_id\"].apply(lambda p_id: upostags_list[p_id]) \n", + "test_raw_preds.head(200)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "29d7dce2-a624-48df-8332-bcec682a4478", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ADJ 0.793 0.783 0.788 1782\n", + " ADP 0.916 0.919 0.917 2030\n", + " ADV 0.788 0.749 0.768 1147\n", + " AUX 0.939 0.952 0.946 1509\n", + " CCONJ 0.971 0.961 0.966 738\n", + " DET 0.959 0.958 0.958 1898\n", + " INTJ 0.876 0.708 0.783 120\n", + " NOUN 0.864 0.893 0.878 4136\n", + " NUM 0.839 0.893 0.865 541\n", + " PART 0.943 0.938 0.940 630\n", + " PRON 0.967 0.965 0.966 2158\n", + " PROPN 0.844 0.831 0.837 1985\n", + " PUNCT 0.987 0.963 0.975 3098\n", + " SCONJ 0.852 0.828 0.840 443\n", + " SYM 0.646 0.604 0.624 106\n", + " VERB 0.906 0.901 0.904 2640\n", + " X 0.482 0.686 0.566 137\n", + "\n", + " accuracy 0.899 25098\n", + " macro avg 0.857 0.855 0.854 25098\n", + "weighted avg 0.900 0.899 0.899 25098\n", + "\n" + ] + } + ], + "source": [ + "from sklearn import metrics\n", + "# calculate precision, recall, and f1 score for each pos\n", + "print(metrics.classification_report(test_raw_preds['postag'], test_raw_preds['predicted_postag'], digits=3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce504db1-fba7-4962-8aac-5bc61aad8ef1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/text_extensions_for_pandas/io/conll.py b/text_extensions_for_pandas/io/conll.py index a387c514..a9c1acf7 100644 --- a/text_extensions_for_pandas/io/conll.py +++ 
b/text_extensions_for_pandas/io/conll.py
@@ -27,6 +27,7 @@
 import regex
 import requests
 import os
+from zipfile import ZipFile
 
 from text_extensions_for_pandas.array.span import SpanArray, SpanDtype
 from text_extensions_for_pandas.array.token_span import (
@@ -748,7 +749,7 @@ def _doc_to_df(
             if val is not None:
                 points_to = int(val)
                 meta_lists["head"][i] = (
-                    points_to + sentence_begin_token - 1 if points_to != 0 else None
+                    points_to + sentence_begin_token - 1 if points_to != 0 else -1
                 )
 
     begins = np.concatenate(begins_list)
@@ -767,7 +768,8 @@ def _doc_to_df(
     ret["sentence"] = sentence_spans
     ret["line_num"] = pd.Series(doc_line_nums)
     if conll_u and "head" in column_names:
-        ret = ret.astype({"head": "int32"}, errors="ignore")
+        ret = ret.astype({"head": "Int64"}, errors="ignore")
+        ret.loc[ret['head'] == -1, 'head'] = pd.NA
     return ret
 
 
@@ -1296,6 +1298,63 @@ def download_file(url, destination):
     return {"train": _TRAIN_FILE, "dev": _DEV_FILE, "test": _TEST_FILE}
 
 
+def maybe_download_dataset_data(
+    target_dir: str, document_url: str, alternate_name: str = None
+) -> Union[str, List[str]]:
+    """
+    If the file at document_url is not already present in the target directory,
+    download it and save it there, then return the path to the file.
+    If a zip archive is downloaded, only files that are not already in the target
+    directory will be extracted, and if an alternate_name is given only that file will be operated on.
+    Note that if a zip archive is downloaded it will be unpacked, so verify that the url being used is safe.
+
+    :param target_dir: Directory where this function should write the document
+    :param document_url: url from which to download the document. If no alternate name is specified,
+     it is assumed that the string after the last slash is the name of the file.
+    :param alternate_name: if given, the name of the file that is checked in the target directory,
+     as well as what is used to save the file if no such file is found. If a zip file is downloaded, and a file of this
+     name exists in the archive, only it will be extracted.
+
+    :returns: the path to the file (or a list of paths when a multi-file archive is unpacked),
+     or None if downloading was not successful
+    """
+    file_name = (
+        alternate_name if alternate_name is not None else document_url.split("/")[-1]
+    )
+    full_path = target_dir + file_name
+
+    # special logic for zip files
+    if document_url.split(".")[-1] == "zip" and (
+        alternate_name is None or not os.path.exists(full_path)
+    ):
+        # fetch the archive itself first, if it is not already on disk
+        zip_path = target_dir + document_url.split("/")[-1]
+        if not os.path.exists(zip_path):
+            data = requests.get(document_url)
+            open(zip_path, "wb").write(data.content)
+        with ZipFile(zip_path, "r") as zipf:
+            fnames = zipf.namelist()
+            if alternate_name is not None and alternate_name in fnames:
+                zipf.extract(alternate_name, target_dir)
+                return full_path
+            for fname in fnames:
+                if not os.path.exists(target_dir + fname):
+                    zipf.extract(fname, target_dir)
+            if len(fnames) == 1:
+                full_path = target_dir + fnames[0]
+            else:
+                return [target_dir + fname for fname in fnames]
+
+    # regular logic
+    elif not os.path.exists(full_path):
+        try:
+            data = requests.get(document_url)
+            open(full_path, "wb").write(data.content)
+        except Exception:
+            return None
+    return full_path
+
+
 def _prep_for_stacking(fold_name: str, doc_num: int, df: pd.DataFrame) -> pd.DataFrame:
     """
     Subroutine of combine_folds()
diff --git a/text_extensions_for_pandas/io/test_conll.py b/text_extensions_for_pandas/io/test_conll.py
index ca6e01bc..7350a084 100644
--- a/text_extensions_for_pandas/io/test_conll.py
+++ b/text_extensions_for_pandas/io/test_conll.py
@@ -257,98 +257,98 @@ def test_conll_u_to_dataframes(self):
         # in when regenerating this string!
textwrap.dedent( """\ - span lemma upostag xpostag \\ - 0 [0, 6): 'Google' Google PROPN NNP - 1 [7, 10): 'has' have AUX VBZ - 2 [11, 18): 'finally' finally ADV RB - 3 [19, 22): 'had' have VERB VBN - 4 [23, 25): 'an' a DET DT - .. ... ... ... ... - 161 [776, 777): 'a' a DET DT - 162 [778, 787): 'punchline' punchline NOUN NN - 163 [787, 788): ',' , PUNCT , - 164 [789, 792): 'too' too ADV RB - 165 [792, 793): '.' . PUNCT . - - features head deprel \\ - 0 Number=Sing 3.0 nsubj - 1 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... 3.0 aux - 2 None 3.0 advmod - 3 Tense=Past|VerbForm=Part NaN root - 4 Definite=Ind|PronType=Art 6.0 det - .. ... ... ... - 161 Definite=Ind|PronType=Art 162.0 det - 162 Number=Sing 160.0 nsubj - 163 None 160.0 punct - 164 None 160.0 advmod - 165 None 154.0 punct - - deps misc \\ - 0 4:nsubj None - 1 4:aux None - 2 4:advmod None - 3 0:root None - 4 7:det None - .. ... ... - 161 9:det None - 162 7:nsubj SpaceAfter=No - 163 7:punct None - 164 7:advmod SpaceAfter=No - 165 1:punct None - - sentence_id \\ - 0 weblog-blogspot.com_marketview_20050210075500_... - 1 weblog-blogspot.com_marketview_20050210075500_... - 2 weblog-blogspot.com_marketview_20050210075500_... - 3 weblog-blogspot.com_marketview_20050210075500_... - 4 weblog-blogspot.com_marketview_20050210075500_... - .. ... - 161 weblog-blogspot.com_marketview_20050210075500_... - 162 weblog-blogspot.com_marketview_20050210075500_... - 163 weblog-blogspot.com_marketview_20050210075500_... - 164 weblog-blogspot.com_marketview_20050210075500_... - 165 weblog-blogspot.com_marketview_20050210075500_... - - paragraph_id \\ - 0 weblog-blogspot.com_marketview_20050210075500_... - 1 weblog-blogspot.com_marketview_20050210075500_... - 2 weblog-blogspot.com_marketview_20050210075500_... - 3 weblog-blogspot.com_marketview_20050210075500_... - 4 weblog-blogspot.com_marketview_20050210075500_... - .. ... - 161 weblog-blogspot.com_marketview_20050210075500_... - 162 weblog-blogspot.com_marketview_20050210075500_... - 163 weblog-blogspot.com_marketview_20050210075500_... - 164 weblog-blogspot.com_marketview_20050210075500_... - 165 weblog-blogspot.com_marketview_20050210075500_... - - doc_id \\ - 0 weblog-blogspot.com_marketview_20050210075500_... - 1 weblog-blogspot.com_marketview_20050210075500_... - 2 weblog-blogspot.com_marketview_20050210075500_... - 3 weblog-blogspot.com_marketview_20050210075500_... - 4 weblog-blogspot.com_marketview_20050210075500_... - .. ... - 161 weblog-blogspot.com_marketview_20050210075500_... - 162 weblog-blogspot.com_marketview_20050210075500_... - 163 weblog-blogspot.com_marketview_20050210075500_... - 164 weblog-blogspot.com_marketview_20050210075500_... - 165 weblog-blogspot.com_marketview_20050210075500_... - - sentence line_num - 0 [0, 139): 'Google has finally had an analyst d... 383 - 1 [0, 139): 'Google has finally had an analyst d... 384 - 2 [0, 139): 'Google has finally had an analyst d... 385 - 3 [0, 139): 'Google has finally had an analyst d... 386 - 4 [0, 139): 'Google has finally had an analyst d... 387 - .. ... ... - 161 [743, 793): 'Read the entire article; there's ... 565 - 162 [743, 793): 'Read the entire article; there's ... 566 - 163 [743, 793): 'Read the entire article; there's ... 567 - 164 [743, 793): 'Read the entire article; there's ... 568 - 165 [743, 793): 'Read the entire article; there's ... 
569 - - [166 rows x 14 columns]""" + span lemma upostag xpostag \\ + 0 [0, 6): 'Google' Google PROPN NNP + 1 [7, 10): 'has' have AUX VBZ + 2 [11, 18): 'finally' finally ADV RB + 3 [19, 22): 'had' have VERB VBN + 4 [23, 25): 'an' a DET DT + .. ... ... ... ... + 161 [776, 777): 'a' a DET DT + 162 [778, 787): 'punchline' punchline NOUN NN + 163 [787, 788): ',' , PUNCT , + 164 [789, 792): 'too' too ADV RB + 165 [792, 793): '.' . PUNCT . + + features head deprel \\ + 0 Number=Sing 3 nsubj + 1 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... 3 aux + 2 None 3 advmod + 3 Tense=Past|VerbForm=Part root + 4 Definite=Ind|PronType=Art 6 det + .. ... ... ... + 161 Definite=Ind|PronType=Art 162 det + 162 Number=Sing 160 nsubj + 163 None 160 punct + 164 None 160 advmod + 165 None 154 punct + + deps misc \\ + 0 4:nsubj None + 1 4:aux None + 2 4:advmod None + 3 0:root None + 4 7:det None + .. ... ... + 161 9:det None + 162 7:nsubj SpaceAfter=No + 163 7:punct None + 164 7:advmod SpaceAfter=No + 165 1:punct None + + sentence_id \\ + 0 weblog-blogspot.com_marketview_20050210075500_... + 1 weblog-blogspot.com_marketview_20050210075500_... + 2 weblog-blogspot.com_marketview_20050210075500_... + 3 weblog-blogspot.com_marketview_20050210075500_... + 4 weblog-blogspot.com_marketview_20050210075500_... + .. ... + 161 weblog-blogspot.com_marketview_20050210075500_... + 162 weblog-blogspot.com_marketview_20050210075500_... + 163 weblog-blogspot.com_marketview_20050210075500_... + 164 weblog-blogspot.com_marketview_20050210075500_... + 165 weblog-blogspot.com_marketview_20050210075500_... + + paragraph_id \\ + 0 weblog-blogspot.com_marketview_20050210075500_... + 1 weblog-blogspot.com_marketview_20050210075500_... + 2 weblog-blogspot.com_marketview_20050210075500_... + 3 weblog-blogspot.com_marketview_20050210075500_... + 4 weblog-blogspot.com_marketview_20050210075500_... + .. ... + 161 weblog-blogspot.com_marketview_20050210075500_... + 162 weblog-blogspot.com_marketview_20050210075500_... + 163 weblog-blogspot.com_marketview_20050210075500_... + 164 weblog-blogspot.com_marketview_20050210075500_... + 165 weblog-blogspot.com_marketview_20050210075500_... + + doc_id \\ + 0 weblog-blogspot.com_marketview_20050210075500_... + 1 weblog-blogspot.com_marketview_20050210075500_... + 2 weblog-blogspot.com_marketview_20050210075500_... + 3 weblog-blogspot.com_marketview_20050210075500_... + 4 weblog-blogspot.com_marketview_20050210075500_... + .. ... + 161 weblog-blogspot.com_marketview_20050210075500_... + 162 weblog-blogspot.com_marketview_20050210075500_... + 163 weblog-blogspot.com_marketview_20050210075500_... + 164 weblog-blogspot.com_marketview_20050210075500_... + 165 weblog-blogspot.com_marketview_20050210075500_... + + sentence line_num + 0 [0, 139): 'Google has finally had an analyst d... 383 + 1 [0, 139): 'Google has finally had an analyst d... 384 + 2 [0, 139): 'Google has finally had an analyst d... 385 + 3 [0, 139): 'Google has finally had an analyst d... 386 + 4 [0, 139): 'Google has finally had an analyst d... 387 + .. ... ... + 161 [743, 793): 'Read the entire article; there's ... 565 + 162 [743, 793): 'Read the entire article; there's ... 566 + 163 [743, 793): 'Read the entire article; there's ... 567 + 164 [743, 793): 'Read the entire article; there's ... 568 + 165 [743, 793): 'Read the entire article; there's ... 569 + + [166 rows x 14 columns]""" ), ) @@ -373,17 +373,17 @@ def test_conll_u_to_dataframes(self): 85 [471, 472): '.' . PUNCT . 
features head deprel \\ - 0 None 2.0 case - 1 Definite=Def|PronType=Art 2.0 det - 2 Number=Sing 3.0 obl - 3 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... NaN root - 4 Number=Sing|PronType=Dem 5.0 det + 0 None 2 case + 1 Definite=Def|PronType=Art 2 det + 2 Number=Sing 3 obl + 3 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... root + 4 Number=Sing|PronType=Dem 5 det .. ... ... ... - 81 VerbForm=Ger 58.0 advcl - 82 Number=Sing 81.0 obj - 83 Number=Sing 82.0 flat - 84 Number=Sing 82.0 flat - 85 None 58.0 punct + 81 VerbForm=Ger 58 advcl + 82 Number=Sing 81 obj + 83 Number=Sing 82 flat + 84 Number=Sing 82 flat + 85 None 58 punct deps misc \\ 0 3:case None @@ -459,59 +459,59 @@ def test_conll_u_to_dataframes(self): repr(dfs[0]), textwrap.dedent( """\ - span lemma upostag xpostag features head deprel deps \\ - 0 [0, 2): 'No' no DT DT None 3.0 DEP None - 1 [2, 3): ',' , , , None 3.0 P None - 2 [4, 6): 'it' it PRP PRP None 3.0 SBJ None - 3 [7, 10): 'was' be VBD VBD None NaN ROOT None - 4 [11, 14): 'n't' not RB RB None 3.0 ADV None - .. ... ... ... ... ... ... ... ... - 74 [373, 377): 'both' both DT DT None 75.0 NMOD None - 75 [378, 384): 'stocks' stocks NNS NNS None 73.0 PMOD None - 76 [385, 388): 'and' and CC CC None 75.0 COORD None - 77 [389, 396): 'futures' future NNS NNS None 76.0 CONJ None - 78 [396, 397): '.' . . . None 59.0 P None - - misc predicate pred0arg pred1arg pred2arg pred3arg pred4arg pred5arg \\ - 0 None None None None None None None None - 1 None None None None None None None None - 2 None None None None None None None None - 3 None None None None None None None None - 4 None None None None None None None None - .. ... ... ... ... ... ... ... ... - 74 None None None None None None None None - 75 None None None None None None None None - 76 None None None None None None None None - 77 None None None None None None None None - 78 None None None None None None None None - - pred6arg pred7arg sentence \\ - 0 None None [0, 28): 'No, it was n't Black Monday.' - 1 None None [0, 28): 'No, it was n't Black Monday.' - 2 None None [0, 28): 'No, it was n't Black Monday.' - 3 None None [0, 28): 'No, it was n't Black Monday.' - 4 None None [0, 28): 'No, it was n't Black Monday.' - .. ... ... ... - 74 None None [232, 397): 'Some `` circuit breakers '' insta... - 75 None None [232, 397): 'Some `` circuit breakers '' insta... - 76 None None [232, 397): 'Some `` circuit breakers '' insta... - 77 None None [232, 397): 'Some `` circuit breakers '' insta... - 78 None None [232, 397): 'Some `` circuit breakers '' insta... - - line_num - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - .. ... - 74 79 - 75 80 - 76 81 - 77 82 - 78 83 - - [79 rows x 20 columns]""" + span lemma upostag xpostag features head deprel deps \\ + 0 [0, 2): 'No' no DT DT None 3 DEP None + 1 [2, 3): ',' , , , None 3 P None + 2 [4, 6): 'it' it PRP PRP None 3 SBJ None + 3 [7, 10): 'was' be VBD VBD None ROOT None + 4 [11, 14): 'n't' not RB RB None 3 ADV None + .. ... ... ... ... ... ... ... ... + 74 [373, 377): 'both' both DT DT None 75 NMOD None + 75 [378, 384): 'stocks' stocks NNS NNS None 73 PMOD None + 76 [385, 388): 'and' and CC CC None 75 COORD None + 77 [389, 396): 'futures' future NNS NNS None 76 CONJ None + 78 [396, 397): '.' . . . 
None 59 P None + + misc predicate pred0arg pred1arg pred2arg pred3arg pred4arg pred5arg \\ + 0 None None None None None None None None + 1 None None None None None None None None + 2 None None None None None None None None + 3 None None None None None None None None + 4 None None None None None None None None + .. ... ... ... ... ... ... ... ... + 74 None None None None None None None None + 75 None None None None None None None None + 76 None None None None None None None None + 77 None None None None None None None None + 78 None None None None None None None None + + pred6arg pred7arg sentence \\ + 0 None None [0, 28): 'No, it was n't Black Monday.' + 1 None None [0, 28): 'No, it was n't Black Monday.' + 2 None None [0, 28): 'No, it was n't Black Monday.' + 3 None None [0, 28): 'No, it was n't Black Monday.' + 4 None None [0, 28): 'No, it was n't Black Monday.' + .. ... ... ... + 74 None None [232, 397): 'Some `` circuit breakers '' insta... + 75 None None [232, 397): 'Some `` circuit breakers '' insta... + 76 None None [232, 397): 'Some `` circuit breakers '' insta... + 77 None None [232, 397): 'Some `` circuit breakers '' insta... + 78 None None [232, 397): 'Some `` circuit breakers '' insta... + + line_num + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + .. ... + 74 79 + 75 80 + 76 81 + 77 82 + 78 83 + + [79 rows x 20 columns]""" ), )
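The groupby-and-multiply step in the notebook's `agg_outputs` cell is the only numerically subtle part of the evaluation. The following is a minimal standalone sketch of the same product-then-argmax idea; the tag names, probability vectors, and the miniature DataFrame are made up for illustration and are not taken from the corpus used above.

```python
import numpy as np
import pandas as pd

# Toy example: one raw token ("Morphed") split into two subtokens, plus one
# unsplit token ("?"), each with a probability vector over three tags.
tags = ["NOUN", "VERB", "PUNCT"]
df = pd.DataFrame({
    "raw_span": ["Morphed", "Morphed", "?"],
    "raw_output": [
        np.array([0.2, 0.7, 0.1]),
        np.array([0.3, 0.6, 0.1]),
        np.array([0.1, 0.1, 0.8]),
    ],
})

def agg_outputs(series: pd.Series) -> int:
    # Multiply the per-subtoken probability vectors elementwise, then take the
    # index of the largest product as the tag id for the whole raw token.
    return int(np.stack(series.to_numpy()).prod(axis=0).argmax())

predicted = df.groupby("raw_span")["raw_output"].agg(agg_outputs).map(lambda i: tags[i])
print(predicted)  # 'Morphed' -> VERB (0.7 * 0.6 beats 0.2 * 0.3), '?' -> PUNCT
```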