diff --git a/notebooks/Read_conllu_Files.ipynb b/notebooks/Read_conllu_Files.ipynb
new file mode 100644
index 00000000..0705b2ab
--- /dev/null
+++ b/notebooks/Read_conllu_Files.ipynb
@@ -0,0 +1,2911 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "38b14610-2ac6-43dc-a3ca-2f5533190bd8",
+ "metadata": {},
+ "source": [
+ "\n",
+ " Read_conllu_files.ipynb: Read and parse information from diverse .conllu files, and use integrations with libraries to apply data efficently\n",
+ " \n",
+ "\n",
+ "## Introduction\n",
+ "\n",
+ "This notebook demonstrates how diverse .conllu files can be imported, converted and worked with using the open source library [Text Extensions for Pandas](https://github.com/CODAIT/text-extensions-for-pandas). This library uses [Pandas](https://pandas.pydata.org/) DataFrames as a primary data storage format, and to work with several different NLP libraries, such as [SpaCy](https://spacy.io), [Huggingface Transformers](https://huggingface.co/transformers/). \n",
+ "\n",
+ "Here we show how these features can be used in conjunction to import, select data, display sentence structure information, and then finally retokenize and train a classifier model on the dataset. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "32709961-2eae-4a62-a7de-84072c1ab7d5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import json\n",
+ "import feather\n",
+ "import sklearn.pipeline\n",
+ "import sklearn.linear_model\n",
+ "import transformers\n",
+ "\n",
+ "# And of course we need the text_extensions_for_pandas library itself.\n",
+ "try:\n",
+ " import text_extensions_for_pandas as tp\n",
+ "except ModuleNotFoundError as e:\n",
+ " # If we're running from within the project source tree and the parent Python\n",
+ " # environment doesn't have the text_extensions_for_pandas package, use the\n",
+ " # version in the local source tree.\n",
+ " if not os.getcwd().endswith(\"notebooks\"):\n",
+ " raise e\n",
+ " if \"..\" not in sys.path:\n",
+ " sys.path.insert(0, \"..\")\n",
+ " import text_extensions_for_pandas as tp\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a9f78f9f-ec2c-4a85-8b0f-9b114642633e",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Loading files\n",
+ "There are several sub-flavors of .conllu files, including those used in the EWT, Ontonotes, Universal Dependencies, and CoNLL 2009 corpuses. Text Extensions is designed to take advantage of the common features of .conllu files, while allowing for varied types to be accepted. \n",
+ "\n",
+ "In importing this file type, we\n",
+ "1. Translate the raw words into Token Dtypes\n",
+ "1. Preserve the dependencies between tokens as represented in the `head` and `deprel` columns\n",
+ "1. Capture conllu metadata written into the file, if it exists \n",
+ "1. Allow for conll 09 and Ontonotes style predicate - predicate argument representations\n",
+ "1. Capture each token's sentence \n",
+ "1. Allow the user to choose how sub-tokens are handled\n",
+ "\n",
+ "\n",
+ "First, though we must load the datasets we will be using for this demo notebook \n",
+ "\n",
+ "In the following cell, we use the facilities of Text Extensions for Pandas to download a copy of the [Universal Dependencies EWT data set](https://github.com/UniversalDependencies/UD_English-EWT) and the [Trial section of the CoNLL 2009 dataset](https://ufal.mff.cuni.cz/conll2009-st/trial-data.html). **Make sure that you adhere to the terms under which they are liscensed when using them** \n",
+ "\n",
+ "Then we read them in and display them in the document. Notice how different the information stored in each dataset is. One thing to note is in this specific example, we drop a few columns from each dataset for brevity; remove the `.drop()` methods to show more lines. "
+ ]
+ },
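+ {
+ "cell_type": "markdown",
+ "id": "c3f1a2b4-5d6e-4f70-8a91-b2c3d4e5f607",
+ "metadata": {},
+ "source": [
+ "To make the column names above concrete, here is a rough sketch of the raw CoNLL-U layout: one tab-separated line per token, with the ten standard columns `ID`, `FORM`, `LEMMA`, `UPOS`, `XPOS`, `FEATS`, `HEAD`, `DEPREL`, `DEPS`, and `MISC`. The sentence and annotations below are hypothetical and are not taken from the downloaded corpora; they are only meant to illustrate the structure that the importer maps onto DataFrame columns.\n",
+ "\n",
+ "```python\n",
+ "# A minimal, hypothetical CoNLL-U fragment (illustration only).\n",
+ "sample = \"\"\"# sent_id = example-1\n",
+ "# text = Dogs bark loudly.\n",
+ "1\\tDogs\\tdog\\tNOUN\\tNNS\\tNumber=Plur\\t2\\tnsubj\\t_\\t_\n",
+ "2\\tbark\\tbark\\tVERB\\tVBP\\t_\\t0\\troot\\t_\\t_\n",
+ "3\\tloudly\\tloudly\\tADV\\tRB\\t_\\t2\\tadvmod\\t_\\t_\n",
+ "4\\t.\\t.\\tPUNCT\\t.\\t_\\t2\\tpunct\\t_\\t_\"\"\"\n",
+ "\n",
+ "# Each non-comment line splits into the ten standard columns.\n",
+ "for line in sample.splitlines():\n",
+ "    if line and not line.startswith(\"#\"):\n",
+ "        print(line.split(\"\\t\"))\n",
+ "```"
+ ]
+ },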
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ccd19ec4-8480-4f00-9e34-d18f90f210a1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# init file locations, and download data if necessary. \n",
+ "BASE_DIR = 'CoNLL_u_test_inputs/'\n",
+ "FEATHER_FILE = \"conllu_database.feather\"\n",
+ "\n",
+ "ewt_base_url = \"https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-\"\n",
+ "ewt_dev_url = ewt_base_url + 'dev.conllu'\n",
+ "conll_09_test_data_url = 'https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip'\n",
+ "\n",
+ "# allows us to re-start from saved points\n",
+ "corpus_df = None "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2c71e57d-811e-4cd0-84f4-efeefe127697",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# download the files if they have not already been downloaded \n",
+ "conll_09_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, conll_09_test_data_url)\n",
+ "conllu_ewt_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_dev_url)\n",
+ "\n",
+ "# if you already have access to the full conll 2009 dataset, name the file accordingly and uncomment this line \n",
+ "# conll_09_path = BASE_DIR + 'CoNLL2009-ST-evaluation-English.conllu'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "a036fb68-3441-4c7f-bc4a-2b23bdf6875d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Conll 09 format .conllu document:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ],
+ "text/plain": [
+ " span lemma upostag xpostag \\\n",
+ "0 [0, 4): 'From' from ADP IN \n",
+ "1 [5, 8): 'the' the DET DT \n",
+ "2 [9, 11): 'AP' AP PROPN NNP \n",
+ "3 [12, 17): 'comes' come VERB VBZ \n",
+ "4 [18, 22): 'this' this DET DT \n",
+ "5 [23, 28): 'story' story NOUN NN \n",
+ "6 [28, 29): ':' : PUNCT : \n",
+ "7 [30, 39): 'President' President PROPN NNP \n",
+ "8 [40, 44): 'Bush' Bush PROPN NNP \n",
+ "9 [45, 47): 'on' on ADP IN \n",
+ "\n",
+ " features head deprel deps \\\n",
+ "0 None 2 case 3:case \n",
+ "1 Definite=Def|PronType=Art 2 det 3:det \n",
+ "2 Number=Sing 3 obl 4:obl:from \n",
+ "3 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF... -1 root 0:root \n",
+ "4 Number=Sing|PronType=Dem 5 det 6:det \n",
+ "5 Number=Sing 3 nsubj 4:nsubj \n",
+ "6 None 3 punct 4:punct \n",
+ "7 Number=Sing 11 nsubj 5:nsubj \n",
+ "8 Number=Sing 7 flat 1:flat \n",
+ "9 None 10 case 4:case \n",
+ "\n",
+ " misc sentence_id \\\n",
+ "0 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "1 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "2 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "3 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "4 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "5 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "6 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "7 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "8 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "9 None weblog-blogspot.com_nominations_20041117172713... \n",
+ "\n",
+ " paragraph_id \\\n",
+ "0 weblog-blogspot.com_nominations_20041117172713... \n",
+ "1 weblog-blogspot.com_nominations_20041117172713... \n",
+ "2 weblog-blogspot.com_nominations_20041117172713... \n",
+ "3 weblog-blogspot.com_nominations_20041117172713... \n",
+ "4 weblog-blogspot.com_nominations_20041117172713... \n",
+ "5 weblog-blogspot.com_nominations_20041117172713... \n",
+ "6 weblog-blogspot.com_nominations_20041117172713... \n",
+ "7 weblog-blogspot.com_nominations_20041117172713... \n",
+ "8 weblog-blogspot.com_nominations_20041117172713... \n",
+ "9 weblog-blogspot.com_nominations_20041117172713... \n",
+ "\n",
+ " doc_id line_num \n",
+ "0 weblog-blogspot.com_nominations_20041117172713... 4 \n",
+ "1 weblog-blogspot.com_nominations_20041117172713... 5 \n",
+ "2 weblog-blogspot.com_nominations_20041117172713... 6 \n",
+ "3 weblog-blogspot.com_nominations_20041117172713... 7 \n",
+ "4 weblog-blogspot.com_nominations_20041117172713... 8 \n",
+ "5 weblog-blogspot.com_nominations_20041117172713... 9 \n",
+ "6 weblog-blogspot.com_nominations_20041117172713... 10 \n",
+ "7 weblog-blogspot.com_nominations_20041117172713... 15 \n",
+ "8 weblog-blogspot.com_nominations_20041117172713... 16 \n",
+ "9 weblog-blogspot.com_nominations_20041117172713... 17 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# import two very different documents, both in the conllu file format. \n",
+ "\n",
+ "# by default we look for EWT style column names, \n",
+ "# so we have to define a new set for this specific conll09 format\n",
+ "conll_09_cols = [\"LEMMA\",\"PLEMMA\",'POS','PPOS','FEAT','PFEAT','head','phead','DEPREL','PDEPREL','FILLPRED','PRED']\n",
+ "\n",
+ "conll_09_docs = tp.io.conll.conll_u_to_dataframes(conll_09_path,column_names=conll_09_cols)\n",
+ "#now just filter,and display the document \n",
+ "conll_09_doc = conll_09_docs[0].drop(columns=[\"PLEMMA\",'PPOS','PFEAT','phead','PDEPREL','FILLPRED','sentence','line_num'])\n",
+ "print(\"Conll 09 format .conllu document:\")\n",
+ "display(conll_09_doc.head())\n",
+ "\n",
+ "\n",
+ "#simultaneously, we can import an ewt style document, and display it with the same function\n",
+ "conll_u_docs = tp.io.conll.conll_u_to_dataframes(conllu_ewt_path)\n",
+ "#display \n",
+ "DOC_NUM = 0\n",
+ "doc_df = conll_u_docs[DOC_NUM]\n",
+ "# here we drop the sentence argument for brevity.\n",
+ "print(\"EWT format .conllu document:\")\n",
+ "doc_df.head(10).drop(columns = [\"sentence\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4a42bb64-5ac6-4270-b764-6faa991347a2",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Combining documents and saving as a .feather file. \n",
+ "\n",
+ "Something that is often useful is to store multiple documents from a corpus in one single dataset. Here we do that, then make an adjustment to keep the `'head'` column of our database pointing at the correct elements.\n",
+ "\n",
+ "\n",
+ "Next we quickly write then reread our document as a `.feather` file in its. manipulated state. Because this is serialized, writing and reading is significantly faster than writing to a raw `.conllu` format. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "153306f1-5b99-459d-8049-fa9a28f7dfee",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "size is 25151\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
span
\n",
+ "
lemma
\n",
+ "
upostag
\n",
+ "
head
\n",
+ "
deprel
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
[0, 4): 'From'
\n",
+ "
from
\n",
+ "
ADP
\n",
+ "
2.0
\n",
+ "
case
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
[5, 8): 'the'
\n",
+ "
the
\n",
+ "
DET
\n",
+ "
2.0
\n",
+ "
det
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
[9, 11): 'AP'
\n",
+ "
AP
\n",
+ "
PROPN
\n",
+ "
3.0
\n",
+ "
obl
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
[12, 17): 'comes'
\n",
+ "
come
\n",
+ "
VERB
\n",
+ "
-1.0
\n",
+ "
root
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
[18, 22): 'this'
\n",
+ "
this
\n",
+ "
DET
\n",
+ "
5.0
\n",
+ "
det
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
25146
\n",
+ "
[251, 254): 'and'
\n",
+ "
and
\n",
+ "
CCONJ
\n",
+ "
25150.0
\n",
+ "
cc
\n",
+ "
\n",
+ "
\n",
+ "
25147
\n",
+ "
[255, 256): 'a'
\n",
+ "
a
\n",
+ "
DET
\n",
+ "
25150.0
\n",
+ "
det
\n",
+ "
\n",
+ "
\n",
+ "
25148
\n",
+ "
[257, 261): 'very'
\n",
+ "
very
\n",
+ "
ADV
\n",
+ "
25149.0
\n",
+ "
advmod
\n",
+ "
\n",
+ "
\n",
+ "
25149
\n",
+ "
[262, 275): 'knowledgeable'
\n",
+ "
knowledgeable
\n",
+ "
ADJ
\n",
+ "
25150.0
\n",
+ "
amod
\n",
+ "
\n",
+ "
\n",
+ "
25150
\n",
+ "
[276, 281): 'staff'
\n",
+ "
staff
\n",
+ "
NOUN
\n",
+ "
25145.0
\n",
+ "
conj
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
25151 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " span lemma upostag head deprel\n",
+ "0 [0, 4): 'From' from ADP 2.0 case\n",
+ "1 [5, 8): 'the' the DET 2.0 det\n",
+ "2 [9, 11): 'AP' AP PROPN 3.0 obl\n",
+ "3 [12, 17): 'comes' come VERB -1.0 root\n",
+ "4 [18, 22): 'this' this DET 5.0 det\n",
+ "... ... ... ... ... ...\n",
+ "25146 [251, 254): 'and' and CCONJ 25150.0 cc\n",
+ "25147 [255, 256): 'a' a DET 25150.0 det\n",
+ "25148 [257, 261): 'very' very ADV 25149.0 advmod\n",
+ "25149 [262, 275): 'knowledgeable' knowledgeable ADJ 25150.0 amod\n",
+ "25150 [276, 281): 'staff' staff NOUN 25145.0 conj\n",
+ "\n",
+ "[25151 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# because we are concatenating our dataframes, we need to modify the \"head\" feilds to still point at their desired targets \n",
+ "df_starts_at =0\n",
+ "temp = conll_u_docs.copy()\n",
+ "for df in temp:\n",
+ " df['head'] = df['head'].apply(lambda i: i +df_starts_at if i!= -1 else -1)\n",
+ " df_starts_at += df.shape[0]\n",
+ "\n",
+ "# Now concatenate all our documents into one big dataframe\n",
+ "complete_df = temp[0]\n",
+ "complete_df = complete_df.append(temp[1:], ignore_index=True)\n",
+ "\n",
+ "#show the last few rows of the dataframe, select just a few columns for compactness\n",
+ "print(f\"size is {complete_df.shape[0]}\")\n",
+ "complete_df[[\"span\",\"lemma\",\"upostag\",\"head\",\"deprel\"]]"
+ ]
+ },
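+ {
+ "cell_type": "markdown",
+ "id": "7a8b9c0d-1e2f-4a3b-8c4d-5e6f7a8b9c0d",
+ "metadata": {},
+ "source": [
+ "Note that `DataFrame.append`, used in the cell above, is deprecated in newer versions of pandas and removed in pandas 2.0. If the concatenation fails in your environment, an equivalent sketch using `pd.concat` over the same `temp` list is:\n",
+ "\n",
+ "```python\n",
+ "# Equivalent to the append() call above on newer pandas versions.\n",
+ "complete_df = pd.concat(temp, ignore_index=True)\n",
+ "```"
+ ]
+ },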
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6540975b-14a6-4187-98a4-9993f5f2d1cc",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs\n",
+ "Wall time: 6.91 µs\n",
+ "File written to CoNLL_u_test_inputs/conllu_database.feather\n"
+ ]
+ }
+ ],
+ "source": [
+ "# one advantage of using pandas dataframes is that we can write and read them signifcantly faster than we could the raw conllu files \n",
+ "# here we use pyarrow with feather to save and reload our dataframe. \n",
+ "\n",
+ "# Currently writing multi document files is not supported, so we will have to use a workaround, \n",
+ "# by converting sentences from TokenSpanArrays to SpanArrays\n",
+ "complete_df[\"sentence\"] = tp.SpanArray(complete_df[\"span\"].array.target_text, complete_df[\"sentence\"].array.begin, complete_df[\"sentence\"].array.end)\n",
+ "\n",
+ "#finally write to file using feather \n",
+ "path = BASE_DIR +FEATHER_FILE\n",
+ "# increase the chunksize slightly, to allow writing in a single block\n",
+ "# time to show how fast Feather actually is \n",
+ "%time\n",
+ "feather.write_dataframe(complete_df, path,chunksize= 65536*8)\n",
+ "print(f\"File written to {path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "ade30665-3298-46ad-ab24-0da753c23067",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 2 µs, sys: 1 µs, total: 3 µs\n",
+ "Wall time: 5.01 µs\n",
+ "size is 25151\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
span
\n",
+ "
lemma
\n",
+ "
upostag
\n",
+ "
head
\n",
+ "
deprel
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
25146
\n",
+ "
[251, 254): 'and'
\n",
+ "
and
\n",
+ "
CCONJ
\n",
+ "
25150.0
\n",
+ "
cc
\n",
+ "
\n",
+ "
\n",
+ "
25147
\n",
+ "
[255, 256): 'a'
\n",
+ "
a
\n",
+ "
DET
\n",
+ "
25150.0
\n",
+ "
det
\n",
+ "
\n",
+ "
\n",
+ "
25148
\n",
+ "
[257, 261): 'very'
\n",
+ "
very
\n",
+ "
ADV
\n",
+ "
25149.0
\n",
+ "
advmod
\n",
+ "
\n",
+ "
\n",
+ "
25149
\n",
+ "
[262, 275): 'knowledgeable'
\n",
+ "
knowledgeable
\n",
+ "
ADJ
\n",
+ "
25150.0
\n",
+ "
amod
\n",
+ "
\n",
+ "
\n",
+ "
25150
\n",
+ "
[276, 281): 'staff'
\n",
+ "
staff
\n",
+ "
NOUN
\n",
+ "
25145.0
\n",
+ "
conj
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " span lemma upostag head deprel\n",
+ "25146 [251, 254): 'and' and CCONJ 25150.0 cc\n",
+ "25147 [255, 256): 'a' a DET 25150.0 det\n",
+ "25148 [257, 261): 'very' very ADV 25149.0 advmod\n",
+ "25149 [262, 275): 'knowledgeable' knowledgeable ADJ 25150.0 amod\n",
+ "25150 [276, 281): 'staff' staff NOUN 25145.0 conj"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# now we can read this df and continue operating on it as before. Time the read operation \n",
+ "%time \n",
+ "re_read_df = feather.read_dataframe(path)\n",
+ "print(f\"size is {re_read_df.shape[0]}\")\n",
+ "# show the same subset of the dataframe as above \n",
+ "re_read_df.tail()[[\"span\",\"lemma\",\"upostag\",\"head\",\"deprel\"]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a290f829-688b-4d82-af90-7e168fa5808d",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Show sentence parse trees using pandas data manipulation, and SpaCy integrations\n",
+ "Because of the integrations built into Text extensions, we can use powerful data visualization tools here we're leveraging spaCy's dependency tree visualization tools, to show the parse tree as specified in the raw conllu file. \n",
+ "\n",
+ "First, we use Pandas groupby to to quickly select the n'th sentence in the dataset, and store it as its own dataframe and display selected columns \n",
+ "\n",
+ "Then we use Spacy to render the parse tree of that specific sentence, as found in the raw data. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "f1f41cb8-a833-4e84-ab9b-d0d5a7408e00",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
span
\n",
+ "
lemma
\n",
+ "
upostag
\n",
+ "
xpostag
\n",
+ "
head
\n",
+ "
deprel
\n",
+ "
sentence
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
2510
\n",
+ "
[979, 982): 'And'
\n",
+ "
and
\n",
+ "
CCONJ
\n",
+ "
CC
\n",
+ "
2514.0
\n",
+ "
cc
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2511
\n",
+ "
[983, 987): 'what'
\n",
+ "
what
\n",
+ "
PRON
\n",
+ "
WP
\n",
+ "
2514.0
\n",
+ "
obj
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2512
\n",
+ "
[988, 990): 'do'
\n",
+ "
do
\n",
+ "
AUX
\n",
+ "
VBP
\n",
+ "
2514.0
\n",
+ "
aux
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2513
\n",
+ "
[991, 993): 'we'
\n",
+ "
we
\n",
+ "
PRON
\n",
+ "
PRP
\n",
+ "
2514.0
\n",
+ "
nsubj
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2514
\n",
+ "
[994, 997): 'get'
\n",
+ "
get
\n",
+ "
VERB
\n",
+ "
VB
\n",
+ "
-1.0
\n",
+ "
root
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2515
\n",
+ "
[998, 1001): 'for'
\n",
+ "
for
\n",
+ "
ADP
\n",
+ "
IN
\n",
+ "
2517.0
\n",
+ "
case
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2516
\n",
+ "
[1002, 1006): 'this'
\n",
+ "
this
\n",
+ "
DET
\n",
+ "
DT
\n",
+ "
2517.0
\n",
+ "
det
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2517
\n",
+ "
[1007, 1013): 'effort'
\n",
+ "
effort
\n",
+ "
NOUN
\n",
+ "
NN
\n",
+ "
2514.0
\n",
+ "
obl
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ "
\n",
+ "
2518
\n",
+ "
[1013, 1014): '?'
\n",
+ "
?
\n",
+ "
PUNCT
\n",
+ "
.
\n",
+ "
2514.0
\n",
+ "
punct
\n",
+ "
[979, 1014): 'And what do we get for this effo...
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " span lemma upostag xpostag head deprel \\\n",
+ "2510 [979, 982): 'And' and CCONJ CC 2514.0 cc \n",
+ "2511 [983, 987): 'what' what PRON WP 2514.0 obj \n",
+ "2512 [988, 990): 'do' do AUX VBP 2514.0 aux \n",
+ "2513 [991, 993): 'we' we PRON PRP 2514.0 nsubj \n",
+ "2514 [994, 997): 'get' get VERB VB -1.0 root \n",
+ "2515 [998, 1001): 'for' for ADP IN 2517.0 case \n",
+ "2516 [1002, 1006): 'this' this DET DT 2517.0 det \n",
+ "2517 [1007, 1013): 'effort' effort NOUN NN 2514.0 obl \n",
+ "2518 [1013, 1014): '?' ? PUNCT . 2514.0 punct \n",
+ "\n",
+ " sentence \n",
+ "2510 [979, 1014): 'And what do we get for this effo... \n",
+ "2511 [979, 1014): 'And what do we get for this effo... \n",
+ "2512 [979, 1014): 'And what do we get for this effo... \n",
+ "2513 [979, 1014): 'And what do we get for this effo... \n",
+ "2514 [979, 1014): 'And what do we get for this effo... \n",
+ "2515 [979, 1014): 'And what do we get for this effo... \n",
+ "2516 [979, 1014): 'And what do we get for this effo... \n",
+ "2517 [979, 1014): 'And what do we get for this effo... \n",
+ "2518 [979, 1014): 'And what do we get for this effo... "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "Sentence_num = 110\n",
+ "\n",
+ "# use pandas to quickly select the 'n'th sentence in the dataset \n",
+ "nth_sentence = list(re_read_df.groupby(\"sentence_id\",sort=False))[Sentence_num][1]\n",
+ "display(nth_sentence[[\"span\",\"lemma\",\"upostag\",\"xpostag\",\"head\",\"deprel\",\"sentence\"]])\n",
+ "\n",
+ "# now use spacy integration to rendeer the parse tree\n",
+ "tp.io.spacy.render_parse_tree(nth_sentence,tag_col=\"upostag\",label_col=\"deprel\",head_col=\"head\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0fc00e06-b2fe-47f0-9f0a-20d0b8370807",
+ "metadata": {},
+ "source": [
+ "# Train a classifier model\n",
+ "\n",
+ "Now use more text extensions integrations, with *transformers* to quickly and easily train a part of speech classifier model using bert embeddings on our data. We loosely follow the same process as is used in the [Model_Training_with_BERT](./Model_Training_with_BERT.ipynb) demo, notebook so check there for a more indepth explanation of each step.\n",
+ "\n",
+ "Broadly, what we do is: \n",
+ "1. Import all the folds of the dataset we're using (Universal dependencies EWT) \n",
+ "1. Create a Pandas Categorical datatype on over which to classify\n",
+ "1. Retokenize that dataset using Huggingface Transformers to Bert-compatible tokens\n",
+ "1. Correlate the new tokens with their original counterpart's parts of speech\n",
+ "1. Create the Bert embeddings for each sub-token\n",
+ "1. Convert the parts of speech tags to our categoical datatype\n",
+ "1. Initialize and train a sklearn model on the Bert embeddings -> Part of Speech\n",
+ "1. Use that model to perform inference on our dataset\n",
+ "\n"
+ ]
+ },
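+ {
+ "cell_type": "markdown",
+ "id": "5f6e7d8c-9b0a-4c1d-8e2f-3a4b5c6d7e8f",
+ "metadata": {},
+ "source": [
+ "For orientation, steps 7 and 8 boil down to a standard scikit-learn fit/predict over the per-token embedding vectors. The block below is a minimal sketch only, not the exact pipeline used later: it assumes the preprocessing cells further down have already produced `bert_docs_by_fold`, with the vectors in an `embedding` column and the integer labels in a `postag_id` column.\n",
+ "\n",
+ "```python\n",
+ "# Minimal sketch of steps 7-8; assumes bert_docs_by_fold from the cells below.\n",
+ "import numpy as np\n",
+ "import sklearn.linear_model\n",
+ "import sklearn.pipeline\n",
+ "import sklearn.preprocessing\n",
+ "\n",
+ "def fold_to_xy(dfs):\n",
+ "    # Stack per-token embeddings and label ids for a list of document DataFrames.\n",
+ "    df = pd.concat(dfs, ignore_index=True)\n",
+ "    X = np.stack([np.asarray(v) for v in df[\"embedding\"]])  # assumed column name\n",
+ "    y = df[\"postag_id\"].to_numpy()\n",
+ "    return X, y\n",
+ "\n",
+ "X_train, y_train = fold_to_xy(bert_docs_by_fold[\"train\"])\n",
+ "X_dev, y_dev = fold_to_xy(bert_docs_by_fold[\"dev\"])\n",
+ "\n",
+ "pos_model = sklearn.pipeline.make_pipeline(\n",
+ "    sklearn.preprocessing.StandardScaler(),\n",
+ "    sklearn.linear_model.LogisticRegression(max_iter=1000),\n",
+ ")\n",
+ "pos_model.fit(X_train, y_train)\n",
+ "print(\"dev accuracy:\", pos_model.score(X_dev, y_dev))\n",
+ "```"
+ ]
+ },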
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "20f20214-62d9-4f7b-bd4f-fcd377148aad",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "converted fold: 'test' to list of 316 dataframes\n",
+ "converted fold: 'dev' to list of 318 dataframes\n",
+ "converted fold: 'train' to list of 540 dataframes\n"
+ ]
+ }
+ ],
+ "source": [
+ "# We're going to need the whole ewt dataset for this: download them, and parse them in \n",
+ "fold_paths = {\"test\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"test.conllu\"),\n",
+ " \"dev\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"dev.conllu\"),\n",
+ " \"train\": tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_base_url + \"train.conllu\")}\n",
+ "fold_docs = {}\n",
+ "for fold,fold_path in fold_paths.items(): \n",
+ " fold_docs[fold] = tp.io.conll.conll_u_to_dataframes(fold_path)\n",
+ " print(f\"converted fold: '{fold}' to list of {len(fold_docs[fold])} dataframes\")\n",
+ " # uncomment to display segments of the extracted folds \n",
+ " # display(fold_docs[fold][0].head()[['span','lemma','upostag','features','sentence']])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cade0734-2845-4b30-8472-59f103f44bd9",
+ "metadata": {},
+ "source": [
+ "### Initialize elements for preprocessing steps\n",
+ "Instantiate pretrained tokenizer and BERT models from transformers library, and create a pandas categorical datatype for parts of speech"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "2cf1b9d6-b2cb-45fe-b724-529b7bb75d3f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bert_model_name = \"dslim/bert-base-NER\"\n",
+ "tokenizer = transformers.BertTokenizerFast.from_pretrained(bert_model_name)\n",
+ "bert = transformers.BertModel.from_pretrained(bert_model_name)\n",
+ "\n",
+ "# also we will want to create a pandas categorical dtype for what we want to predict- part of speech. \n",
+ "# use the combined df, because it has all the elements \n",
+ "upostags_list = list(re_read_df[\"upostag\"].unique())\n",
+ "# upostag_dtype,upostag_list,upostag_dict = tp.io.conll.make_iob_tag_categories(upostags)\n",
+ "upostag_dtype = pd.CategoricalDtype(categories = upostags_list)\n",
+ "upostag_dict = {upostags_list[i]:i for i in range(len(upostags_list)) }"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f10fdc34-613d-47e1-8287-c33384da69ba",
+ "metadata": {},
+ "source": [
+ "## Preprocess the document\n",
+ "\n",
+ "Because steps 3-6 can only be done on a document-by-document basis, we create a method to do them in a batch, then run them on the whole corpus. Note this process is computationally intensive so it may take a few minutes to run."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "3fe4b27e-4f8f-4dab-ad42-ca04bcb3234c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "processing fold test\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a6b4ed4fe82a499584fd2b25dfcb1254",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=316, style=ProgressStyle(desc…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "processing fold dev\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7e4d211c2807462799823e5d9ce486d4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=318, style=ProgressStyle(desc…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "processing fold train\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2fbfd56ed94943bc9c1268fe37a9b278",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=540, style=ProgressStyle(desc…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# make a method to take care of preprocessing steps: 3-6\n",
+ "def preprocess_document(document, tokenizer,bert):\n",
+ " # create BERT compatible tokens using our tokenizer\n",
+ " temp = tp.io.bert.make_bert_tokens(document.loc[0,'span'].target_text, tokenizer)\n",
+ " # re-correlate our original spans with their bert-compatible equivalents\n",
+ " spans = tp.TokenSpanArray.align_to_tokens(temp[\"span\"],document[\"span\"])\n",
+ " \n",
+ " # now carry over some features from the old spans to the new onesspans_df = spans.as_frame().drop(columns = [\"begin\",\"end\"])\n",
+ " spans_df = spans.as_frame().drop(columns = ['begin','end','covered_text'])\n",
+ " spans_df['postag'] = document['upostag']\n",
+ " printed = 20\n",
+ " for i,b_tok,e_tok,pos in spans_df.itertuples():\n",
+ " temp.loc[b_tok:e_tok-1 , [\"postag\",\"raw_span\",'raw_span_id']] = pos,spans[i],i\n",
+ " \n",
+ " # now translate from text tags to postag \n",
+ " temp['postag'].fillna('X',inplace=True) # in our Labels, 'X' is a standin for \"N/A\" so convert N/A's to 'X'\n",
+ " temp[\"postag_id\"] = temp['postag'].apply(lambda t: int(upostag_dict[str(t)]))\n",
+ " temp = temp.astype({'postag_id':'int','postag':upostag_dtype})\n",
+ " return tp.io.bert.add_embeddings(temp, bert)\n",
+ "\n",
+ "\n",
+ "# preprocess the whole corpus: \n",
+ "bert_docs_by_fold = {}\n",
+ "for fold in fold_docs.keys():\n",
+ " docs = fold_docs[fold]\n",
+ " print(f\"processing fold {fold}\")\n",
+ " bert_docs_by_fold[fold] = tp.jupyter.run_with_progress_bar(len(docs),lambda i: preprocess_document(docs[i],tokenizer,bert))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b843badb-33c5-4bd3-a05c-ef2159adfa43",
+ "metadata": {},
+ "source": [
+ "## Checkpoint: save preprocessed data \n",
+ "\n",
+ "Because the last step was time intensive, combine all the documents together, then save them as a feather file, so that we can restart from here if need be. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "ec08895b-98e7-43cc-8138-412893982019",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "