{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Natural Language Processing With SpaCy\n", "![title](SpaCy_logo.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Training the Named Entity Recognizer\n", "##### Updating our NER\n", "+ Load the model\n", " + spacy.load('en')\n", " - Disable existing pipe line (nlp.disable_pipes)\n", " + spacy.blank('en')\n", " - Added Entity Recognizer to Pipeline\n", "+ Shuffle and loop over the examples\n", " - update the model (nlp.update)\n", "+ Save the trained model (nlp.to_disk)\n", "+ Test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load Packages\n", "from __future__ import unicode_literals, print_function\n", "\n", "import plac # wrapper over argparse\n", "import random\n", "from pathlib import Path\n", "import spacy\n", "from tqdm import tqdm # loading bar" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nlp1 = spacy.load('en')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx1 = nlp1(u\"Who was Kofi Annan?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for token in docx1.ents:\n", " print(token.text,token.start_char, token.end_char,token.label_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx2 = nlp1(u\"Who is Steve Jobs?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for token in docx2.ents:\n", " print(token.text,token.start_char, token.end_char,token.label_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx3 = nlp1(u\"Who is Shaka Khan?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# training data\n", "TRAIN_DATA = [\n", " ('Who is Kofi Annan?', {\n", " 
'entities': [(7, 17, 'PERSON')]  # 'Kofi Annan' = chars 7-17 (0-based, end-exclusive)\n", " }),\n", " ('Who is Steve Jobs?', {\n", " 'entities': [(7, 17, 'PERSON')]\n", " }),\n", " ('I like London and Berlin.', {\n", " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n", " })\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## plac is a wrapper over argparse: these annotations describe CLI options for\n", "## running this code as a script. The '@' is omitted because a decorator must be\n", "## followed by a function definition, and this cell has none; evaluating the\n", "## call on its own is valid and side-effect free.\n", "plac.annotations(\n", " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define our variables\n", "model = None\n", "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n", "n_iter=100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load the model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if model is not None:\n", " nlp = spacy.load(model) # load existing spaCy model\n", " print(\"Loaded model '%s'\" % model)\n", "else:\n", " nlp = spacy.blank('en') # create blank Language class\n", " print(\"Created blank 'en' model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Set Up the Pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create the built-in pipeline components and add them to the pipeline\n", " # nlp.create_pipe works for built-ins that are registered with spaCy\n", "if 'ner' not in nlp.pipe_names:\n", " ner = nlp.create_pipe('ner')\n", " nlp.add_pipe(ner, last=True)\n", "# otherwise, get it so we can add labels\n", "else:\n", " ner = nlp.get_pipe('ner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Train the Recognizer\n", "+ Add labels, Annotate them\n", "+ Pipes\n", "+ Begin_training()" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", " # add labels\n", "for _, annotations in TRAIN_DATA:\n", " for ent in annotations.get('entities'):\n", " ner.add_label(ent[2])\n", "\n", " # get names of other pipes to disable them during training\n", "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", "with nlp.disable_pipes(*other_pipes): # only train NER\n", " optimizer = nlp.begin_training()\n", " for itn in range(n_iter):\n", " random.shuffle(TRAIN_DATA)\n", " losses = {}\n", " for text, annotations in tqdm(TRAIN_DATA):\n", " nlp.update(\n", " [text], # batch of texts\n", " [annotations], # batch of annotations\n", " drop=0.5, # dropout - make it harder to memorise data\n", " sgd=optimizer, # callable to update weights\n", " losses=losses)\n", " print(losses)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Test the trained model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# test the trained model\n", "for text, _ in TRAIN_DATA:\n", " doc = nlp(text)\n", " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Save the Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save model to output directory\n", "if output_dir is not None:\n", " output_dir = Path(output_dir)\n", " if not output_dir.exists():\n", " output_dir.mkdir()\n", " nlp.to_disk(output_dir)\n", " print(\"Saved model to\", output_dir)\n", "\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Test The Saved Model\n", "+ NB Output Directory" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# test the saved model\n", "print(\"Loading from\", output_dir)\n", "nlp2 = spacy.load(output_dir)\n", "for text, _ in 
TRAIN_DATA:\n", " doc = nlp2(text)\n", " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding Additional Entity Types\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Natural Language Processing With SpaCy\n", "![title](SpaCy_logo.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Training the Named Entity Recognizer (NER)\n", "##### Adding An Additional Entity (NER)\n", "+ Load the model\n", " + spacy.load('en')\n", " - Disable existing pipe line (nlp.disable_pipes)\n", " + spacy.blank('en')\n", " - Added Entity Recognizer to Pipeline\n", "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n", "+ Shuffle and loop over the examples\n", " - update the model (nlp.update)\n", "+ Save the trained model (nlp.to_disk)\n", "+ Test" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "from __future__ import unicode_literals, print_function\n", "\n", "import plac\n", "import random\n", "from pathlib import Path\n", "import spacy\n", "from tqdm import tqdm  # progress bar; main() below wraps its training loop in tqdm()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "# new entity label\n", "LABEL = 'ANIMAL'" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "TRAIN_DATA = [\n", " (\"Horses are too tall and they pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"Do they bite?\", {\n", " 'entities': []\n", " }),\n", "\n", " (\"horses are too tall and they pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"horses pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"they pretend to care about your feelings, those horses\", {\n", " 'entities': [(48, 54, 'ANIMAL')]\n", " }),\n", "\n", " 
(\"horses?\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " })\n", "]" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "\n", "@plac.annotations(\n", " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n", " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n", "\n", "\n", "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n", " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n", " if model is not None:\n", " nlp = spacy.load(model) # load existing spaCy model\n", " print(\"Loaded model '%s'\" % model)\n", " else:\n", " nlp = spacy.blank('en') # create blank Language class\n", " print(\"Created blank 'en' model\")\n", " # Add entity recognizer to model if it's not in the pipeline\n", " # nlp.create_pipe works for built-ins that are registered with spaCy\n", " if 'ner' not in nlp.pipe_names:\n", " ner = nlp.create_pipe('ner')\n", " nlp.add_pipe(ner)\n", " # otherwise, get it, so we can add labels to it\n", " else:\n", " ner = nlp.get_pipe('ner')\n", "\n", " ner.add_label(LABEL) # add new entity label to entity recognizer\n", " if model is None:\n", " optimizer = nlp.begin_training()\n", " else:\n", " # Note that 'begin_training' initializes the models, so it'll zero out\n", " # existing entity types.\n", " optimizer = nlp.entity.create_optimizer()\n", "\n", " # get names of other pipes to disable them during training\n", " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", " with nlp.disable_pipes(*other_pipes): # only train NER\n", " for itn in range(n_iter):\n", " random.shuffle(TRAIN_DATA)\n", " losses = {}\n", " for text, annotations in tqdm(TRAIN_DATA):\n", " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n", " 
losses=losses)\n", " print(losses)\n", "\n", " # test the trained model\n", " test_text = 'Do you like horses?'\n", " doc = nlp(test_text)\n", " print(\"Entities in '%s'\" % test_text)\n", " for ent in doc.ents:\n", " print(ent.label_, ent.text)\n", "\n", " # save model to output directory\n", " if output_dir is not None:\n", " output_dir = Path(output_dir)\n", " if not output_dir.exists():\n", " output_dir.mkdir()\n", " nlp.meta['name'] = new_model_name # rename model\n", " nlp.to_disk(output_dir)\n", " print(\"Saved model to\", output_dir)\n", "\n", " # test the saved model\n", " print(\"Loading from\", output_dir)\n", " nlp2 = spacy.load(output_dir)\n", " doc2 = nlp2(test_text)\n", " for ent in doc2.ents:\n", " print(ent.label_, ent.text)\n", "\n", "\n", "# if __name__ == '__main__':\n", "# plac.call(main)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Created blank 'en' model\n", "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. 
(Shape: (0, 0))\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00, 1.22s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 26.770396717498016}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00, 1.02s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 8.593518038099443}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.161424036550985}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 3.8918851538918418}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.30it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 2.01546711932046}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.31it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.000131435854561013}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.3692610842225425e-07}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.08it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.019683124967466954}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 2.078213820644416e-12}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.11it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.5424355623930257e-05}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.34855798227363266}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.2020330928745637e-21}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.23it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.1364459848434984e-19}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.01it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 5.07038899221475e-16}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 7.756965635961777e-18}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.21it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.682540175328388e-13}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.17it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.9982126736537605e-14}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.15it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 5.766438963914882e-17}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.25it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.4997379863434744e-20}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.4565571602945852e-16}\n", "Entities in 'Do you like horses?'\n", "ANIMAL horses\n" ] } ], "source": [ "# Run our Function\n", "main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Our model was able to recognize horses as ANIMAL" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { 
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }