{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Natural Language Processing With SpaCy\n", "![title](SpaCy_logo.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Training the Named Entity Recognizer\n", "##### Updating our NER\n", "+ Load the model\n", " + spacy.load('en')\n", " - Disable existing pipe line (nlp.disable_pipes)\n", " + spacy.blank('en')\n", " - Added Entity Recognizer to Pipeline\n", "+ Shuffle and loop over the examples\n", " - update the model (nlp.update)\n", "+ Save the trained model (nlp.to_disk)\n", "+ Test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load Packages\n", "from __future__ import unicode_literals, print_function\n", "\n", "import plac # wrapper over argparse\n", "import random\n", "from pathlib import Path\n", "import spacy\n", "from tqdm import tqdm # loading bar" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nlp1 = spacy.load('en')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx1 = nlp1(u\"Who was Kofi Annan?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for token in docx1.ents:\n", " print(token.text,token.start_char, token.end_char,token.label_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx2 = nlp1(u\"Who is Steve Jobs?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for token in docx2.ents:\n", " print(token.text,token.start_char, token.end_char,token.label_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "docx3 = nlp1(u\"Who is Shaka Khan?\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# training data\n", "TRAIN_DATA = [\n", " ('Who is Kofi Annan?', {\n", " 
'entities': [(7, 17, 'PERSON')]  # 'Kofi Annan' = chars 7-17 (0-based, end-exclusive)\n", " }),\n", " ('Who is Steve Jobs?', {\n", " 'entities': [(7, 17, 'PERSON')]\n", " }),\n", " ('I like London and Berlin.', {\n", " 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]\n", " })\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## plac is a wrapper over argparse: these annotations describe CLI options for\n", "## running this code as a script. The '@' is omitted because a decorator must be\n", "## followed by a function definition, and this cell has none; evaluating the\n", "## call on its own is valid and side-effect free.\n", "plac.annotations(\n", " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define our variables\n", "model = None\n", "output_dir=Path(\"C:\\\\Users\\\\This PC\\\\Documents\\\\JLabs\\\\JFlow\")\n", "n_iter=100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load the model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if model is not None:\n", " nlp = spacy.load(model) # load existing spaCy model\n", " print(\"Loaded model '%s'\" % model)\n", "else:\n", " nlp = spacy.blank('en') # create blank Language class\n", " print(\"Created blank 'en' model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Set Up the Pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create the built-in pipeline components and add them to the pipeline\n", " # nlp.create_pipe works for built-ins that are registered with spaCy\n", "if 'ner' not in nlp.pipe_names:\n", " ner = nlp.create_pipe('ner')\n", " nlp.add_pipe(ner, last=True)\n", "# otherwise, get it so we can add labels\n", "else:\n", " ner = nlp.get_pipe('ner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Train the Recognizer\n", "+ Add labels, Annotate them\n", "+ Pipes\n", "+ Begin_training()" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", " # add labels\n", "for _, annotations in TRAIN_DATA:\n", " for ent in annotations.get('entities'):\n", " ner.add_label(ent[2])\n", "\n", " # get names of other pipes to disable them during training\n", "other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", "with nlp.disable_pipes(*other_pipes): # only train NER\n", " optimizer = nlp.begin_training()\n", " for itn in range(n_iter):\n", " random.shuffle(TRAIN_DATA)\n", " losses = {}\n", " for text, annotations in tqdm(TRAIN_DATA):\n", " nlp.update(\n", " [text], # batch of texts\n", " [annotations], # batch of annotations\n", " drop=0.5, # dropout - make it harder to memorise data\n", " sgd=optimizer, # callable to update weights\n", " losses=losses)\n", " print(losses)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Test the trained model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# test the trained model\n", "for text, _ in TRAIN_DATA:\n", " doc = nlp(text)\n", " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Save the Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save model to output directory\n", "if output_dir is not None:\n", " output_dir = Path(output_dir)\n", " if not output_dir.exists():\n", " output_dir.mkdir()\n", " nlp.to_disk(output_dir)\n", " print(\"Saved model to\", output_dir)\n", "\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Test The Saved Model\n", "+ NB Output Directory" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# test the saved model\n", "print(\"Loading from\", output_dir)\n", "nlp2 = spacy.load(output_dir)\n", "for text, _ in 
TRAIN_DATA:\n", " doc = nlp2(text)\n", " print('Entities', [(ent.text, ent.label_) for ent in doc.ents])\n", " print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding Additional Entity Types\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Natural Language Processing With SpaCy\n", "![title](SpaCy_logo.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Training the Named Entity Recognizer (NER)\n", "##### Adding An Additional Entity (NER)\n", "+ Load the model\n", " + spacy.load('en')\n", " - Disable existing pipe line (nlp.disable_pipes)\n", " + spacy.blank('en')\n", " - Added Entity Recognizer to Pipeline\n", "+ Add a Label eg(ner.add_label(LABEL) & (nlp.begin_training())\n", "+ Shuffle and loop over the examples\n", " - update the model (nlp.update)\n", "+ Save the trained model (nlp.to_disk)\n", "+ Test" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "from __future__ import unicode_literals, print_function\n", "\n", "import plac\n", "import random\n", "from pathlib import Path\n", "import spacy\n", "from tqdm import tqdm  # progress bar; main() below wraps its training loop in tqdm()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "# new entity label\n", "LABEL = 'ANIMAL'" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "TRAIN_DATA = [\n", " (\"Horses are too tall and they pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"Do they bite?\", {\n", " 'entities': []\n", " }),\n", "\n", " (\"horses are too tall and they pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"horses pretend to care about your feelings\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " }),\n", "\n", " (\"they pretend to care about your feelings, those horses\", {\n", " 'entities': [(48, 54, 'ANIMAL')]\n", " }),\n", "\n", " 
(\"horses?\", {\n", " 'entities': [(0, 6, 'ANIMAL')]\n", " })\n", "]" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "\n", "@plac.annotations(\n", " model=(\"Model name. Defaults to blank 'en' model.\", \"option\", \"m\", str),\n", " new_model_name=(\"New model name for model meta.\", \"option\", \"nm\", str),\n", " output_dir=(\"Optional output directory\", \"option\", \"o\", Path),\n", " n_iter=(\"Number of training iterations\", \"option\", \"n\", int))\n", "\n", "\n", "def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):\n", " \"\"\"Set up the pipeline and entity recognizer, and train the new entity.\"\"\"\n", " if model is not None:\n", " nlp = spacy.load(model) # load existing spaCy model\n", " print(\"Loaded model '%s'\" % model)\n", " else:\n", " nlp = spacy.blank('en') # create blank Language class\n", " print(\"Created blank 'en' model\")\n", " # Add entity recognizer to model if it's not in the pipeline\n", " # nlp.create_pipe works for built-ins that are registered with spaCy\n", " if 'ner' not in nlp.pipe_names:\n", " ner = nlp.create_pipe('ner')\n", " nlp.add_pipe(ner)\n", " # otherwise, get it, so we can add labels to it\n", " else:\n", " ner = nlp.get_pipe('ner')\n", "\n", " ner.add_label(LABEL) # add new entity label to entity recognizer\n", " if model is None:\n", " optimizer = nlp.begin_training()\n", " else:\n", " # Note that 'begin_training' initializes the models, so it'll zero out\n", " # existing entity types.\n", " optimizer = nlp.entity.create_optimizer()\n", "\n", " # get names of other pipes to disable them during training\n", " other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']\n", " with nlp.disable_pipes(*other_pipes): # only train NER\n", " for itn in range(n_iter):\n", " random.shuffle(TRAIN_DATA)\n", " losses = {}\n", " for text, annotations in tqdm(TRAIN_DATA):\n", " nlp.update([text], [annotations], sgd=optimizer, drop=0.35,\n", " 
losses=losses)\n", " print(losses)\n", "\n", " # test the trained model\n", " test_text = 'Do you like horses?'\n", " doc = nlp(test_text)\n", " print(\"Entities in '%s'\" % test_text)\n", " for ent in doc.ents:\n", " print(ent.label_, ent.text)\n", "\n", " # save model to output directory\n", " if output_dir is not None:\n", " output_dir = Path(output_dir)\n", " if not output_dir.exists():\n", " output_dir.mkdir()\n", " nlp.meta['name'] = new_model_name # rename model\n", " nlp.to_disk(output_dir)\n", " print(\"Saved model to\", output_dir)\n", "\n", " # test the saved model\n", " print(\"Loading from\", output_dir)\n", " nlp2 = spacy.load(output_dir)\n", " doc2 = nlp2(test_text)\n", " for ent in doc2.ents:\n", " print(ent.label_, ent.text)\n", "\n", "\n", "# if __name__ == '__main__':\n", "# plac.call(main)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Created blank 'en' model\n", "Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. 
(Shape: (0, 0))\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00, 1.22s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 26.770396717498016}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00, 1.02s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 8.593518038099443}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.161424036550985}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 3.8918851538918418}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.30it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 2.01546711932046}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.31it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.000131435854561013}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.3692610842225425e-07}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.08it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.019683124967466954}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 2.078213820644416e-12}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.11it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.5424355623930257e-05}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 0.34855798227363266}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.2020330928745637e-21}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.23it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.1364459848434984e-19}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.01it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 5.07038899221475e-16}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.07it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 7.756965635961777e-18}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.21it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.682540175328388e-13}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.17it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.9982126736537605e-14}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00, 1.15it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 5.766438963914882e-17}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.25it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 4.4997379863434744e-20}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'ner': 1.4565571602945852e-16}\n", "Entities in 'Do you like horses?'\n", "ANIMAL horses\n" ] } ], "source": [ "# Run our Function\n", "main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Our model was able to recognize horses as ANIMAL" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { 
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }