{ "cells": [ { "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "What are Stop words?\r\n", "\r\n", "Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore,\r\n", "both when indexing entries for searching and when retrieving them as the result of a search query. \r\n", "We would not want these words to take up space in our database, or taking up valuable processing time.\r\n", "For this, we can remove them easily, by storing a list of words that you consider to stop words. \r\n", "NLTK(Natural Language Toolkit) in python has a list of stopwords stored in 16 different languages.\r\n", "You can find them in the nltk_data directory.\r\n", "'''" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [ "from nltk.corpus import stopwords\r\n", "from nltk.tokenize import word_tokenize" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 3, "source": [ "example_sentence = \"Facebook is an American online social media and social networking service owned by Facebook, Inc.\"" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 4, "source": [ "stop_words = set(stopwords.words(\"english\"))" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 5, "source": [ "print(stop_words)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{'just', 'such', 'did', 'and', \"weren't\", 'out', 'his', \"won't\", 'nor', 'was', 'those', 'd', 'doesn', 'through', \"shouldn't\", 'shouldn', \"couldn't\", \"haven't\", 'be', 'myself', \"hadn't\", 'shan', 'yourself', 'couldn', 'didn', 'under', 'it', 'that', 'above', 'an', 'will', \"isn't\", 'ma', 'each', 'him', 'where', 'few', \"needn't\", 'been', 'we', \"shan't\", 'you', 'there', 'after', 'y', 'hadn', 'does', \"didn't\", 'am', 'he', \"mustn't\", 'hers', 'is', 'to', 'she', 'because', 'own', 'other', 'than', \"it's\", 'which', 'having', 'as', 'whom', 'wasn', 'all', 'up', 'won', 'i', \"should've\", 'our', 'aren', 'or', 'they', 'them', 'its', \"that'll\", \"you'd\", 'itself', 'were', 'any', \"wouldn't\", \"you'll\", 'in', 'about', 'doing', 'these', 'being', 'on', 'himself', 'why', 'with', 'wouldn', 'my', 'do', 'until', 'for', 'most', 'this', 'further', 'against', 'too', 'while', 'a', 'down', \"doesn't\", \"don't\", 'm', 'needn', 'over', 'then', 'has', 'their', 'ourselves', 'before', 'should', 'from', 'once', \"mightn't\", 'had', 'but', 'll', 'of', 'here', 're', 'themselves', 'below', 'off', 'weren', 'o', 'are', 'if', \"you've\", 'between', 'herself', 'isn', 's', 'some', 'not', 'mustn', 'now', 'during', 't', 've', 'into', 'what', 'theirs', 'so', 'how', 'ours', 'only', 'mightn', 'have', 'yours', \"wasn't\", 'her', 'more', 'when', 'at', 'don', \"aren't\", \"hasn't\", 'very', 'same', \"you're\", 'again', 'by', 'no', 'haven', 'me', 'both', 'yourselves', 'hasn', 'the', 'your', 'ain', 'who', \"she's\", 'can'}\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 6, "source": [ "words = word_tokenize(example_sentence)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 7, "source": [ "print(words)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'is', 'an', 'American', 'online', 'social', 'media', 'and', 'social', 'networking', 'service', 'owned', 'by', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 8, "source": [ "filtered_sentence 
= []\r\n", "filtered_sentence = [w for w in words if w not in stop_words]" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 9, "source": [ "print(filtered_sentence)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'American', 'online', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "'''----Additional Things----'''" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "#Adding Stop Words to Default NLTK Stop Word List\r\n", "''' READ:\r\n", "To add a word to NLTK stop words collection, first create an object from the stopwords.words('english') list. \r\n", "Next, use the append() method on the list to add any word to the list.\r\n", "The following script adds the word play to the NLTK stop word collection.\r\n", "Again, we remove all the words from our text variable to see if the word play is removed or not.\r\n", "'''" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 14, "source": [ "all_stopwords = stopwords.words('english')\r\n", "all_stopwords.append('online')\r\n", "\r\n", "text_tokens = word_tokenize(example_sentence)\r\n", "tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'American', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "#The output shows that the word 'online' has been removed.\r\n", "\r\n", "#You can also add a list of words to the stopwords.words list using the append method, as shown below:" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 16, "source": [ "sw_list = ['social','online']\r\n", "all_stopwords.extend(sw_list)\r\n", "\r\n", "text_tokens = word_tokenize(example_sentence)\r\n", "tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'American', 'media', 'networking', 'service', 'owned', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "##Removing Stop Words from Default NLTK Stop Word List\r\n", "\r\n", "'''\r\n", "Since stopwords.word('english') is merely a list of items, you can remove items from this list like any other list.\r\n", "The simplest way to do so is via the remove() method. 
\r\n", "This is helpful for when your application needs a stop word to not be removed.\r\n", "For example, you may need to keep the word \"is\" in a sentence to know when a statement is being negated.\r\n", "\r\n", "The following script removes the stop word \"is\" from the default list of stop words in NLTK:\r\n", "'''" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 18, "source": [ "all_stopwords = stopwords.words('english')\r\n", "all_stopwords.remove('is')\r\n", "\r\n", "text_tokens = word_tokenize(example_sentence)\r\n", "tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'is', 'American', 'online', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "##Using the SpaCy Library\r\n", "'''\r\n", "The SpaCy library in Python is yet another extremely useful language for natural language processing in Python.\r\n", "\r\n", "To install SpaCy, you have to execute the following script on your command terminal:\r\n", "'''" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "! pip install -U spacy\r\n" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "! python -m spacy download en\r\n" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "Once the library is downloaded, you also need to download the language model.\r\n", "Several models exist in SpaCy for different languages.\r\n", "We will be installing the English language model. 
{ "cell_type": "code", "execution_count": 21, "source": [ "import spacy\r\n", "sp = spacy.load('en_core_web_sm')\r\n", "\r\n", "all_stopwords = sp.Defaults.stop_words\r\n", "\r\n", "text = \"Facebook is an American online social media and social networking service owned by Facebook, Inc.\"\r\n", "text_tokens = word_tokenize(text)\r\n", "tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'American', 'online', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', 'Inc', '.']\n" ] } ], "metadata": {} },
{ "cell_type": "code", "execution_count": null, "source": [ "#Adding and Removing Stop Words in SpaCy's Default Stop Word List\r\n", "'''\r\n", "Like the other NLP libraries, you can also add or remove stop words from the default stop word list in SpaCy.\r\n", "But before that, let's look at all the existing stop words in SpaCy:\r\n", "'''" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 23, "source": [ "print(len(all_stopwords))\r\n", "print('-----')\r\n", "print(all_stopwords)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "326\n", "-----\n", "{'just', 'none', 'such', 'did', 'ever', 'since', 'thereupon', 'would', 'fifteen', '‘ve', 'seeming', 'formerly', 'and', '’s', 'ca', 'us', 'out', 'his', 'without', 'call', 'nor', 'those', 'was', 'nine', 'through', 'name', 'mine', 'hence', 'never', 'nowhere', 'something', 'therein', 'anyway', 'herein', 'be', 'many', 'meanwhile', \"n't\", '‘ll', 'anyhow', 'fifty', 'onto', 'myself', 'toward', 'serious', 'somewhere', 'quite', 'much', 'throughout', 'various', 'wherein', 'indeed', 'hereby', 'yourself', 'empty', 'it', 'seems', 'although', 'under', 'could', 'somehow', 'above', 'that', \"'ll\", 'across', 'an', 'will', 'bottom', 'last', 'each', 'everyone', 'him', 'four', 'take', 'where', 'part', 'third', 'few', 'whenever', 'whither', \"'ve\", 'forty', 'been', 'we', 'you', 'there', 'after', 'cannot', 'move', '’m', 'does', 'around', 'via', 'might', 'am', 'also', 'became', 'he', '’ve', 'enough', 'even', 'hers', 'is', 'to', 'must', 'she', 'eight', 'ten', 'two', 'show', 'wherever', 'because', 'own', 'n’t', \"'re\", '‘m', 'afterwards', 'other', 'than', 'as', 'thus', 'regarding', 'which', 'whom', \"'m\", 'often', 'all', 'twelve', 'up', '’d', 'give', 'another', 'i', 'our', 'get', 'or', 'still', 'next', 'them', 'they', 'n‘t', 'its', 'towards', 'noone', 'make', 'itself', 'were', '‘d', 'any', 'thereafter', 'nothing', '’ll', 'in', 'beyond', 'except', 'latterly', 'whereafter', 'about', 'doing', 'using', 'being', 'nobody', 'on', 'himself', 'really', 'these', 'whence', 'why', 'with', 'along', 'beforehand', 'full', 'sixty', '‘s', 'my', 'former', 'seem', 'whereupon', 'do', 'until', 'becoming', 'for', 'almost', 'most', 'further', 'this', 'whose', 'against', 'too', 'while', 'elsewhere', 'a', 'down', 'others', 'top', 'over', 'then', 'has', 'latter', 'their', 'ourselves', 'before', 'seemed', 'should', 'else', 'from', 'moreover', 'due', 'back', 'once', 'put', 'go', 'had', 'but', 'together', 'yet', 'of', '‘re', 'someone', 'alone', 'here', 're', 'themselves', 'below', 'nevertheless', 'hundred', 'may', 'off', 'perhaps', 'three', 'are', 'if', 'upon', '’re', 'between', 'several', \"'s\", 'herself', 'done', \"'d\", 'either', 'behind', 'five', 'some', 'twenty', 'amount', 'thereby', 'per', 'though', 'beside', 'not', 'now', 'used', 'whoever', 'first', 'during', 'however', 'therefore', 'whether', 'everything', 'besides', 'into', 'become', 'what', 'whereas', 'becomes', 'say', 'mostly', 'front', 'every', 'so', 'how', 'ours', 'only', 'anyone', 'have', 'yours', 'her', 'more', 'when', 'anything', 'well', 'among', 'at', 'see', 'very', 'same', 'always', 'within', 'everywhere', 'keep', 'thru', 'unless', 'please', 'no', 'again', 'by', 'neither', 'sometime', 'whatever', 'me', 'hereupon', 'whereby', 'hereafter', 'least', 'six', 'otherwise', 'sometimes', 'whole', 'both', 'yourselves', 'less', 'already', 'namely', 'made', 'one', 'rather', 'the', 'thence', 'amongst', 'anywhere', 'your', 'who', 'eleven', 'side', 'can'}\n" ] } ], "metadata": {} },
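{ "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "The SpaCy list (326 entries) is noticeably larger than NLTK's. A quick sketch for inspecting\r\n", "how the two lists differ (reusing all_stopwords from the cell above):\r\n", "'''" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": null, "source": [ "from nltk.corpus import stopwords\r\n", "\r\n", "nltk_stopwords = set(stopwords.words('english'))\r\n", "\r\n", "print(len(all_stopwords), len(nltk_stopwords))   # sizes of the SpaCy and NLTK lists\r\n", "print(sorted(nltk_stopwords - all_stopwords))    # words NLTK treats as stop words but SpaCy does not" ], "outputs": [], "metadata": {} },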
{ "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "Adding Stop Words to the Default SpaCy Stop Words List\r\n", "The SpaCy stop word list is basically a set of strings, so you can add a new word the way you would add any item to a set.\r\n", "\r\n", "Look at the following script, in which we add the word \"Inc\" to the existing set of stop words in SpaCy:\r\n", "'''" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 24, "source": [ "import spacy\r\n", "sp = spacy.load('en_core_web_sm')\r\n", "\r\n", "all_stopwords = sp.Defaults.stop_words\r\n", "all_stopwords.add(\"Inc\")\r\n", "\r\n", "text = \"Facebook is an American online social media and social networking service owned by Facebook, Inc.\"\r\n", "\r\n", "text_tokens = word_tokenize(text)\r\n", "tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'American', 'online', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', '.']\n" ] } ], "metadata": {} },
{ "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "Removing Stop Words from the Default SpaCy Stop Words List\r\n", "To remove a word from the set of stop words in SpaCy, pass it to the set's remove() method.\r\n", "\r\n", "The following script removes the word \"is\" from the set of stop words in SpaCy:\r\n", "'''" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": 25, "source": [ "import spacy\r\n", "sp = spacy.load('en_core_web_sm')\r\n", "\r\n", "all_stopwords = sp.Defaults.stop_words\r\n", "all_stopwords.remove(\"is\")\r\n", "\r\n", "text = \"Facebook is an American online social media and social networking service owned by Facebook, Inc.\"\r\n", "\r\n", "text_tokens = word_tokenize(text)\r\n", "tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]\r\n", "\r\n", "print(tokens_without_sw)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['Facebook', 'is', 'American', 'online', 'social', 'media', 'social', 'networking', 'service', 'owned', 'Facebook', ',', '.']\n" ] } ], "metadata": {} },
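{ "cell_type": "code", "execution_count": null, "source": [ "'''\r\n", "Note on the output above: 'Inc' is still filtered out even though the model was reloaded.\r\n", "sp.Defaults.stop_words is a class-level set shared by every model instance in the same process,\r\n", "so the \"Inc\" added two cells earlier persists across spacy.load() calls.\r\n", "A small check (assuming the cells above were run in order):\r\n", "'''" ], "outputs": [], "metadata": {} },
{ "cell_type": "code", "execution_count": null, "source": [ "import spacy\r\n", "\r\n", "# Reloading the model does not reset the shared default stop word set,\r\n", "# so this should print True if the \"Inc\" cell above was executed.\r\n", "print(\"Inc\" in spacy.load('en_core_web_sm').Defaults.stop_words)" ], "outputs": [], "metadata": {} },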
"execution_count": 26, "source": [ "my_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \\\r\n", " \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves'\\\r\n", " , 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', \\\r\n", " 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', \\\r\n", " 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", \\\r\n", " 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', \\\r\n", " 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', \\\r\n", " 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',\\\r\n", " 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', \\\r\n", " 'about', 'against', 'between', 'into', 'through', \\\r\n", " 'during', 'before', 'after', 'above', 'below',\\\r\n", " 'to', 'from', 'up', 'down', 'in', 'out', \\\r\n", " 'on', 'off', 'over', 'under', 'again',\\\r\n", " 'further', 'then', 'once', 'here', 'there',\\\r\n", " 'when', 'where', 'why', 'how', 'all', 'any',\\\r\n", " 'both', 'each', 'few', 'more', 'most', 'other',\\\r\n", " 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',\\\r\n", " 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \\\r\n", " \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren',\\\r\n", " \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", \\\r\n", " 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', \\\r\n", " 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", \\\r\n", " 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\r\n" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 27, "source": [ "def remove_mystopwords(sentence):\r\n", " tokens = sentence.split(\" \")\r\n", " tokens_filtered= [word for word in text_tokens if not word in my_stopwords]\r\n", " return (\" \").join(tokens_filtered)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 28, "source": [ "text = \"Facebook is an American online social media and social networking service owned by Facebook, Inc.\"\r\n", "filtered_text = remove_mystopwords(text)\r\n", "print(filtered_text)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Facebook American online social media social networking service owned Facebook , Inc .\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": null, "source": [], "outputs": [], "metadata": {} } ], "metadata": { "kernelspec": { "name": "python3", "display_name": "Python 3.6.10 64-bit ('tf-gpu': conda)" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" }, "interpreter": { "hash": "135e78ef6267b613ce7b86630936d470174b66187aad9f784a45e5cc3235687c" } }, "nbformat": 4, "nbformat_minor": 2 }