-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
95a7ff7
commit 8629a99
Showing
2 changed files
with
240 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
{ | ||
"cells": [ | ||
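{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# 5.2 Bag of Words\n", | ||
"\n", | ||
"Build a document-term count matrix from six example sentences with scikit-learn's `CountVectorizer` and show it as a pandas DataFrame." | ||
] | ||
}, | ||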
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from sklearn.feature_extraction.text import CountVectorizer" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Toy corpus: six sentences, one document per list element\n", | ||
"data = [\n", | ||
"    ' Most shark attacks occur about 10 feet from the beach since that is where the people are',\n", | ||
"    'the efficiency with which he paired the socks in the drawer was quite admirable',\n", | ||
"    'carol drank the blood as if she were a vampire',\n", | ||
"    'giving directions that the mountains are to the west only works when you can see them',\n", | ||
"    'the sign said there was road work ahead so he decided to speed up',\n", | ||
"    'the gruff old man sat in the back of the bait shop grumbling to himself as he scooped out a handful of worms',\n", | ||
"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Vectorizer built with library defaults: no arguments are overridden here\n", | ||
"countvec = CountVectorizer()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fit the vectorizer on `data` and encode the six sentences in the same call\n", | ||
"countvec_fit = countvec.fit_transform(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert the vectorizer output to a dense array (one row per sentence)\n", | ||
"# and label each column with its vocabulary term\n", | ||
"bag_of_words = pd.DataFrame(\n", | ||
"    countvec_fit.toarray(),\n", | ||
"    columns=countvec.get_feature_names_out(),\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" 10 about admirable ahead are as attacks back bait beach ... \\\n", | ||
"0 1 1 0 0 1 0 1 0 0 1 ... \n", | ||
"1 0 0 1 0 0 0 0 0 0 0 ... \n", | ||
"2 0 0 0 0 0 1 0 0 0 0 ... \n", | ||
"3 0 0 0 0 1 0 0 0 0 0 ... \n", | ||
"4 0 0 0 1 0 0 0 0 0 0 ... \n", | ||
"5 0 0 0 0 0 1 0 1 1 0 ... \n", | ||
"\n", | ||
" were west when where which with work works worms you \n", | ||
"0 0 0 0 1 0 0 0 0 0 0 \n", | ||
"1 0 0 0 0 1 1 0 0 0 0 \n", | ||
"2 1 0 0 0 0 0 0 0 0 0 \n", | ||
"3 0 1 1 0 0 0 0 1 0 1 \n", | ||
"4 0 0 0 0 0 0 1 0 0 0 \n", | ||
"5 0 0 0 0 0 0 0 0 1 0 \n", | ||
"\n", | ||
"[6 rows x 71 columns]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Plain-text dump of the frame; the saved output elides the middle of the 71 columns with '...'.\n", | ||
"# NOTE(review): ending the cell with a bare `bag_of_words` would use the notebook's rich table display instead.\n", | ||
"print(bag_of_words)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
{ | ||
"cells": [ | ||
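{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# 5.3 TF-IDF\n", | ||
"\n", | ||
"Compute TF-IDF weights for six example sentences with scikit-learn's `TfidfVectorizer` and show them as a pandas DataFrame." | ||
] | ||
}, | ||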
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from sklearn.feature_extraction.text import TfidfVectorizer" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Toy corpus: six sentences, one document per list element\n", | ||
"data = [\n", | ||
"    ' Most shark attacks occur about 10 feet from the beach since that is where the people are',\n", | ||
"    'the efficiency with which he paired the socks in the drawer was quite admirable',\n", | ||
"    'carol drank the blood as if she were a vampire',\n", | ||
"    'giving directions that the mountains are to the west only works when you can see them',\n", | ||
"    'the sign said there was road work ahead so he decided to speed up',\n", | ||
"    'the gruff old man sat in the back of the bait shop grumbling to himself as he scooped out a handful of worms',\n", | ||
"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Vectorizer built with library defaults: no arguments are overridden here\n", | ||
"tfidfvec = TfidfVectorizer()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fit the vectorizer on `data` and encode the six sentences in the same call\n", | ||
"tfidfvec_fit = tfidfvec.fit_transform(data)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert the vectorizer output to a dense array (one row per sentence)\n", | ||
"# and label each column with its vocabulary term\n", | ||
"tfidf_bag = pd.DataFrame(\n", | ||
"    tfidfvec_fit.toarray(),\n", | ||
"    columns=tfidfvec.get_feature_names_out(),\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" 10 about admirable ahead are as attacks \\\n", | ||
"0 0.257061 0.257061 0.000000 0.000000 0.210794 0.000000 0.257061 \n", | ||
"1 0.000000 0.000000 0.293641 0.000000 0.000000 0.000000 0.000000 \n", | ||
"2 0.000000 0.000000 0.000000 0.000000 0.000000 0.292313 0.000000 \n", | ||
"3 0.000000 0.000000 0.000000 0.000000 0.222257 0.000000 0.000000 \n", | ||
"4 0.000000 0.000000 0.000000 0.290766 0.000000 0.000000 0.000000 \n", | ||
"5 0.000000 0.000000 0.000000 0.000000 0.000000 0.178615 0.000000 \n", | ||
"\n", | ||
" back bait beach ... were west when where \\\n", | ||
"0 0.00000 0.00000 0.257061 ... 0.000000 0.00000 0.00000 0.257061 \n", | ||
"1 0.00000 0.00000 0.000000 ... 0.000000 0.00000 0.00000 0.000000 \n", | ||
"2 0.00000 0.00000 0.000000 ... 0.356474 0.00000 0.00000 0.000000 \n", | ||
"3 0.00000 0.00000 0.000000 ... 0.000000 0.27104 0.27104 0.000000 \n", | ||
"4 0.00000 0.00000 0.000000 ... 0.000000 0.00000 0.00000 0.000000 \n", | ||
"5 0.21782 0.21782 0.000000 ... 0.000000 0.00000 0.00000 0.000000 \n", | ||
"\n", | ||
" which with work works worms you \n", | ||
"0 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", | ||
"1 0.293641 0.293641 0.000000 0.00000 0.00000 0.00000 \n", | ||
"2 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", | ||
"3 0.000000 0.000000 0.000000 0.27104 0.00000 0.27104 \n", | ||
"4 0.000000 0.000000 0.290766 0.00000 0.00000 0.00000 \n", | ||
"5 0.000000 0.000000 0.000000 0.00000 0.21782 0.00000 \n", | ||
"\n", | ||
"[6 rows x 71 columns]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Plain-text dump of the frame; the saved output elides the middle of the 71 columns with '...'.\n", | ||
"# NOTE(review): ending the cell with a bare `tfidf_bag` would use the notebook's rich table display instead.\n", | ||
"print(tfidf_bag)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |