-
Notifications
You must be signed in to change notification settings - Fork 1
/
webpage_wordcount.py
76 lines (65 loc) · 1.64 KB
/
webpage_wordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from collections import Counter

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import requests
# Word-frequency and link-count report for a single web page.
#
# The script downloads URL, counts whitespace-separated tokens in the page
# text, counts the <a> tags that carry an href attribute, and shows four
# matplotlib figures: a pie and a bar chart of the TOP_N most frequent
# tokens, a scatter of every token seen at least MIN_COUNT times, and a
# stack plot whose peak is the number of links.

URL = 'https://en.wikipedia.org/wiki/Machine_learning'
REQUEST_TIMEOUT = 30  # seconds; requests.get() waits forever without one
TOP_N = 20            # how many of the most frequent tokens to chart
MIN_COUNT = 3         # lowest count that is still included in the scatter


def fetch_soup(url: str, timeout: float = REQUEST_TIMEOUT) -> "BeautifulSoup":
    """Download *url* and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (raised by ``raise_for_status``), so an
            error page is never analysed as if it were the article.
    """
    webpage = requests.get(url, timeout=timeout)
    webpage.raise_for_status()
    return BeautifulSoup(webpage.text, features="lxml")


def find_links(soup) -> list:
    """Return every ``<a>`` tag in *soup* that has an ``href`` attribute.

    The attribute itself is tested; a substring search over the rendered
    tag would also match anchors that merely contain the text "href".
    """
    return soup.find_all('a', href=True)


def count_words(text: str) -> Counter:
    """Return how often each whitespace-separated token occurs in *text*.

    Tokens are used as-is: no case folding and no punctuation stripping.
    """
    return Counter(text.split())


def rank_words(word_count) -> list:
    """Return ``(count, word)`` tuples, highest count first.

    Ties on count are ordered by word, descending, because whole tuples
    are compared.
    """
    return sorted(
        ((num, word) for word, num in word_count.items()),
        reverse=True,
    )


def frequency_table(ranked) -> "pd.DataFrame":
    """Return *ranked* as a DataFrame with ``count`` and ``word`` columns.

    An empty *ranked* yields an empty table that still has both columns.
    """
    return pd.DataFrame(ranked, columns=["count", "word"])


def at_least(table: "pd.DataFrame", min_count: int = MIN_COUNT) -> "pd.DataFrame":
    """Return the rows of *table* whose count is ``>= min_count``.

    A boolean filter is used so the result is simply empty when no row
    qualifies; it does not depend on some word having exactly
    ``min_count`` occurrences.
    """
    return table[table["count"] >= min_count]


def plot_top_words(words: list, counts: list) -> None:
    """Show the most frequent words as a pie chart, then as a bar chart."""
    plt.title("Top 20 words")
    plt.pie(counts, labels=words)
    plt.show()

    plt.title("Top 20 word")
    plt.bar(words, counts)
    plt.xlabel("Words")
    plt.ylabel("Occurance")
    plt.xticks(rotation=90)
    plt.show()


def plot_frequent_words(words: list, counts: list) -> None:
    """Show a scatter plot of the words that met the MIN_COUNT threshold."""
    # The title says "above", but rows with a count equal to the threshold
    # are included as well.
    plt.title("Words above count 3")
    plt.scatter(words, counts)
    plt.xlabel("words")
    plt.ylabel("Occurance")
    plt.xticks(rotation=90)
    plt.show()


def plot_link_count(link_total: int) -> None:
    """Show a stack plot that peaks at *link_total* over x = 2."""
    plt.title("No of Links in the webpage")
    plt.xlabel("Links on webpage")
    y = [0, link_total, 1]
    x = [1.9, 2, 2.1]
    plt.stackplot(x, y, colors=['brown'])
    plt.xlim(1, 3)
    plt.show()


def main() -> None:
    """Fetch URL, print the link and word totals, and show all charts."""
    soup = fetch_soup(URL)

    final_links = find_links(soup)
    print("NO of links are ", len(final_links))

    ranked = rank_words(count_words(soup.get_text()))
    # Each distinct token is one entry, so this is a count of unique words.
    print(f"There are {len(ranked)} unique words in this webpage")

    table = frequency_table(ranked)
    if not table.empty:  # nothing to chart for a page without text
        top = table.head(TOP_N)
        plot_top_words(top["word"].tolist(), top["count"].tolist())

        frequent = at_least(table, MIN_COUNT)
        plot_frequent_words(
            frequent["word"].tolist(),
            frequent["count"].tolist(),
        )

    plot_link_count(len(final_links))


if __name__ == "__main__":
    main()