-
Notifications
You must be signed in to change notification settings - Fork 0
/
R_WebCrawler
152 lines (132 loc) · 7.77 KB
/
R_WebCrawler
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# start stopwatch ----
# Record the wall-clock start time; the end of the script subtracts it from the
# finish time and prints the total run time.
# Experience suggests 25 years of data for all teams takes about 5 hours to run.
rts_startTime = Sys.time()
# import libraries ----
library(rvest)
library(stringr)
# looping arrays
# Seasons to scrape, newest first (2019 down to 1990).
rarr_year = seq(2019, 1990, by = -1)
# Team names exactly as they appear in the Wikipedia season-page URLs,
# including the former names of relocated franchises.
rarr_team = c(
  # AFC East
  "New_England_Patriots", "Buffalo_Bills", "Miami_Dolphins", "New_York_Jets",
  # AFC North
  "Baltimore_Ravens", "Cincinnati_Bengals", "Cleveland_Browns", "Pittsburgh_Steelers",
  # AFC South
  "Houston_Texans", "Indianapolis_Colts", "Jacksonville_Jaguars", "Tennessee_Titans",
  "Houston_Oilers",
  # AFC West
  "Denver_Broncos", "Kansas_City_Chiefs", "Los_Angeles_Chargers", "Oakland_Raiders",
  "San_Diego_Chargers",
  # NFC East
  "Dallas_Cowboys", "New_York_Giants", "Philadelphia_Eagles", "Washington_Redskins",
  # NFC North
  "Chicago_Bears", "Detroit_Lions", "Green_Bay_Packers", "Minnesota_Vikings",
  # NFC South
  "Atlanta_Falcons", "Carolina_Panthers", "New_Orleans_Saints", "Tampa_Bay_Buccaneers",
  # NFC West
  "Arizona_Cardinals", "Los_Angeles_Rams", "San_Francisco_49ers", "Seattle_Seahawks",
  "St._Louis_Rams"
)
# functions for parsing out elements ----
# Split a section heading out of the table entry it is fused to.
#
# The first element of rfxparam_BaseArray that contains rfxparam_Value
# (matched case-insensitively, as a regular expression) has the heading text
# removed, and the heading itself is inserted as a new element right after it.
#
# rfxparam_BaseArray: character vector of table entries.
# rfxparam_Value:     heading to look for, e.g. "head coaches".
#
# Returns an unnamed list:
#   [[1]] the updated character vector (one element longer than the input),
#   [[2]] the index of the entry the heading was removed from; the inserted
#         heading sits at that index + 1.
#
# Raises an error when no element contains the heading, instead of silently
# returning an infinite index.
rfx_ParseOutHeading = function(rfxparam_BaseArray,rfxparam_Value){
  rfxarr_Matches = grep(rfxparam_Value,rfxparam_BaseArray,ignore.case = TRUE)
  if(length(rfxarr_Matches) == 0){
    stop("heading '",rfxparam_Value,"' not found in table entries",call. = FALSE)
  }
  # grep() returns ascending indices, so the first one is the earliest match
  rfxndx_Value = rfxarr_Matches[1]
  # strip the heading but keep the original capitalisation of the rest of the entry
  rfxparam_BaseArray[rfxndx_Value] = gsub(rfxparam_Value,"",rfxparam_BaseArray[rfxndx_Value],ignore.case = TRUE)
  rfxparam_BaseArray = append(rfxparam_BaseArray,rfxparam_Value,after = rfxndx_Value)
  list(rfxparam_BaseArray,rfxndx_Value)
}
# Extend a vector of heading labels with one section's worth of copies.
#
# rfxparam_BaseArray:  labels collected so far (may be empty).
# rfxparam_Value:      heading label to repeat.
# rfxparam_startIndex: index where the section starts.
# rfxparam_endIndex:   index where the next section starts.
#
# Returns rfxparam_BaseArray followed by (endIndex - startIndex) copies of
# rfxparam_Value.
rfx_AttachHeadings = function(rfxparam_BaseArray,rfxparam_Value,rfxparam_startIndex,rfxparam_endIndex){
  rfxvar_SectionLength = rfxparam_endIndex - rfxparam_startIndex
  rfxarr_Labels = rep(rfxparam_Value,times = rfxvar_SectionLength)
  c(rfxparam_BaseArray,rfxarr_Labels)
}
# arrays / dataframe for compiling elements ----
# Every per-page column starts out empty; binding the empty columns together
# gives empty accumulators that the main loop grows with rbind().
rarr_Position = rarr_Person = rarr_Headings = NULL
rarr_CurrentYear = rarr_CurrentTeam = NULL
rdf_ElementHeadingTeamYear = cbind(rarr_Position,rarr_Person,rarr_Headings,rarr_CurrentYear,rarr_CurrentTeam)
rdf_Errors = cbind(rarr_CurrentYear,rarr_CurrentTeam)
# looping parameters ----
# Loop bounds come from the year and team vectors; both the year index and the
# running page counter start at 1.
rndx_Year_Length = length(rarr_year)
rndx_Team_Length = length(rarr_team)
rndx_Year = rndx_Counter = 1
# for each year ----
# Walk every year/team combination, download the matching Wikipedia season page
# once, locate the staff table (the one whose first entry is "front office"),
# split it into heading sections and append its rows to
# rdf_ElementHeadingTeamYear. Every combination that cannot be fetched or parsed
# adds one row (year, team, reason) to rdf_Errors.
while(rndx_Year <= rndx_Year_Length){
  rndx_Team = 1
  # for each team ----
  while(rndx_Team <= rndx_Team_Length){
    rbin_tableIndex = 0
    rndx_tableIndex = 1
    # reset per-page state: otherwise a failed download would re-parse the
    # previous page's table and file it under the current team and year
    rarr_elements_3 = NULL
    rndx_FrontOffice = NA
    rvar_errorMessage = NULL
    rvar_url = paste('https://en.wikipedia.org/wiki/',rarr_year[rndx_Year],'_',rarr_team[rndx_Team],'_season',sep="")
    # download the page once; it does not change between table lookups
    robj_webpage = tryCatch(read_html(rvar_url),error = function(err) err)
    if(inherits(robj_webpage,"error")){
      rvar_errorMessage = conditionMessage(robj_webpage)
      robj_webpage = NULL
    }
    # look through the various tables in the wiki page (at most 100) to see if any of them has what we're looking for
    # stop at the first one that does
    while(!is.null(robj_webpage) && rndx_tableIndex <= 100 && rbin_tableIndex == 0){
      rvar_xPath = paste('/html/body/div[3]/div[3]/div[4]/div/table[',rndx_tableIndex,']',sep="")
      # a table that cannot be read is treated as "not the staff table" rather than aborting the run
      rarr_candidate = tryCatch({
        robj_rankDataHTML = html_nodes(robj_webpage,xpath = rvar_xPath)
        robj_rankData = html_text(robj_rankDataHTML)
        # getting rid of all newline entries in the table: trim, then split on
        # runs of five, two and one newline in turn
        rarr_elements_1 = unlist(strsplit(gsub("(^\\s+)|(\\s+$)", "",robj_rankData),"\n\n\n\n\n"))
        rarr_elements_2 = unlist(strsplit(gsub("(^\\s+)|(\\s+$)", "",rarr_elements_1),"\n\n"))
        rarr_split = unlist(strsplit(gsub("(^\\s+)|(\\s+$)", "",rarr_elements_2),"\n"))
        # drop the first entry (presumably the table title - TODO confirm)
        rarr_split[-1]
      },error = function(err) NULL)
      # in particular we're looking for the term "front office"
      # this is included almost universally in the tables we look at, and is always the first entry
      # isTRUE() makes a missing match (NA) a plain "no" instead of an error
      if(isTRUE(match("front office",tolower(rarr_candidate)) == 1)){
        rarr_elements_3 = rarr_candidate
        rndx_FrontOffice = 1
        rbin_tableIndex = 1
      }
      rndx_tableIndex = rndx_tableIndex + 1
    }
    if(rbin_tableIndex == 1){
      # now parse out the rest of the headings: head coaches, offensive coaches, defensive coaches, special teams coaches, strength and conditioning
      # "defensive coaches" is looked up as a stand-alone entry with match() rather than split out of a fused entry like the others
      # in any case, parse these out and remember their index in the list
      # the block evaluates to NULL on success and to the error message on failure
      rvar_errorMessage = tryCatch({
        rlist_ParsedOutHeadings = rfx_ParseOutHeading(rarr_elements_3,"head coaches")
        rarr_elements_3 = rlist_ParsedOutHeadings[[1]]
        rndx_HeadCoaches = rlist_ParsedOutHeadings[[2]] + 1
        rlist_ParsedOutHeadings = rfx_ParseOutHeading(rarr_elements_3,"offensive coaches")
        rarr_elements_3 = rlist_ParsedOutHeadings[[1]]
        rndx_OffensiveCoaches = rlist_ParsedOutHeadings[[2]] + 1
        rndx_DefensiveCoaches = match("defensive coaches",tolower(rarr_elements_3))
        rlist_ParsedOutHeadings = rfx_ParseOutHeading(rarr_elements_3,"special teams coaches")
        rarr_elements_3 = rlist_ParsedOutHeadings[[1]]
        rndx_SpecialTeamsCoaches = rlist_ParsedOutHeadings[[2]] + 1
        rlist_ParsedOutHeadings = rfx_ParseOutHeading(rarr_elements_3,"strength and conditioning")
        rarr_elements_3 = rlist_ParsedOutHeadings[[1]]
        rndx_StrengthAndConditioningCoaches = rlist_ParsedOutHeadings[[2]] + 1
        # label every entry with the heading of the section it falls in
        rarr_Headings = c()
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"front office",rndx_FrontOffice,rndx_HeadCoaches)
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"head coaches",rndx_HeadCoaches,rndx_OffensiveCoaches)
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"offensive coaches",rndx_OffensiveCoaches,rndx_DefensiveCoaches)
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"defensive coaches",rndx_DefensiveCoaches,rndx_SpecialTeamsCoaches)
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"special teams coaches",rndx_SpecialTeamsCoaches,rndx_StrengthAndConditioningCoaches)
        rarr_Headings = rfx_AttachHeadings(rarr_Headings,"strength and conditioning",rndx_StrengthAndConditioningCoaches,length(rarr_elements_3)+1)
        # attach current year, current team, the name of the individual, and the name of his/her position
        rarr_CurrentYear = rep(rarr_year[rndx_Year],times = length(rarr_Headings))
        rarr_CurrentTeam = rep(rarr_team[rndx_Team],times = length(rarr_Headings))
        # entries look like "<position> \u2013 <person>"; split once and reuse the pieces
        rlist_PositionPerson = strsplit(rarr_elements_3, split=' \u2013 ', fixed=TRUE)
        rarr_Position = vapply(rlist_PositionPerson, `[`, character(1), 1)
        rarr_Person = vapply(rlist_PositionPerson, `[`, character(1), 2)
        # compile a table out of all of these arrays, drop the heading rows themselves, append the rest to the master table
        rdf_ElementHeading = cbind(rarr_Position,rarr_Person,rarr_Headings,rarr_CurrentYear,rarr_CurrentTeam)
        rdf_ElementHeading = rdf_ElementHeading[-c(rndx_FrontOffice,rndx_HeadCoaches,rndx_OffensiveCoaches,rndx_DefensiveCoaches,rndx_SpecialTeamsCoaches,rndx_StrengthAndConditioningCoaches),,drop = FALSE]
        rdf_ElementHeadingTeamYear = rbind(rdf_ElementHeadingTeamYear,rdf_ElementHeading)
        NULL
      },error = function(err) conditionMessage(err))
    } else if(is.null(rvar_errorMessage)){
      rvar_errorMessage = "no table starting with 'front office' found"
    }
    # record the failure here, at script level: an assignment made inside an
    # error-handler function would only change a local copy of rdf_Errors
    if(!is.null(rvar_errorMessage)){
      rdf_Errors = rbind(rdf_Errors,cbind(rarr_CurrentYear = rarr_year[rndx_Year],rarr_CurrentTeam = rarr_team[rndx_Team],rarr_Reason = rvar_errorMessage))
    }
    # progress output: pages processed so far, then current team and year index
    print(rndx_Counter)
    print(rndx_Team)
    print(rndx_Year)
    rndx_Team = rndx_Team + 1
    rndx_Counter = rndx_Counter + 1
  }
  rndx_Year = rndx_Year + 1
}
# stop stopwatch
# Take the finish time and print how long the whole run took.
rts_endTime = Sys.time()
print(difftime(rts_endTime,rts_startTime))
# spit this guy out into a text file
# Both outputs are CSV-formatted despite the .txt extension: the scraped staff
# rows go to graphTable.txt, the failed team/year combinations to errorLog.txt.
write.csv(x = rdf_ElementHeadingTeamYear,file = "graphTable.txt")
write.csv(x = rdf_Errors,file = "errorLog.txt")