-
Notifications
You must be signed in to change notification settings - Fork 60
/
get_match_urls.R
189 lines (150 loc) · 6.15 KB
/
get_match_urls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#' Get FBref match URLs
#'
#' Returns the URL for each match played for a given league season
#' Replaces the deprecated get_match_urls
#'
#' @param country the three character country code
#' @param gender gender of competition, either "M" or "F", or both
#' @param season_end_year the year the season(s) concludes
#' @param tier the tier of the league, ie '1st' for the EPL or '2nd' for the Championship and so on
#' @param non_dom_league_url the URL for Cups and Competitions found at https://fbref.com/en/comps/
#' @param time_pause the wait time (in seconds) between page loads
#'
#' @return returns a character vector of all fbref match URLs for selected competition, season and gender
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' try({
#' fb_match_urls(country = "ENG", gender = "M", season_end_year = c(2019:2021), tier = "1st")
#' non_dom <- "https://fbref.com/en/comps/218/history/Friendlies-M-Seasons"
#' fb_match_urls(country = "", gender = "M", season_end_year = 2021, non_dom_league_url = non_dom)
#' })
#' }
fb_match_urls <- function(country, gender, season_end_year, tier = "1st", non_dom_league_url = NA, time_pause=3) {
main_url <- "https://fbref.com"
# .pkg_message("Scraping match URLs")
country_abbr <- country
gender_M_F <- gender
season_end_year_num <- season_end_year
comp_tier <- tier
cups_url <- non_dom_league_url
seasons <- read.csv("https://raw.githubusercontent.com/JaseZiv/worldfootballR_data/master/raw-data/all_leages_and_cups/all_competitions.csv", stringsAsFactors = F)
if(is.na(cups_url)) {
fixtures_url <- seasons %>%
dplyr::filter(stringr::str_detect(.data[["competition_type"]], "Leagues")) %>%
dplyr::filter(country %in% country_abbr,
gender %in% gender_M_F,
season_end_year %in% season_end_year_num,
tier %in% comp_tier,
!is.na(fixtures_url)) %>%
dplyr::arrange(season_end_year) %>%
dplyr::pull(fixtures_url) %>% unique()
} else {
fixtures_url <- seasons %>%
dplyr::filter(.data[["comp_url"]] %in% cups_url,
gender %in% gender_M_F,
season_end_year %in% season_end_year_num,
!is.na(fixtures_url)) %>%
dplyr::arrange(season_end_year) %>%
dplyr::pull(fixtures_url) %>% unique()
}
time_wait <- time_pause
get_each_seasons_urls <- function(fixture_url, time_pause=time_wait) {
# put sleep in as per new user agreement on FBref
Sys.sleep(time_pause)
match_report_urls <- .load_page(fixture_url) %>%
rvest::html_nodes("td.left~ .left+ .left a") %>%
rvest::html_attr("href") %>%
paste0(main_url, .) %>% unique()
return(match_report_urls)
}
all_seasons_match_urls <- fixtures_url %>%
purrr::map(get_each_seasons_urls) %>%
unlist()
history_index <- grep("-History", all_seasons_match_urls)
if(length(history_index) != 0) {
all_seasons_match_urls <- all_seasons_match_urls[-history_index]
}
# .pkg_message("Match URLs scrape completed")
return(all_seasons_match_urls)
}
#' Get match URLs
#'
#' Returns the URL for each match played for a given league season
#'
#' @param country the three character country code
#' @param gender gender of competition, either "M" or "F", or both
#' @param season_end_year the year the season(s) concludes
#' @param tier the tier of the league, ie '1st' for the EPL or '2nd' for the Championship and so on
#' @param non_dom_league_url the URL for Cups and Competitions found at https://fbref.com/en/comps/
#' @param time_pause the wait time (in seconds) between page loads
#'
#' @return returns a character vector of all fbref match URLs for selected competition, season and gender
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' try({
#' get_match_urls(country = "ENG", gender = "M", season_end_year = c(2019:2021), tier = "1st")
#' non_dom <- "https://fbref.com/en/comps/218/history/Friendlies-M-Seasons"
#' get_match_urls(country = "", gender = "M", season_end_year = 2021, non_dom_league_url = non_dom)
#' })
#' }
get_match_urls <- function(country, gender, season_end_year, tier = "1st", non_dom_league_url = NA, time_pause=3) {
.Deprecated("fb_match_urls")
main_url <- "https://fbref.com"
# .pkg_message("Scraping match URLs")
country_abbr <- country
gender_M_F <- gender
season_end_year_num <- season_end_year
comp_tier <- tier
cups_url <- non_dom_league_url
seasons <- read.csv("https://raw.githubusercontent.com/JaseZiv/worldfootballR_data/master/raw-data/all_leages_and_cups/all_competitions.csv", stringsAsFactors = F)
if(is.na(cups_url)) {
fixtures_url <- seasons %>%
dplyr::filter(stringr::str_detect(.data[["competition_type"]], "Leagues")) %>%
dplyr::filter(country %in% country_abbr,
gender %in% gender_M_F,
season_end_year %in% season_end_year_num,
tier %in% comp_tier,
!is.na(fixtures_url)) %>%
dplyr::arrange(season_end_year) %>%
dplyr::pull(fixtures_url) %>% unique()
} else {
fixtures_url <- seasons %>%
dplyr::filter(.data[["comp_url"]] %in% cups_url,
gender %in% gender_M_F,
season_end_year %in% season_end_year_num,
!is.na(fixtures_url)) %>%
dplyr::arrange(season_end_year) %>%
dplyr::pull(fixtures_url) %>% unique()
}
time_wait <- time_pause
get_each_seasons_urls <- function(fixture_url, time_pause=time_wait) {
# put sleep in as per new user agreement on FBref
Sys.sleep(time_pause)
match_report_urls <- xml2::read_html(fixture_url) %>%
rvest::html_nodes("td.left~ .left+ .left a") %>%
rvest::html_attr("href") %>%
paste0(main_url, .) %>% unique()
return(match_report_urls)
}
all_seasons_match_urls <- fixtures_url %>%
purrr::map(get_each_seasons_urls) %>%
unlist()
history_index <- grep("-History", all_seasons_match_urls)
if(length(history_index) != 0) {
all_seasons_match_urls <- all_seasons_match_urls[-history_index]
}
# .pkg_message("Match URLs scrape completed")
return(all_seasons_match_urls)
}