-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
165 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
package gotube | ||
|
||
import ( | ||
"bytes" | ||
"errors" | ||
"net/url" | ||
"strconv" | ||
. "strings" | ||
) | ||
|
||
/* | ||
* Get the top k video id from the search result. | ||
* If k is larger than the number of search result, this function would return all the search result | ||
*/ | ||
func GetTopKVideoIds(keywords string, k int) ([]string, error) { | ||
num := 0 | ||
pageNum := 1 | ||
set := make(map[string]bool) | ||
var idList []string | ||
for num < k { | ||
//Get url of search result from #pageNum page | ||
searchUrl, err := GetSearchUrl(keywords, pageNum) | ||
if err != nil { | ||
return idList, err | ||
} | ||
//Get list of video id from current page | ||
idListOfPage, err := GetVideoIdsFromPage(searchUrl) | ||
if err != nil { | ||
return idList, err | ||
} | ||
//Add id from id list retrieved in current page to result until we already got top k or out of result | ||
idIdx := 0 | ||
for num < k && idIdx < len(idListOfPage) { | ||
_, ok := set[idListOfPage[idIdx]] | ||
if ok { //We have ran out of search results, it's repeating the last page | ||
MapToArray(set, &idList) | ||
return idList, err | ||
} else { //This id is new | ||
set[idListOfPage[idIdx]] = true | ||
} | ||
idIdx++ | ||
num++ | ||
} | ||
pageNum++ | ||
} | ||
MapToArray(set, &idList) | ||
return idList, nil | ||
} | ||
|
||
// GetSearchUrl builds the YouTube search url for keywords and the given
// result page.
//
// pageNum is 1-based. Page 1 produces a url without a page parameter; higher
// pages get "&page=<pageNum>" appended. A pageNum below 1 returns an empty
// url and an "invalid page number" error.
//
// Leading/trailing whitespace in keywords is dropped and inner runs of
// whitespace are collapsed to a single space before the query is escaped.
func GetSearchUrl(keywords string, pageNum int) (searchUrl string, err error) {
	// Validate first so no half-built url is handed back with an error.
	if pageNum < 1 {
		return "", errors.New("invalid page number")
	}
	// Normalize whitespace between the search terms.
	keywords = Join(Fields(keywords), " ")
	// QueryEscape already encodes a space as '+'. Replacing spaces with '+'
	// beforehand would get each '+' escaped to "%2B", i.e. a literal plus
	// sign inside the query instead of a term separator.
	searchUrl = "https://www.youtube.com/results?search_query=" + url.QueryEscape(keywords)
	if pageNum > 1 {
		searchUrl += "&page=" + strconv.Itoa(pageNum)
	}
	return searchUrl, nil
}
|
||
/* | ||
* Parse the http data of the page get from url and retrieve the id list | ||
*/ | ||
func GetVideoIdsFromPage(searchUrl string) (idList []string, err error) { | ||
//Get the http code of the page get from url | ||
body, err := GetHttpFromUrl(searchUrl) | ||
if err != nil { | ||
return | ||
} | ||
//Retrive id list | ||
idBeg := []byte("class=\"yt-lockup yt-lockup-tile yt-lockup-video vve-check clearfix yt-uix-tile\" data-context-item-id=\"") | ||
beg := 0 | ||
for { | ||
//Find the index of begin pattern | ||
offset := bytes.Index(body[beg:], idBeg) | ||
if offset < 0 { | ||
return | ||
} | ||
beg += offset + len(idBeg) | ||
//Find the index of closing parenthesis | ||
offset = bytes.Index(body[beg:], []byte("\"")) | ||
if offset < 0 { | ||
err = errors.New("unmatched parenthesis") | ||
return | ||
} | ||
end := beg + offset | ||
idList = append(idList, string(body[beg:end])) | ||
} | ||
return | ||
} | ||
|
||
// MapToArray appends every key of m to the slice that a points to.
//
// Existing elements of *a are kept. The keys are appended in map iteration
// order, which is not deterministic. An empty or nil map leaves *a untouched.
func MapToArray(m map[string]bool, a *[]string) {
	if len(m) == 0 {
		return
	}
	collected := *a
	for id := range m {
		collected = append(collected, id)
	}
	*a = collected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters