From 75e47cee33fb919c3d66fd6205f5e58f2bad3a23 Mon Sep 17 00:00:00 2001 From: Jay Taylor Date: Wed, 21 Mar 2018 23:16:38 -0700 Subject: [PATCH] Initial commit. --- .gitignore | 67 +++++++++++++++++++++++++++++ .travis.yml | 18 ++++++++ LICENSE | 21 ++++++++++ README.md | 25 +++++++++++ archiveis.go | 104 ++++++++++++++++++++++++++++++++++++++++++++++ archiveis_test.go | 27 ++++++++++++ 6 files changed, 262 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 archiveis.go create mode 100644 archiveis_test.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba30c6c --- /dev/null +++ b/.gitignore @@ -0,0 +1,67 @@ +## +# Vim swap/working files. +*.sw[opa] + +## +# SublimeText files. +*.sublime-project +*.sublime-workspace + +## +# IDEA IntelliJ files. +*.idea +*.iml + +## +# Visual Studio Code files. +.vscode + +## +# Mac OS-X miscellany. +.DS_Store +.AppleDouble +.LSOverride +._* + +## +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +## +# Compiled Object files, Static and Dynamic libs (Shared Objects). +*.o +*.a +*.so + +## +# Folders. +_obj +_test + +## +# Architecture specific extensions/prefixes. +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof + +## +# Certificate files. +id_[rd]sa +*.pem +*.crt +*.key diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..3502ed6 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,18 @@ +language: go + +go: + - tip + - "1.10" + - 1.9 + - 1.8 + - 1.7 + - 1.6 + - 1.5 + +script: + - go test ./... 
+ +notifications: + email: + on_success: change + on_failure: always diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fc4d5ab --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Jay Taylor + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..7f6f0bb --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# archiveis + +[![Documentation](https://godoc.org/github.com/jaytaylor/archiveis?status.svg)](https://godoc.org/github.com/jaytaylor/archiveis) +[![Build Status](https://travis-ci.org/jaytaylor/archiveis.svg?branch=master)](https://travis-ci.org/jaytaylor/archiveis) +[![Report Card](https://goreportcard.com/badge/github.com/jaytaylor/archiveis)](https://goreportcard.com/report/github.com/jaytaylor/archiveis) + +### About + +archiveis is a golang package for archiving web pages via [archive.is](https://archive.is). 
+ +Please be mindful and responsible and go easy on them, we want archive.is to last forever! + +Created by [Jay Taylor](https://jaytaylor.com/). + +### Requirements + +* Go version 1.5 or newer + +### Running the test suite + + go test ./... + +#### License + +Permissive MIT license, see the [LICENSE](LICENSE) file for more information. diff --git a/archiveis.go b/archiveis.go new file mode 100644 index 0000000..96de082 --- /dev/null +++ b/archiveis.go @@ -0,0 +1,104 @@ +package archiveis + +import ( + "bytes" + "errors" + "fmt" + "net/url" + "regexp" + "strings" + + "github.com/PuerkitoBio/goquery" + "github.com/gigawattio/errorlib" + "github.com/parnurzeal/gorequest" +) + +const ( + baseURL = "https://archive.is" + userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36" +) + +var jsLocationExpr = regexp.MustCompile(`document\.location\.replace\(["']([^"']+)`) + +// Capture archives the provided URL using the archive.is service. 
+//
+// It first obtains a fresh submit ID from the index page and then posts the
+// form to /submit/. The archive address is read from the JavaScript redirect
+// in the response when one is present, and is otherwise built from the "id"
+// form input. An error is returned if u is blank, a request fails or is
+// answered with a non-2xx status, or no archive ID is found in the response.
+func Capture(u string) (string, error) {
+	if strings.TrimSpace(u) == "" {
+		return "", errors.New("no URL provided for capture")
+	}
+	submitID, err := newSubmitID()
+	if err != nil {
+		return "", err
+	}
+
+	form := url.Values{"submitid": {submitID}, "url": {u}}.Encode()
+	resp, body, errs := newRequest().Post(baseURL+"/submit/").Send(form).Set("content-type", "application/x-www-form-urlencoded").EndBytes()
+	if err := errorlib.Merge(errs); err != nil {
+		return "", err
+	}
+	if resp.StatusCode/100 != 2 {
+		return "", fmt.Errorf("form submit received unhappy response status-code=%v", resp.StatusCode)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil {
+		return "", fmt.Errorf("constructing goquery doc from submission response: %s", err)
+	}
+
+	// The final address may arrive as a document.location.replace(...) call.
+	js := strings.TrimSpace(doc.Find("script").First().Text())
+	if match := jsLocationExpr.FindStringSubmatch(js); len(match) > 1 {
+		return match[1], nil
+	}
+
+	// goquery selections are never nil; an unmatched query is detected via Length.
+	input := doc.Find("input[name=id]").First()
+	if input.Length() == 0 {
+		return "", errors.New("page archive ID not found in submission response content")
+	}
+	id, exists := input.Attr("value")
+	if !exists || id == "" {
+		return "", errors.New("no page archive ID value available")
+	}
+	return baseURL + "/" + id, nil
+}
+
+// newSubmitID gets the index page and extracts the form submission identifier.
+func newSubmitID() (string, error) {
+	resp, body, errs := newRequest().Get(baseURL).EndBytes()
+	if err := errorlib.Merge(errs); err != nil {
+		return "", err
+	}
+	if resp.StatusCode/100 != 2 {
+		return "", fmt.Errorf("index retrieval received unhappy response status-code=%v", resp.StatusCode)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
+	if err != nil {
+		return "", fmt.Errorf("constructing goquery doc from index: %s", err)
+	}
+	// goquery selections are never nil; an unmatched query is detected via Length.
+	input := doc.Find("input[name=submitid]").First()
+	if input.Length() == 0 {
+		return "", errors.New("no submitid element found")
+	}
+	id, exists := input.Attr("value")
+	if !exists || id == "" {
+		return "", errors.New("no submitid value available")
+	}
+	return id, nil
+}
+
+// newRequest builds a gorequest agent preset with the headers sent on every call.
+func newRequest() *gorequest.SuperAgent {
+	return gorequest.New().
+		Set("host", strings.Split(baseURL, "://")[1]).
+		Set("user-agent", userAgent).
+		Set("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8").
+		Set("referer", baseURL+"/")
+}
diff --git a/archiveis_test.go b/archiveis_test.go
new file mode 100644
index 0000000..44301a9
--- /dev/null
+++ b/archiveis_test.go
@@ -0,0 +1,27 @@
+package archiveis
+
+import (
+	"fmt"
+	"testing"
+	"time"
+)
+
+const page = "https://yro.slashdot.org/story/18/03/21/2112247/russia-secretly-helped-venezuela-launch-a-cryptocurrency-to-evade-us-sanctions#comments"
+
+func TestCapture1(t *testing.T) {
+	// Link which has been submitted before.
+	url, err := Capture(page)
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("Resolved URL=%q", url)
+}
+
+func TestCapture2(t *testing.T) {
+	// Link which has likely not been submitted before.
+	url, err := Capture(fmt.Sprintf("%v?%v", page, time.Now().Unix()))
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("Resolved URL=%q", url)
+}