Created March 20, 2013
A CasperJS script that crawls a list of links, then scrapes the relevant content on each page and outputs it to a nicely formatted XML file. Sure beats database dumps/SQL manipulation, in my opinion.
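Assuming PhantomJS and CasperJS are installed, and the script below is saved as scrape.js (the filename is just for illustration), it can be run from the command line with:

casperjs scrape.js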
/*jshint strict:false*/
/*global CasperError console phantom require*/

/**
 * Grab links and push them into XML.
 */
var casper = require("casper").create();

var numberOfLinks = 0;
var currentLink = 0;
var links = [];
var buildPage, capture, selectLink, grabContent, writeContent;
var pageXML = '<channel>';
var fs = require('fs');
var filename = 'content.xml';
casper.start("https://www.washingtonindependentreviewofbooks.com/archives/features/", function() { | |
numberOfLinks = this.evaluate(function() { | |
return __utils__.findAll('.the_article a').length; | |
}); | |
this.echo(numberOfLinks + " items found"); | |
// cause jquery makes it easier | |
casper.page.injectJs('/PATH/TO/jquery.js'); | |
}); | |
// Capture every article link on the index page
capture = function() {
    links = this.evaluate(function() {
        var link = [];
        jQuery('.the_article a').each(function() {
            link.push(jQuery(this).attr('href'));
        });
        return link;
    });
    this.then(selectLink);
};
selectLink = function() {
    if (currentLink < numberOfLinks) {
        this.then(grabContent);
    } else {
        // every link has been processed; close the channel and
        // write the finished document out one last time
        pageXML += '</channel>';
        fs.write(filename, pageXML, 'w');
    }
};
grabContent = function() {
    var postTitle;
    var postID;
    var postContent;
    casper.open(links[currentLink]).then(function() {
        // these will eventually be mapped into XML nodes
        postTitle = this.fetchText('.post h2');
        postID = this.getElementAttribute('.post', 'id');
        postContent = this.evaluate(function() {
            // items on the scraped page that needed to be removed
            jQuery('.interactive_right').remove();
            jQuery('.shareinpost').remove();
            return jQuery('.entry').html();
        });
        this.echo('processing item ' + currentLink + ' out of ' + numberOfLinks + ' | ' + postTitle + ' | entry #' + postID);
        pageXML += '<row>' +
            '<postposition><![CDATA[' + currentLink + ']]></postposition>' +
            '<title><![CDATA[' + postTitle + ']]></title>' +
            '<postContent><![CDATA[' + postContent + ']]></postContent>' +
            '<postId><![CDATA[' + postID + ']]></postId>' +
            '</row>';
    });
    this.then(buildPage);
};
// write the XML accumulated so far, then move on to the next link
buildPage = function() {
    this.echo('writing to ' + filename);
    fs.write(filename, pageXML, 'w');
    currentLink++;
    this.then(selectLink);
};

casper.then(capture);
casper.run();
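For reference, given the string concatenation above, the generated content.xml should come out shaped roughly like this (titles and IDs here are illustrative placeholders, not real scraped values):

<channel>
  <row>
    <postposition><![CDATA[0]]></postposition>
    <title><![CDATA[Some Post Title]]></title>
    <postContent><![CDATA[<p>…entry markup…</p>]]></postContent>
    <postId><![CDATA[post-1234]]></postId>
  </row>
  <!-- one <row> per crawled link -->
</channel>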
@tdhz77 I would like to see your rewrite of this script; words are cheap.
An excellent example, neatly written and very helpful. Thanks @imjared
But the script prints the content of the first page every time, not the content of each article... why?
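A likely cause: grabContent calls casper.open() from inside a step and queues buildPage in the same tick, so depending on the CasperJS version the scrape callback can fire before navigation to links[currentLink] has completed, leaving `this` still pointed at the start page. Below is a sketch of one possible fix (not tested against this exact site): restructure grabContent around casper.thenOpen() so the scrape runs as its own step after the new page loads. The selectors are the ones from the script above, and it still assumes the article pages load their own jQuery:

grabContent = function() {
    // queue the navigation first...
    casper.thenOpen(links[currentLink]);
    // ...then queue the scrape as a separate step, which CasperJS
    // only runs once the opened page has finished loading
    casper.then(function() {
        var postTitle = this.fetchText('.post h2');
        var postID = this.getElementAttribute('.post', 'id');
        var postContent = this.evaluate(function() {
            jQuery('.interactive_right').remove();
            jQuery('.shareinpost').remove();
            return jQuery('.entry').html();
        });
        pageXML += '<row><postposition><![CDATA[' + currentLink + ']]></postposition>' +
            '<title><![CDATA[' + postTitle + ']]></title>' +
            '<postContent><![CDATA[' + postContent + ']]></postContent>' +
            '<postId><![CDATA[' + postID + ']]></postId></row>';
    });
    casper.then(buildPage);
};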