Skip to content

Instantly share code, notes, and snippets.

@imjared
Created March 20, 2013 00:33
Show Gist options
  • Save imjared/5201405 to your computer and use it in GitHub Desktop.
Save imjared/5201405 to your computer and use it in GitHub Desktop.
A CasperJS script that crawled a list of links on a page, then scraped the relevant content from each linked page and output it to a nicely formatted XML file. Sure beats database dumps/SQL manipulation, in my opinion.
/*jshint strict:false*/
/*global CasperError console phantom require*/
/**
* grab links and push them into xml
*/
// ---- Shared crawler state, read and written by the step functions below ----

// CasperJS instance driving the crawl; created with an empty options object.
var casper = require("casper").create({});

// File-system module used to write the generated XML to disk.
var fs = require('fs');

// Name of the file the XML is written to.
var filename = 'content.xml';

// Hrefs gathered from the index page, and the number of links counted there.
var links = [];
var numberOfLinks = 0;

// Index into `links` of the article currently being processed.
var currentLink = 0;

// XML built up so far; one <row> is appended per scraped article.
var pageXML = '<channel>';

// Step functions, assigned further down (writeContent is declared but never
// assigned in this script).
var buildPage, capture, selectLink, grabContent, writeContent;
// First step: open the features archive, count the article links on it and
// inject jQuery into that page for the in-page code that relies on it.
casper.start("https://www.washingtonindependentreviewofbooks.com/archives/features/", function() {
    numberOfLinks = this.evaluate(function() {
        return __utils__.findAll('.the_article a').length;
    });
    this.echo(numberOfLinks + " items found");
    // cause jquery makes it easier
    // injectJs() reports failure through its return value only, so surface it
    // instead of letting later in-page jQuery calls fail without explanation.
    if (!this.page.injectJs('/PATH/TO/jquery.js')) {
        this.echo('could not inject jQuery from /PATH/TO/jquery.js', 'WARNING');
    }
});
// Capture links
/**
 * Step: collect the href attribute of every article link on the currently
 * loaded page into the shared `links` array, then schedule selectLink.
 *
 * Uses CasperJS' own __utils__.findAll (as the link count at start does) so
 * the step does not depend on jQuery having been injected successfully.
 */
capture = function() {
    var hrefs = this.evaluate(function() {
        var anchors = __utils__.findAll('.the_article a');
        // map.call works whether findAll hands back a NodeList or an array.
        return Array.prototype.map.call(anchors, function(anchor) {
            return anchor.getAttribute('href');
        });
    });
    // If the in-page function produced nothing usable, keep `links` an array
    // so later indexing into it cannot throw.
    links = hrefs || [];
    // Keep the loop bound in step with the list that was actually captured.
    numberOfLinks = links.length;
    this.then(selectLink);
};
/**
 * Step: decide whether another article remains to be scraped.
 *
 * While currentLink is below numberOfLinks, schedules grabContent. Once every
 * link has been handled, closes the <channel> element and writes the finished
 * document to `filename`.
 */
selectLink = function() {
    if (currentLink < numberOfLinks) {
        this.then(grabContent);
        return;
    }
    pageXML += '</channel>';
    // buildPage last wrote the file before the closing tag was appended, so
    // write once more here; otherwise the file on disk never gets </channel>.
    fs.write(filename, pageXML, 'w');
};
/**
 * Step: open the article at links[currentLink], scrape its title, id and body
 * markup, append them to pageXML as one <row>, then schedule buildPage.
 */
grabContent = function() {
    // Wrap a value in a CDATA section. A CDATA section ends at the first
    // "]]>", so any occurrence inside the value is split across two sections
    // to keep the generated XML well-formed.
    var cdata = function(value) {
        return '<![CDATA[' + String(value).split(']]>').join(']]]]><![CDATA[>') + ']]>';
    };

    casper.open(links[currentLink]).then(function() {
        // these will eventually be mapped into XML nodes
        var postTitle = this.fetchText('.post h2');
        var postID = this.getElementAttribute('.post', 'id');
        // Plain DOM calls: jQuery was only injected into the index page at
        // start, so it cannot be relied on in the article page's context.
        var postContent = this.evaluate(function() {
            // items on the scraped page that needed to be removed
            var unwanted = document.querySelectorAll('.interactive_right, .shareinpost');
            for (var i = 0; i < unwanted.length; i++) {
                unwanted[i].parentNode.removeChild(unwanted[i]);
            }
            var entry = document.querySelector('.entry');
            return entry ? entry.innerHTML : null;
        });
        this.echo( 'processing item ' + currentLink + ' out of ' + numberOfLinks + ' | ' + postTitle + ' | entry #' + postID );
        pageXML += '<row>' +
            '<postposition>' + cdata(currentLink) + '</postposition>' +
            '<title>' + cdata(postTitle) + '</title>' +
            '<postContent>' + cdata(postContent) + '</postContent>' +
            '<postId>' + cdata(postID) + '</postId>' +
            '</row>';
    });
    this.then(buildPage);
};
/**
 * Step: write the XML accumulated so far to `filename`, advance to the next
 * link and schedule selectLink again.
 */
buildPage = function() {
    var step = this;
    step.echo('writing to ' + filename);
    // 'w' mode: the file is rewritten with the full accumulated XML each time.
    fs.write(filename, pageXML, 'w');
    currentLink += 1;
    step.then(selectLink);
};
// Register link capture as the step after the initial page load, then start
// executing the queued steps.
casper
    .then(capture)
    .run();
@andreaderrico2
Copy link

But the script prints the content of the first page every time instead of the content of each article... why?

@darrylhebbes
Copy link

@tdhz77 I would like to see your re-write of this script, words are cheap.

@gauravkakkar87
Copy link

An excellent example, neatly written and very helpful. Thanks @imjared

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment