Skip to content

Commit

Permalink
Add hinet news crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Dec 15, 2010
1 parent 0fb0053 commit af1bf24
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions loso/crawlers/hinet_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import urllib2
import cStringIO as StringIO

from lxml import etree

# RSS feeds for each Hinet news category we crawl.
# NOTE: the scraped source had every scheme garbled as 'http:https://'
# (a link-rewriting artifact); restored to plain 'http://'.
rss_urls = [
    'http://times.hinet.net/times/rss.do?option=entertainment',
    'http://times.hinet.net/times/rss.do?option=sport',
    'http://times.hinet.net/times/rss.do?option=society',
    'http://times.hinet.net/times/rss.do?option=infotech',
    'http://times.hinet.net/times/rss.do?option=politics',
    'http://times.hinet.net/times/rss.do?option=mainland',
    'http://times.hinet.net/times/rss.do?option=finance',
    'http://times.hinet.net/times/rss.do?option=internationality',
    'http://times.hinet.net/times/rss.do?option=weather',
]

def parseHtml(html):
    """Parse an HTML byte string (assumed utf8 — TODO confirm) into an
    lxml element tree and return it."""
    return etree.parse(StringIO.StringIO(html),
                       etree.HTMLParser(encoding='utf8'))

def parseXml(xml):
    """Parse an XML byte string (assumed utf8 — TODO confirm) into an
    lxml element tree and return it."""
    return etree.parse(StringIO.StringIO(xml),
                       etree.XMLParser(encoding='utf8'))

def getPage(url):
    """Fetch *url* and return the raw response body as a string.

    Fixes over the original: no longer shadows the ``file`` builtin, and
    the connection is closed even when ``read()`` raises.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the underlying socket on every path.
        response.close()

def getLinks(rss_url):
    """Download the RSS feed at *rss_url* and return the text of every
    <link> element as a list of strings."""
    feed = parseXml(getPage(rss_url))
    return feed.xpath('//link/text()')

def getNewsText(news_url):
    """Yield the stripped text of each <p> inside div#newsp of the
    article page at *news_url*."""
    page = parseHtml(getPage(news_url))
    paragraphs = page.xpath("//div[@id='newsp']/p/text()")
    for text in paragraphs:
        yield text.strip()

def crawelCategory(rss_url):
    """Yield the space-joined paragraph text of every article linked from
    the RSS feed at *rss_url*.

    The first <link> in the feed is skipped — presumably it points at the
    channel itself rather than an article; verify against the feed layout.
    (Name keeps the original spelling for caller compatibility.)
    """
    news_links = getLinks(rss_url)
    for news_link in news_links[1:]:
        yield ' '.join(getNewsText(news_link))

def main():
with open('hinet_news.txt', 'wt') as file:
for url in rss_urls:
for text in crawelCategory(url):
print text
print >> file, text
print 'Done.'

# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

0 comments on commit af1bf24

Please sign in to comment.