Skip to content

Commit

Permalink
Add hinet news crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
fangpenlin committed Dec 15, 2010
1 parent 0fb0053 commit af1bf24
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions loso/crawlers/hinet_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import urllib2
import cStringIO as StringIO

from lxml import etree

# RSS feeds for each Hinet news category we crawl.
# NOTE: the scraped source had every scheme garbled as 'http:https://'
# (a link-rewriting artifact); restored to plain 'http://'.
rss_urls = [
    'http://times.hinet.net/times/rss.do?option=entertainment',
    'http://times.hinet.net/times/rss.do?option=sport',
    'http://times.hinet.net/times/rss.do?option=society',
    'http://times.hinet.net/times/rss.do?option=infotech',
    'http://times.hinet.net/times/rss.do?option=politics',
    'http://times.hinet.net/times/rss.do?option=mainland',
    'http://times.hinet.net/times/rss.do?option=finance',
    'http://times.hinet.net/times/rss.do?option=internationality',
    'http://times.hinet.net/times/rss.do?option=weather',
]

def parseHtml(html):
    """Parse an HTML byte string (assumed utf8 — TODO confirm) into an
    lxml element tree and return it."""
    return etree.parse(StringIO.StringIO(html),
                       etree.HTMLParser(encoding='utf8'))

def parseXml(xml):
    """Parse an XML byte string (assumed utf8 — TODO confirm) into an
    lxml element tree and return it."""
    return etree.parse(StringIO.StringIO(xml),
                       etree.XMLParser(encoding='utf8'))

def getPage(url):
    """Fetch *url* and return the raw response body as a string.

    Fixes over the original: no longer shadows the ``file`` builtin, and
    the connection is closed even when ``read()`` raises.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the underlying socket on every path.
        response.close()

def getLinks(rss_url):
    """Download the RSS feed at *rss_url* and return the text of every
    <link> element as a list of strings."""
    feed = parseXml(getPage(rss_url))
    return feed.xpath('//link/text()')

def getNewsText(news_url):
    """Yield the stripped text of each <p> inside div#newsp of the
    article page at *news_url*."""
    page = parseHtml(getPage(news_url))
    paragraphs = page.xpath("//div[@id='newsp']/p/text()")
    for text in paragraphs:
        yield text.strip()

def crawelCategory(rss_url):
    """Yield the space-joined paragraph text of every article linked from
    the RSS feed at *rss_url*.

    The first <link> in the feed is skipped — presumably it points at the
    channel itself rather than an article; verify against the feed layout.
    (Name keeps the original spelling for caller compatibility.)
    """
    news_links = getLinks(rss_url)
    for news_link in news_links[1:]:
        yield ' '.join(getNewsText(news_link))

def main():
with open('hinet_news.txt', 'wt') as file:
for url in rss_urls:
for text in crawelCategory(url):
print text
print >> file, text
print 'Done.'

# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

0 comments on commit af1bf24

Please sign in to comment.