-
Notifications
You must be signed in to change notification settings - Fork 1
/
nrk-dumper.py
executable file
·131 lines (113 loc) · 4.67 KB
/
nrk-dumper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
import cookielib
import logging
import os
import optparse
import re
import stat
import subprocess
from lxml import etree
import mechanize
class NRKRipper(object):
    """Scrape NRK Nett-TV program pages and dump their mms:// stream URLs.

    Uses mechanize for browsing (with a pre-seeded bandwidth cookie so NRK
    does not redirect to its speed-detection page) and lxml to parse both
    the HTML pages and the ASX playlist files they point at.
    """

    # XPath: the <object> embed on a clip page carries the ASX playlist URL.
    movie_object_url = 'string(//object[@id="ctl00_ucPlayer_Player"]/@url[1])'
    # XPath: first <ref href="mms://..."> entry inside the ASX playlist.
    asx_mms_url = 'string(//entry/ref[1]/@href)'
    # XPath: anchors for individual clip pages in the navigation tree.
    program_page_links = '//div[@id="dyn-navigation"]//a[@class="icon-video-black" or @class="icon-video-black indexPadding"]'
    # XPath: hrefs of collapsed sub-sections that must be crawled recursively.
    program_subpage_links = '//div[@id="dyn-navigation"]//a[@class="icon-closed-black"]/@href'

    def __init__(self):
        self.browser = mechanize.Browser()
        self.browser.set_cookiejar(self.make_cookiejar())
        # Sub-page hrefs already crawled, so cycles in the navigation
        # tree do not cause infinite recursion.  (Was a dict used as a
        # set; a real set expresses the intent.)
        self.visited_program_pages = set()

    def make_cookiejar(self):
        """
        Return a cookie jar pre-loaded with a cookie containing the
        requested speed, avoiding a redirect to the speed detection page.
        """
        cookie_jar = cookielib.LWPCookieJar()
        cookie = cookielib.Cookie(
            version=0, name='NetTV2.0Speed',
            value='100000', port=None, port_specified=False, domain='www.nrk.no',
            domain_specified=False, domain_initial_dot=False, path='/',
            path_specified=True, secure=False, expires=None, discard=True,
            comment=None, comment_url=None, rest={'HttpOnly': None},
            rfc2109=False)
        cookie_jar.set_cookie(cookie)
        return cookie_jar

    def rip_program(self, url):
        """
        Given an URL to an NRK Nett-TV page
        (http:https://www.nrk.no/nett-tv/klipp/[0-9]+/), return the mms url for
        that resource.
        """
        self.browser.open(url)
        response = etree.HTML(self.browser.response().read())
        # The page contains an object with an url parameter pointing at
        # an ASX file.
        source = response.xpath(self.movie_object_url)
        self.browser.open(source)
        asx = etree.HTML(self.browser.response().read())
        # The ASX file contains a list of movie source URLs.  Get the
        # one using mms:https://.
        return asx.xpath(self.asx_mms_url)

    def list_project(self, url):
        """Return [(name, href), ...] for every clip reachable from *url*,
        recursing into unvisited sub-pages of the navigation tree."""
        logging.info("Looking for clips at %s", url)
        self.browser.open(url)
        response = etree.HTML(self.browser.response().read())
        links = response.xpath(self.program_page_links)
        sublinks = response.xpath(self.program_subpage_links)
        sources = []
        for sublink in sublinks:
            # Skip pages we have already crawled (the tree can link back
            # to itself).
            if sublink not in self.visited_program_pages:
                self.visited_program_pages.add(sublink)
                sources.extend(self.list_project('http:https://www.nrk.no' + sublink))
        for link in links:
            name = link.xpath('string(./text())')
            href = link.xpath('string(./@href)')
            sources.append((name, href))
        return sources

    def search_and_rip(self, url, output_dir):
        """Rip every clip found under *url* into *output_dir* as .wmv
        files, skipping files that already exist."""
        # BUG FIX: the existence check and makedirs must agree on the
        # path encoding; previously the check used the UTF-8 bytes but
        # the directory was created from the unicode string.
        if not os.path.exists(output_dir.encode('utf-8')):
            os.makedirs(output_dir.encode('utf-8'))
        for name, href in self.list_project(url):
            name = self.fix_stupid_dates(name)
            output_name = os.path.join(output_dir, name + '.wmv')
            if not os.path.exists(output_name.encode('utf-8')):
                source = self.rip_program('http:https://www.nrk.no' + href)
                logging.info(u"Ripping %s => %s", source, output_name)
                proc = subprocess.Popen(['mplayer', '-dumpstream', '-dumpfile',
                    output_name.encode('utf-8'), source.encode('utf-8')])
                proc.wait()
                # Dumped file should be world-readable but only
                # owner-writable.
                os.chmod(output_name.encode('utf-8'),
                         stat.S_IRUSR
                         | stat.S_IWUSR
                         | stat.S_IRGRP
                         | stat.S_IROTH)

    def fix_stupid_dates(self, name):
        """Rewrite a trailing DD.MM.YYYY date in *name* to the sortable
        YYYY-MM-DD form; return *name* unchanged if no date is found."""
        # Raw string: the original pattern relied on '\.' surviving in a
        # plain string literal, which is a deprecated escape.
        pattern = r'(.*)([0-9]{2})\.([0-9]{2})\.([0-9]+)$'
        match = re.match(pattern, name)
        if match is None:
            return name
        prefix, day, month, year = match.groups()
        return '%s%s-%s-%s' % (prefix, year, month, day)
def rip_all(source):
    """Rip every job listed in *source*.

    *source* is an iterable of byte lines, each of the form
    "<url> <output_dir>".  Blank lines and lines starting with '#'
    are ignored.
    """
    ripper = NRKRipper()
    for raw_line in source:
        entry = raw_line.decode('utf-8').strip()
        # Guard clause: skip blanks and comment lines.
        if not entry or entry.startswith('#'):
            continue
        logging.info("Line: %s", entry)
        url, output_dir = entry.split(' ', 1)
        logging.info(u"Searching %s", url)
        ripper.search_and_rip(url, output_dir)
if __name__ == '__main__':
    import sys

    # Timestamped logging to make progress visible during long rips.
    logging.basicConfig(
        level = logging.INFO,
        format = "%(asctime)s %(name)s %(levelname)s %(message)s")
    rip_all(sys.stdin)