Force ipv4
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3.4
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import feedparser
34 from datetime import datetime, timezone
35 from pathlib import Path
36
37
class Video(dict):
    """Dictionary describing one video, with keys also exposed as attributes.

    ``video.title`` reads ``video['title']`` and ``video.title = x`` stores
    ``video['title'] = x``.  A missing key read as an attribute raises
    ``AttributeError`` so that ``hasattr`` and ``getattr`` with a default
    behave as they do for ordinary objects.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        # Attribute assignment is stored as a dictionary item.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails; fall back to keys.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def is_downloaded(self):
        """Placeholder; always raises ``NotImplementedError``."""
        raise NotImplementedError("NotImplemented")
50
51
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like object that must contain ``'url'``.  The keys
    ``'duration'``, ``'title'``, ``'genre'`` and ``'timestamp'`` are filled in
    from the player page when possible.  Depending on what the player JSON
    offers, the stream is fetched with rtmpdump, mplayer or
    ``download_from_playlist``.

    Returns the updated ``video`` on success, or ``False`` when the page has no
    player link, the playlist download fails or no stream URL was found.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    player_links = soup.body('a', {'data-json-href': True}) if soup.body else []
    if not player_links:
        # No player on the page: report it the same way as a missing stream.
        print("Could not find any streams")
        return False
    video_player = player_links[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in video['url']:
        json_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        json_url = "http://www.svt.se/%s" % json_href
    else:
        json_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(json_url).json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        # '|' and '/' are replaced because the title is later used as a file name.
        video['title'] = og_title.replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        category_label = soup.find(text='Kategori:')
        if category_label:
            video['genre'] = category_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # The Path must be converted explicitly: str + Path is a TypeError.
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        dated = soup.find_all(datetime=True)
        xmldate_str = dated[0].attrs['datetime'] if dated else None
        parsed = feedparser._parse_date_w3dtf(xmldate_str) if xmldate_str else None
        if parsed:  # the parser result is only used when it is truthy
            utc_time = datetime(*parsed[:6]).replace(tzinfo=timezone.utc)  # naive in utc
            video['timestamp'] = utc_time.astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
103
104
def _download_subtitle_segments(sub):
    """Fetch the 'vtt' segments listed at ``sub['uri']`` into ``sub['download']``."""
    try:
        listing = requests.get(sub['uri']).text
        segments = [item for item in listing.split('\n') if 'vtt' in item]
    except Exception:
        print("Error reading, skipping subtitle")
        print(sys.exc_info()[1])
        segments = []  # ugly FIXME
    sub['download'] = []
    for segment in segments:
        if not segment.startswith('http'):
            segment = "{}/{}".format(os.path.dirname(sub['uri']), segment)
        try:
            sub['download'].append(requests.get(segment).text)
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            break


def download_from_playlist(video):
    """Download the highest-bandwidth stream of the playlist at ``video['url']``.

    The segments are written, decrypted when the segment playlist carries an
    ``EXT-X-KEY`` entry, to the path in ``video['filename']``.  Subtitles
    announced through a ``cc1=`` query parameter are stored in
    ``video['subs'][0]['download']`` and, when ``video['thumb-url']`` is
    present, an open thumbnail stream is stored in ``video['thumb']``.

    Returns ``True`` on success and ``False`` when the playlist, the key or a
    segment cannot be read or used.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:
        # e.g. 'cc1=name=Svenska~default=yes~forced=no~uri=http://.../index.m3u8~lang=sv'
        # Split on the first '=' only so values that contain '=' stay intact.
        fields = params.split('cc1=')[1].split('~')
        video['subs'] = [dict(field.split('=', 1) for field in fields if '=' in field)]
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        _download_subtitle_segments(video['subs'][0])
    playlist = parse_playlist(req)
    if not playlist:
        return False
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    decrypt = "EXT-X-KEY" in metadata
    if decrypt:
        try:
            # The key is raw binary data, so take the undecoded response body.
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
    with video['filename'].open("wb") as ofile:
        size = 0
        for segment, url in enumerate(segments):
            try:
                response = requests.get(url, stream=True)
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            try:
                ufile = response.raw
                print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
                sys.stdout.flush()
                if decrypt:
                    # NOTE(review): RFC 8216 section 5.2 derives the default IV
                    # from the big-endian media sequence number; this
                    # native-endian segment counter is kept as it was -
                    # confirm against a real encrypted stream.
                    iv = struct.pack("IIII", segment, 0, 0, 0)
                    try:
                        decryptor = AES.new(key, AES.MODE_CBC,
                                            iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                    except ValueError as e:
                        print("Error using decryption key. Skipping")
                        print(e)
                        return False
                while True:
                    try:
                        buf = ufile.read(4096)
                    except Exception:
                        print("Error reading, skipping file")
                        print(sys.exc_info()[1])
                        return False
                    if not buf:
                        break
                    if decrypt:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection whether the segment succeeded or not.
                response.close()

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
195
196
def parse_playlist(playlist):
    """Parse the text of a master playlist into a list of stream variants.

    Every ``EXT-X-STREAM-INF`` tag is paired with the next non-tag line, which
    is taken as the variant URL.  Each variant is a ``Video`` holding the tag's
    ``KEY=VALUE`` attributes plus ``'url'``.

    Returns ``False`` (after printing the text) when ``playlist`` does not
    start with ``#EXTM3U``, and an empty list when it has no variants.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    items = []
    pending = None  # variant whose URL line has not been seen yet
    for line in playlist.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('#'):
            tag, _, attributes = line.partition(':')
            if 'EXT-X-STREAM-INF' in tag:
                pending = Video()
                for item in attributes.split(','):
                    if '=' in item:
                        # Split once so values containing '=' stay intact.
                        key, value = item.split('=', 1)
                        pending[key] = value
            continue
        if pending is not None:
            pending['url'] = line
            items.append(pending)
            pending = None
    return items
215
216
def parse_segment_playlist(playlisturl):
    """Fetch a segment playlist and return ``(segments, metadata)``.

    ``segments`` lists the URL that follows each ``#EXTINF`` tag; relative
    entries are prefixed with the directory part of ``playlisturl``.
    ``metadata`` holds, under ``"EXT-X-KEY"``, a dict of the attributes of the
    last ``#EXT-X-KEY`` tag when one is present (quoted values keep their
    quotes).

    Raises ``ValueError`` when the fetched text does not start with
    ``#EXTM3U``.
    """
    playlist = requests.get(playlisturl).text
    if not playlist.startswith("#EXTM3U"):
        # Explicit check instead of assert, which is skipped under python -O.
        raise ValueError("Not an m3u8 playlist: {}".format(playlisturl))
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        row = row.strip()
        if not row:
            continue
        if row.startswith('#'):
            if row.startswith('#EXTINF'):
                next_is_url = True
            elif row.startswith('#EXT-X-KEY'):
                attributes = row.partition(':')[2]  # skip first part
                # Comma-separated pairs; commas inside quotes do not split.
                parts = PATTERN.findall(attributes)
                metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
            continue
        if next_is_url:
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
    return segments, metadata
239
240
def parse_videolist():
    """Yield a ``Video`` for every article on the paged svtplay video listing.

    Each yielded item carries ``title``, ``description``, ``url``,
    ``thumb-url``, a running index ``num`` and ``total``, which is the page
    count multiplied by a fixed eight videos per page.
    """
    # this call does not work for getting the pages, we use it for the page totals only
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            link_attrs = dict(article.find('a').attrs)
            image_attrs = dict(article.find('img', {}).attrs)
            entry = Video()
            entry['title'] = attributes['data-title']
            entry['description'] = attributes['data-description']
            entry['url'] = link_attrs['href']
            entry['thumb-url'] = image_attrs['src']
            entry['num'] = counter
            entry['total'] = last_page * per_page
            counter += 1
            yield entry
263
264
def remux(video, xml=None):
    """Remux ``video['filename']`` into a Matroska file with mkvmerge.

    The output path is stored in ``video['path']``; it lies inside a directory
    named after ``video['genre']`` when that key is present.  ``xml``, when
    given, is written next to the input file and passed as global tags.  A
    stream in ``video['thumb']`` is saved and attached as a thumbnail.  The
    input file and the temporary files are removed afterwards, and the output
    file's times are set from ``video['timestamp']`` when available.
    """
    if 'genre' in video:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(video['genre'], exist_ok=True)
        video['path'] = (Path(video['genre']) / video['filename']).with_suffix('.mkv')
    else:
        video['path'] = video['filename'].with_suffix('.mkv')
    command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]

    xml_path = video['filename'].with_suffix('.xml')
    thumb_path = Path('thumbnail.jpg')  # FIXME use title instead for many downloaders
    if xml:
        with xml_path.open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(xml_path)])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], xml_path, thumb_path):
        try:
            fname.unlink()
        except OSError:
            # Best-effort cleanup: the xml and thumbnail files are optional.
            pass
    if 'timestamp' in video:
        try:
            stamp = video['timestamp'].timestamp()
            os.utime(str(video['path']), times=(stamp, stamp))
        except FileNotFoundError as e:
            print(e)
303
304
def mkv_metadata(video):
    """Return a Matroska tags XML document, as a string, built from ``video``.

    Only the ``title``, ``description``, ``url`` and ``genre`` keys are
    exported, each as a ``Simple`` element whose ``Name`` is the upper-cased
    key.
    """
    exported = ('title', 'description', 'url', 'genre')
    document = BeautifulSoup(features='xml')

    def element(tag_name, text=None):
        # Create a tag in this document, optionally with a text value.
        node = document.new_tag(tag_name)
        if text is not None:
            node.string = text
        return node

    document.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags_node = element("Tags")
    tag_node = element("Tag")
    tags_node.append(tag_node)
    document.append(tags_node)

    targets_node = element("Targets")
    targets_node.append(element("TargetTypeValue", str(50)))
    tag_node.append(targets_node)

    for key in video:
        if key in exported:
            simple_node = element('Simple')
            simple_node.append(element('Name', key.upper()))
            simple_node.append(element('String', video[key]))
            tag_node.append(simple_node)
    return str(document)
330
331
if __name__ == "__main__":
    # Command line entry point: exactly one of --rss, --url or --mirror is required.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
                        action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        feed = feedparser.parse(args.rss)
        for entry in feed.entries:
            print(("Downloading: %s" % entry.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': entry.title, 'url': entry.link})
            if not video or args.no_remux:
                # A failed scrape returns False; there is nothing to remux.
                continue
            remux(video)
            # print(entry.description)
    elif args.mirror:
        # elif: the three modes are mutually exclusive, so only one may run.
        os.makedirs('.seen', exist_ok=True)
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                os.makedirs('.failed', exist_ok=True)
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        print(("Downloading: %s" % args.url))
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if video:
                if not args.no_remux:
                    remux(video)
                print(("Downloaded {}".format(args.url)))