]> git.frykholm.com Git - svtplaydump.git/blame_incremental - svtplaydump.py
Force ipv4
[svtplaydump.git] / svtplaydump.py
... / ...
CommitLineData
1#!/usr/bin/env python3.4
2# -*- coding: utf-8 -*-
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
20# 0.4 added mirror mode.
21# 0.3 added apple streaming playlist parsing and decryption
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
25from bs4 import BeautifulSoup, Doctype
26from subprocess import *
27import re
28from Crypto.Cipher import AES
29import struct
30import argparse
31import requests
32import sys, os
33import feedparser
34from datetime import datetime, timezone
35from pathlib import Path
36
37
class Video(dict):
    """Dict subclass whose keys are also readable/writable as attributes.

    Used as a lightweight record for video metadata (title, url, genre, ...).
    """

    def __init__(self, *args, **kwargs):
        # Delegate to dict.update so Video(mapping) / Video(**kwargs) both work.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment stores a key, keeping dict and attribute views in sync.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # BUG FIX: a missing key must surface as AttributeError, not KeyError,
        # so hasattr() and the copy/pickle protocols behave correctly.
        try:
            return self.__getitem__(name)
        except KeyError:
            raise AttributeError(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise ("NotImplemented")`, which is a
        # TypeError in Python 3 (you cannot raise a str).
        raise NotImplementedError
51
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a mapping that must contain a 'url' key (absolute, or a
    path relative to www.svtplay.se).  On success the mapping is enriched
    in place (title, genre, duration, filename, timestamp, ...) and
    returned; returns False when the HLS download fails.
    """
    # Relative paths come from the svtplay.se listing pages.
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    # The player anchor carries a data-json-href attribute pointing at the
    # JSON stream description ("flashvars").
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        # "/wd" hrefs are served from svt.se; everything else from svtplay.se.
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        # '|' and '/' are replaced so the title is usable as a filename.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        # "Kategori:" is the Swedish "Category:" label on the page.
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'  # Swedish for "No Genre"
    if 'dynamicStreams' in flashvars:
        # RTMP stream: hand off to the external rtmpdump tool.
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        # Plain FLV stream: dump it with mplayer.
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video and soup.find_all(datetime=True):
        # NOTE(review): relies on feedparser's private _parse_date_w3dtf —
        # may break on feedparser upgrades; confirm before bumping the dep.
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        # HLS (m3u8) stream: download it ourselves via download_from_playlist.
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
103
104
def download_from_playlist(video):
    """Download the HLS stream at video['url'] into video['filename'].

    Picks the highest-bandwidth variant from the master playlist, decrypts
    AES-128 segments when an EXT-X-KEY is advertised, and also fetches
    WebVTT subtitles announced via a 'cc1=' query parameter plus an
    optional thumbnail ('thumb-url').  Returns True on success, False on
    any download or decryption error.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        # Make a dict from the '~'-separated key=value paramstring.
        video['subs'] = [
            dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]
    try:
        req = requests.get(video['url']).text
    except Exception:  # was a bare except; still best-effort, but lets KeyboardInterrupt through
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                # Relative subtitle segment: resolve against the index URI.
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        # BUG FIX: was a bare `return` (None); normalize the failure value.
        return False
    # Highest bandwidth variant wins.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            # BUG FIX: the AES key is raw binary. `.text` decodes it to str,
            # which AES.new() rejects in Python 3 — use `.content` (bytes).
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with video['filename'].open("wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            # Progress indicator, rewritten in place with '\r'.
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): RFC 8216 defines the default IV as the media
                # sequence number as a 128-bit *big-endian* integer
                # (struct.pack(">IIII", 0, 0, 0, seq)). This native-order
                # packing with the counter first is kept because it is the
                # script's historical behaviour — confirm before changing.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC,
                                        iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            # .raw is consumed later by remux(); keep the response streaming.
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
195
196
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of variant descriptions.

    ``playlist`` is the playlist text.  Returns a list of Video mappings,
    one per EXT-X-STREAM-INF entry (attributes such as BANDWIDTH plus the
    variant 'url'), or False when the input is not a playlist or contains
    no variants.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip everything before the first variant line.
    # BUG FIX: the original `while ... playlist[0]` raised IndexError when
    # the playlist contained no EXT-X-STREAM-INF line at all.
    while lines and 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    if not lines:
        return False
    items = []
    # Variants come in (metadata line, url line) pairs.
    for (metadata_string, url) in zip(lines[0::2], lines[1::2]):
        md = Video()
        if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
            continue
        # Attribute list is comma-separated key=value pairs.
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
215
216
def parse_segment_playlist(playlisturl):
    """Fetch an HLS media playlist and return (segment_urls, metadata).

    ``segment_urls`` is the list of absolute segment URLs in playlist
    order; ``metadata`` holds the parsed EXT-X-KEY attributes (if any)
    under the "EXT-X-KEY" key.
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    # Splits on commas that sit outside single/double quotes, so quoted
    # attribute values survive intact.
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    expect_url = False
    for line in playlist.splitlines():
        if expect_url:
            # The line after an EXTINF tag is the segment location.
            expect_url = False
            if not line.startswith('http'):  # relative path
                line = "{}/{}".format(os.path.dirname(playlisturl), line)
            segments.append(line)
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attr_text = line.split(':', 1)[1]  # drop the tag name
            # Magic re split keeps the quotes and throws away the commas.
            pairs = PATTERN.split(attr_text)[1:-1]
            metadata["EXT-X-KEY"] = dict(
                pair.split('=', 1) for pair in pairs if '=' in pair)
    return segments, metadata
239
240
def parse_videolist():
    """Yield a Video record for every programme listed on svtplay.se.

    Walks the ajax pager one page at a time; each yielded Video carries
    title, description, url, thumb-url plus a running 'num' counter and an
    estimated 'total'.
    """
    # This endpoint does not return the videos themselves; it is only used
    # to read the total number of pages.
    pager = BeautifulSoup(requests.get(
        "http://www.svtplay.se/ajax/videospager").text)
    page_tot = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    for page_num in range(1, page_tot + 1):
        page_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        page = BeautifulSoup(requests.get(page_url).text)
        for article in page.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            # 'total' is an estimate: pages * items-per-page.
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
263
264
def remux(video, xml=None):
    """Remux the downloaded .ts file into a .mkv with mkvmerge.

    Optionally embeds ``xml`` as Matroska global tags and the previously
    fetched thumbnail as an attachment, then deletes the intermediate
    files and stamps the output with the broadcast timestamp.
    """
    if 'genre' in video:
        # File the result under a per-genre directory.
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = (Path(video['genre']) / video['filename']).with_suffix('.mkv')
    else:
        video['path'] = video['filename'].with_suffix('.mkv')
    command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]

    if xml:
        # Write the tag XML next to the source file and attach it as global tags.
        with video['filename'].with_suffix('.xml').open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(video['filename'].with_suffix('.xml'))])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # TODO: subtitle muxing (kept from original, still unfinished):
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Remove intermediates; some may legitimately not exist.
    for fname in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            fname.unlink()
        except OSError:  # BUG FIX: was a bare except hiding real errors
            pass
    if 'timestamp' in video:
        ts = video['timestamp'].timestamp()  # hoisted: was computed twice
        try:
            os.utime(str(video['path']), times=(ts, ts))
        except FileNotFoundError as e:
            # mkvmerge may have failed; report but do not abort the run.
            print(e)
303
304
def mkv_metadata(video):
    """Render a Matroska global-tags XML document from a video record.

    Only the title/description/url/genre fields are exported; everything
    else in ``video`` is ignored.  Returns the XML as a string.
    """
    keep = ('title', 'description', 'url', 'genre')
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    # TargetTypeValue 50 = "movie/episode" level in the Matroska tag spec.
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    # One <Simple><Name>KEY</Name><String>value</String></Simple> per field.
    for key, value in video.items():
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name_tag = root.new_tag('Name')
        name_tag.string = key.upper()
        simple.append(name_tag)
        value_tag = root.new_tag('String')
        value_tag.string = value
        simple.append(value_tag)
        tag.append(simple)
    return str(root)
330
331
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
                        action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        # Download every entry of an RSS feed.
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # BUG FIX: scrape_player_page returns False on failure; the old
            # code passed that straight into remux() and crashed.
            if not video or args.no_remux:
                continue
            remux(video)
        # print(e.description)
    elif args.mirror:
        # BUG FIX: this was `if args.mirror:` followed by a bare `else`, so a
        # --rss run fell through into the URL branch with args.url == None
        # and crashed. The modes are mutually exclusive — chain them.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            # Mark as seen up front so a crash does not retry it forever.
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record the failure so reruns can inspect what went wrong.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        # Single-URL mode (-u).
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # BUG FIX: guard against a failed scrape before remuxing.
            if video and not args.no_remux:
                remux(video)
        print("Downloaded {}".format(args.url))