]> git.frykholm.com Git - svtplaydump.git/blame - svtplaydump.py
Force ipv4
[svtplaydump.git] / svtplaydump.py
CommitLineData
fa7d6ee2 1#!/usr/bin/env python3.4
56181f0a 2# -*- coding: utf-8 -*-
ca2553c7
MF
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
d05b6699 20# 0.4 added mirror mode.
56181f0a 21# 0.3 added apple streaming playlist parsing and decryption
ca2553c7
MF
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
d26e6919 25from bs4 import BeautifulSoup, Doctype
ca2553c7 26from subprocess import *
89a00fa0 27import re
56181f0a
MF
28from Crypto.Cipher import AES
29import struct
72beea17 30import argparse
84f7ef7d 31import requests
d05b6699 32import sys, os
1e13b6eb
MF
33import feedparser
34from datetime import datetime, timezone
fa7d6ee2
MF
35from pathlib import Path
36
3d7ac34a 37
d26e6919
MF
class Video(dict):
    """A dict subclass whose keys are also readable/writable as attributes.

    Used throughout the script as a lightweight record for per-video
    metadata (title, url, filename, ...).
    """

    def __init__(self, *args, **kwargs):
        # Use the free dict.update to set the initial keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment writes straight into the mapping.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads come from the mapping (raises KeyError when missing,
        # matching the subscript behaviour callers rely on).
        return self.__getitem__(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise ("NotImplemented")`, which raises
        # TypeError ("exceptions must derive from BaseException") instead of
        # signalling "not implemented".
        raise NotImplementedError
50
d26e6919
MF
51
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    Fills in title/genre/duration metadata from the player page, resolves the
    flashvars JSON, downloads the stream (rtmp, flv or HLS), and returns the
    enriched video dict — or False when no stream could be fetched.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    # The JSON endpoint differs between oppetarkiv and svtplay/svt.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        # BUG FIX: the original called download_from_playlist() unconditionally
        # and only afterwards checked `'url' not in video` — which can never be
        # true since 'url' is set at the top of this function. When no m3u8
        # reference exists, video['filename'] is unset and video['url'] still
        # points at the HTML page, so check for a found stream first.
        if 'filename' not in video:
            print("Could not find any streams")
            return False
        if not download_from_playlist(video):
            return False
    return video
3d7ac34a 104
def download_from_playlist(video):
    """Download the HLS stream at video['url'] into video['filename'].

    Picks the highest-bandwidth variant from the master playlist, optionally
    AES-128-CBC-decrypts the segments, collects WebVTT subtitle chunks into
    video['subs'], and fetches the thumbnail when 'thumb-url' is present.
    Returns True on success; False (or None when no playlist was found) on
    failure.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [
            dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            # BUG FIX: the key must be the raw bytes; `.text` decodes it as
            # (mojibake) unicode and AES.new() requires a bytes key.
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with video['filename'].open("wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): RFC 8216 derives the implicit IV as the
                # big-endian media sequence number; this packs native-endian
                # with the counter in the low word — confirm against streams.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC,
                                        iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
d26e6919 195
3d7ac34a 196
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of Video records.

    Each item carries the EXT-X-STREAM-INF attributes (e.g. BANDWIDTH) plus
    the variant's 'url'. Returns False when the text is not an M3U playlist.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    # BUG FIX: the original looped on `playlist[0]` until it found an
    # EXT-X-STREAM-INF tag and raised IndexError on playlists without one;
    # guard against running off the end (an empty result is returned instead).
    while playlist and 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    # Variants come as (attribute-line, url-line) pairs.
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
215
56181f0a 216
2d8521d8
MF
def parse_segment_playlist(playlisturl):
    """Fetch a variant playlist and return (segment_urls, metadata).

    metadata carries the parsed EXT-X-KEY attributes when the stream is
    encrypted; relative segment paths are resolved against playlisturl.
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    expect_url = False
    for line in playlist.splitlines():
        if expect_url:
            # The line after EXTINF is the segment URI; resolve relative paths.
            if not line.startswith('http'):
                line = "{}/{}".format(os.path.dirname(playlisturl), line)
            segments.append(line)
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attr_text = line.split(':', 1)[1]  # drop the tag name
            # Comma-split that respects quoted values, keeping the quotes.
            parts = PATTERN.split(attr_text)[1:-1]
            metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
    return segments, metadata
239
84f7ef7d 240
def parse_videolist():
    """Yield a Video record for every article on svtplay's paged video list."""
    # This pager call does not serve the pages themselves; it is only used to
    # read the total page count.
    soup = BeautifulSoup(requests.get(
        "http://www.svtplay.se/ajax/videospager").text)
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    for page_num in range(1, page_tot + 1):
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        page = BeautifulSoup(requests.get(base_url).text)
        for article in page.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
263
3d7ac34a 264
def remux(video, xml=None):
    """Mux the downloaded stream into an .mkv via mkvmerge, attaching optional
    global tags (xml) and thumbnail, then clean up intermediates and restore
    the broadcast timestamp on the result."""
    if 'genre' in video:
        # Sort the result into a per-genre directory.
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        target = Path(video['genre'] / video['filename']).with_suffix('.mkv')
    else:
        target = video['filename'].with_suffix('.mkv')
    video['path'] = target
    command = ["mkvmerge", "-o", str(target), '--title', video['title']]

    if xml:
        sidecar = video['filename'].with_suffix('.xml')
        with sidecar.open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(sidecar)])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # TODO: muxing the downloaded .vtt subtitles is not implemented yet.

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of the intermediates; missing files are fine.
    for leftover in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            leftover.unlink()
        except:
            pass
    if 'timestamp' in video:
        ts = video['timestamp'].timestamp()
        try:
            os.utime(str(video['path']), times=(ts, ts))
        except FileNotFoundError as e:
            print(e)
1e111d91 303
3d7ac34a 304
d26e6919
MF
def mkv_metadata(video):
    """Build a Matroska XML tags document from selected fields of *video*."""
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    # Only these keys are exported as Simple tags.
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for field in video:
        if field not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = field.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[field]
        simple.append(sstring)
        tag.append(simple)
    return str(root)
56181f0a 330
3d7ac34a 331
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
                        action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # BUG FIX: scrape_player_page returns False on failure; the
            # original passed that straight into remux().
            if not video or args.no_remux:
                continue
            remux(video)
        # print(e.description)
    elif args.mirror:
        # BUG FIX: this was `if args.mirror: ... else: ...`, so an --rss run
        # also fell through into the --url branch below with args.url == None
        # and crashed; elif keeps the three modes mutually exclusive.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record the failure so the mirror run does not retry forever.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # Only remux when the scrape actually produced a video record.
            if video and not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))