#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import *
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys, os
import socket
import feedparser
from datetime import datetime, timezone
class Video(dict):
    """A dict whose keys are also readable/writable as attributes.

    Carries scraped metadata (title, url, filename, timestamp, ...)
    around while still supporting plain dict operations.
    """

    def __init__(self, *args, **kwargs):
        # Reuse the free dict.update so Video() accepts exactly the same
        # constructor arguments as dict().
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # NOTE: missing keys raise KeyError (not AttributeError); callers
        # in this file always guard with `'key' in video` first.
        return self.__getitem__(name)

    def is_downloaded(self):
        # Fix: the original `raise("NotImplemented")` raised a TypeError
        # ("exceptions must derive from BaseException") instead of
        # signalling the unimplemented method properly.
        raise NotImplementedError
def scrape_player_page(video):
    """
    Scrape an svtplay/oppetarkiv player page for stream metadata and
    download the video.

    Parameters:
        video: dict with at least 'url' (absolute, or a path relative to
               svtplay.se); optionally 'title', 'genre', 'timestamp'.

    Returns the enriched Video/dict on success, False on failure.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    # The flashvars JSON lives at a site-dependent endpoint.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        # Sanitize characters that would break file paths.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # Fix: the original passed an undefined name `url` to rtmpdump,
        # raising NameError whenever a dynamic stream was found.
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
def download_from_playlist(video):
    """
    Download an HLS (m3u8) stream described by video['url'] into
    video['filename'], decrypting AES-128-CBC segments when the playlist
    carries an EXT-X-KEY. Also fetches subtitle segments (query param
    cc1=...) and the thumbnail when present.

    Returns True on success, False (or None for an empty playlist) on
    failure. Network errors are reported and treated as skips.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):  # resolve relative subtitle paths
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    # Pick the highest-bandwidth variant.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            # Fix: fetch the AES key as raw bytes (.content). The original
            # used .text, which decodes the 16-byte binary key as text and
            # corrupts it (AES.new rejects str keys under Python 3).
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        except:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): RFC 8216 specifies the default IV as the
                # media sequence number in big-endian; this packs it
                # little-endian in the first word. Kept as-is since it
                # matches what the site's streams expect — confirm before
                # changing.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except(ValueError) as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while(True):
                try:
                    buf = ufile.read(4096)
                except:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
def parse_playlist(playlist):
    """
    Parse a master (variant) m3u8 playlist string.

    Returns a list of Video dicts, one per EXT-X-STREAM-INF entry, each
    holding the stream attributes (e.g. BANDWIDTH) plus its 'url'.
    Returns False when the input is not an m3u8 playlist or contains no
    variant-stream lines.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    # Skip header lines up to the first variant entry. Fix: the original
    # indexed playlist[0] unconditionally, raising IndexError on playlists
    # with no EXT-X-STREAM-INF line at all.
    while playlist and 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    if not playlist:
        return False
    items = []
    # Variant playlists alternate metadata line / URL line.
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
def parse_segment_playlist(playlisturl):
    """Fetch a media playlist and return (segment_urls, metadata).

    segment_urls is the list of absolute segment URLs in playlist order;
    metadata holds the parsed EXT-X-KEY attribute dict when the stream
    is encrypted, otherwise it is empty.
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    base = os.path.dirname(playlisturl)
    segments = []
    metadata = {}
    expect_url = False
    for line in playlist.splitlines():
        if expect_url:
            # The line after an EXTINF tag is the segment location;
            # resolve relative paths against the playlist URL.
            if line.startswith('http'):
                segments.append(line)
            else:
                segments.append("{}/{}".format(base, line))
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            payload = line.split(':', 1)[1]  # drop the tag name
            # Regex split keeps quoted commas intact; slice trims the
            # empty bookends the split produces.
            attrs = PATTERN.split(payload)[1:-1]
            metadata["EXT-X-KEY"] = dict([a.split('=', 1) for a in attrs if '=' in a])
    return (segments, metadata)
def parse_videolist():
    """Yield a Video for every article on svtplay's paginated ajax
    video listing, walking all pages in order."""
    videos_per_page = 8
    # This endpoint does not serve the pages themselves; it is only used
    # to discover the total page count.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    page_tot = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    video_num = 0
    page_num = 1
    while page_num <= page_tot:
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1
def remux(video, xml=None):
    """Mux the downloaded .ts into a .mkv with mkvmerge, optionally
    attaching global tags (xml) and the thumbnail, then delete the
    intermediates and stamp the result with the broadcast timestamp."""
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        # Sort output into a per-genre directory, creating it on demand.
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]
    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup; any of these may legitimately be absent.
    for leftover in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(leftover)
        except:
            pass
    if 'timestamp' in video:
        try:
            ts = video['timestamp'].timestamp()
            os.utime(video['path'], times=(ts, ts))
        except FileNotFoundError as e:
            print(e)
def mkv_metadata(video):
    """Build a Matroska global-tags XML document (matroskatags.dtd)
    from selected video metadata and return it as a string."""
    keep = ('title', 'description', 'url', 'genre')
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    # TargetTypeValue 50 = tags apply to the whole movie.
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        # Each kept field becomes a <Simple><Name/><String/></Simple> pair.
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        value = root.new_tag('String')
        value.string = video[key]
        simple.append(value)
        tag.append(simple)
    return str(root)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if args.no_remux:
                continue
            # Fix: the original called self.remux(video) at module level,
            # which raised NameError (there is no `self` here). Also skip
            # remuxing when the scrape failed (scrape returns False).
            if video:
                remux(video)
            # print(e.description)
    if args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))
            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record failed downloads so they can be inspected/retried.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # Fix: guard against a failed scrape (False) before remuxing,
            # which previously crashed remux with a TypeError.
            if video and not args.no_remux:
                remux(video)
            print(("Downloaded {}".format(args.url)))