]> git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py
Some more robustness fixes
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
class Video(dict):
    """A dict of video metadata that also allows attribute-style access.

    ``v.title`` is equivalent to ``v['title']`` for both reads and writes.
    """

    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # NOTE: raises KeyError (not AttributeError) for missing keys,
        # matching plain dict indexing.
        return self.__getitem__(name)

    def is_downloaded(self):
        # Bug fix: the original `raise("NotImplemented")` raised a
        # TypeError in Python 3 (strings are not exceptions); raise the
        # proper built-in instead.
        raise NotImplementedError
48
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like with at least 'url' (absolute, or a path on
    svtplay.se).  Fills in title, genre, duration, stream url etc. and
    triggers the download.  Returns the enriched video dict on success,
    or False when no stream was found or the download failed.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # Bug fix: the original passed an undefined name `url` to rtmpdump
        # (NameError at runtime); the stream URL lives in video['url'].
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
98
def download_from_playlist(video):
    """Download video['url'] (an HLS master playlist) into video['filename'].

    Picks the highest-BANDWIDTH variant, streams every media segment to
    disk (decrypting AES-128-CBC streams when EXT-X-KEY metadata is
    present), optionally collects WebVTT subtitle fragments announced in
    the URL query string, and attaches a raw thumbnail stream.

    Returns True on success, False on any read/decrypt error, and None
    when the playlist could not be parsed at all.
    """
    # Query string may carry subtitle info, e.g.
    # cc1=name=Svenska~default=yes~...~uri=http://...m3u8~lang=sv
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params: #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])] #make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        # The subtitle uri points at its own playlist; keep only the
        # .vtt fragment lines from it.
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = [] #ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                # Relative fragment path: resolve against the subtitle playlist.
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    # Choose the highest-bandwidth variant stream.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'): #if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        except:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        # NOTE(review): `key` is a str here; PyCrypto's AES.new expects
        # bytes on Python 3 — confirm this path still works.
        decrypt=True
    else:
        decrypt=False
    with open("%s"%video['filename'],"wb") as ofile:
        segment=0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            # Progress indicator: rewrite the same console line.
            print("\r{0:.2f} MB".format(size/1024/1024),end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): IV is the native-endian packing of the
                # segment index in the first word; RFC 8216 specifies a
                # big-endian 128-bit sequence number — works for these
                # streams, but verify before reusing elsewhere.
                iv=struct.pack("IIII",segment,0,0,0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv) #ValueError: AES key must be either 16, 24, or 32 bytes long
                except(ValueError) as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while(True):
                try:
                    buf = ufile.read(4096)
                except:
                    print("Error reading, skipping file") #FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            # Keep the raw stream; remux() reads and embeds it later.
            video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
        except:
            print("Error reading thumbnail") #FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
187
def parse_playlist(playlist):
    """Parse an HLS master (variant) playlist.

    ``playlist`` is the playlist text.  Returns a list of Video dicts,
    one per EXT-X-STREAM-INF entry, holding the stream attributes (e.g.
    'BANDWIDTH') plus 'url'.  Returns False when the text is not an M3U
    playlist or contains no variant entries.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip header lines up to the first variant entry.  Bug fix: the
    # original while-loop raised IndexError when the playlist contained
    # no EXT-X-STREAM-INF line at all; bail out cleanly instead.
    while lines and 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    if not lines:
        return False
    items = []
    # Variant playlists alternate metadata line / URL line.
    for (metadata_string, url) in zip(lines[0::2], lines[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                # Bug fix: split only on the first '=' so attribute
                # values containing '=' don't blow up dict.update.
                md.update([item.split('=', 1)])
        md['url'] = url
        items.append(md)
    return items
206
def parse_segment_playlist(playlisturl):
    """Fetch an HLS media playlist and return (segment_urls, metadata).

    ``segment_urls`` is the list of absolute media-segment URLs;
    ``metadata`` maps "EXT-X-KEY" to a dict of its attributes when the
    stream is encrypted.
    """
    text = requests.get(playlisturl).text
    assert text.startswith("#EXTM3U")
    # Splits on commas but keeps quoted substrings intact.
    quoted_split = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    base = os.path.dirname(playlisturl)
    segments = []
    metadata = {}
    expect_url = False
    for line in text.splitlines():
        if expect_url:
            # The line after EXTINF is the segment location; resolve
            # relative paths against the playlist URL.
            if not line.startswith('http'):
                line = "{}/{}".format(base, line)
            segments.append(line)
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attrs = line.split(':', 1)[1]  # drop the tag name
            pieces = quoted_split.split(attrs)[1:-1]  # magic re split, quotes preserved
            # Discard the comma separators and build attribute pairs.
            metadata["EXT-X-KEY"] = dict([piece.split('=', 1) for piece in pieces if '=' in piece])
    return (segments, metadata)
228
def parse_videolist():
    """Yield a Video for every programme in svtplay's ajax listing.

    Each Video carries title, description, url, thumb-url, a running
    index 'num' and an estimated 'total' (pages * videos per page).
    """
    # This endpoint cannot be paged; it is queried once just to learn
    # the total number of pages.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    page = 1
    while page <= last_page:
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attrs = dict(article.attrs)
            video = Video()
            video['title'] = attrs['data-title']
            video['description'] = attrs['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = counter
            video['total'] = last_page * per_page
            counter += 1
            yield video
        page += 1
250
def remux(video, xml=None):
    """Remux the downloaded .ts file into a Matroska container.

    Writes <title>.mkv (under a genre subdirectory when video['genre']
    is set), embedding optional global tags (``xml``) and a thumbnail,
    then removes the intermediate files and stamps the mkv with the
    video's timestamp.
    """
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #                 command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])

    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of intermediates; some may not exist.
    # Bug fix: a bare `except:` also swallowed KeyboardInterrupt /
    # SystemExit — only ignore filesystem errors.
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)
291
292
def mkv_metadata(video):
    """Build a Matroska global-tags XML document from video metadata.

    Emits one <Simple> Name/String pair per key in ('title',
    'description', 'url', 'genre') that is present in ``video``.
    """
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = doc.new_tag("Tags")
    tag = doc.new_tag("Tag")
    tags.append(tag)
    doc.append(tags)
    wanted = ('title', 'description', 'url', 'genre')
    targets = doc.new_tag("Targets")
    ttv = doc.new_tag("TargetTypeValue")
    ttv.string = str(50)  # Matroska target level (50 ≈ movie/episode)
    targets.append(ttv)
    tag.append(targets)
    for field in video:
        if field not in wanted:
            continue
        simple = doc.new_tag('Simple')
        name = doc.new_tag('Name')
        name.string = field.upper()
        simple.append(name)
        value = doc.new_tag('String')
        value.string = video[field]
        simple.append(value)
        tag.append(simple)
    return str(doc)
318
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # Robustness: scrape_player_page returns False on failure;
            # don't hand that to remux().
            if not video or args.no_remux:
                continue
            # Bug fix: this is module-level script code, not a method —
            # the original `self.remux(video)` raised NameError.
            remux(video)
            # print(e.description)
    elif args.mirror:
        # Bug fix: the mirror/url branches were a separate `if ... else`,
        # so after an --rss run the else (URL) branch also executed with
        # args.url == None and crashed; chain the modes with elif.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Remember failed titles so they are easy to inspect later.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # Robustness: only remux when the scrape actually succeeded.
            if video and not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))