]> git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py
8b4a3c25231fa4a299e6493e0596cea37dba4fcd
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3.4
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
36 from pathlib import Path
37
class Video(dict):
    """Dict subclass whose keys are also readable/writable as attributes.

    Used throughout the script as a lightweight record for video metadata
    (title, url, filename, timestamp, ...).
    """

    def __init__(self, *args, **kwargs):
        # Accept the same constructor forms as dict() and reuse the free
        # dict.update to set the keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment writes through to the underlying dict.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads come from the underlying dict.  NOTE: a missing
        # key raises KeyError (not AttributeError), as in the original.
        return self.__getitem__(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise("NotImplemented")`, which raises
        # a plain string and is itself a TypeError on Python 3.  Raise the
        # proper built-in exception instead.
        raise NotImplementedError
50
def scrape_player_page(video):
    """
    Scrape an svtplay/oppetarkiv player page and download the stream.

    `video` is a dict with at least 'url' (absolute, or relative to
    www.svtplay.se).  Missing 'title', 'genre' and 'timestamp' are filled
    in from the page.  Returns the enriched video dict on success, or
    False when no stream could be downloaded.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    # The player's JSON metadata lives on a different host per service.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        # '|' and '/' would break filenames / paths, so replace them.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # BUG FIX: the original referenced an undefined name `url` and
        # concatenated a str with a Path ("-o"+filename).  Use the freshly
        # assigned video['url'] and stringify the Path for subprocess args.
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        # str() the Path: Popen only accepts path-like args from 3.6 on.
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            if xmldate_str:
                video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in UTC
                video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        # Pick the HLS (m3u8) reference, if any, and download it.
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
        video['filename'] = Path(video['title']).with_suffix('.ts')
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
101
def download_from_playlist(video):
    """Download the highest-bandwidth HLS variant of video['url'] to video['filename'].

    Also fetches WebVTT subtitle segments advertised via the 'cc1=' query
    parameter (stored under video['subs']) and, when an EXT-X-KEY is
    present, AES-CBC-decrypts each media segment with the segment index as
    IV.  Returns True on success, False on a read/decrypt error, and None
    (falsy) when the variant playlist could not be parsed.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    # Subtitles are advertised in the query string, e.g.
    # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
    if 'cc1=' in params:
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])] #make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            # Keep only the .vtt segment lines of the subtitle playlist.
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = [] #ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                # Relative segment path: resolve against the playlist URI.
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        # NOTE: bare return -> None, which callers treat as falsy failure.
        return
    # Highest bandwidth variant wins.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'): #if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            # NOTE(review): .text yields a str; PyCrypto's AES.new expects a
            # bytes key on Python 3 -- verify this still decrypts correctly.
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        except:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt=True
    else:
        decrypt=False
    with video['filename'].open("wb") as ofile:
        segment=0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            # Progress indicator: rewrite the same line with total MB so far.
            print("\r{0:.2f} MB".format(size/1024/1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec, the default IV is the media sequence
                # number; pack the segment index into a 16-byte IV.
                iv=struct.pack("IIII",segment,0,0,0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv) #ValueError: AES key must be either 16, 24, or 32 bytes long
                except(ValueError) as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while(True):
                try:
                    buf = ufile.read(4096)
                except:
                    print("Error reading, skipping file") #FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        # Keep the raw stream; remux() reads and attaches it later.
        try:
            video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
        except:
            print("Error reading thumbnail") #FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
190
def parse_playlist(playlist):
    """Parse an HLS variant (master) playlist.

    Returns a list of dicts, one per EXT-X-STREAM-INF entry, carrying the
    attribute pairs (e.g. 'BANDWIDTH', 'PROGRAM-ID') plus the variant's
    'url'.  Returns False when the text is not an M3U playlist, and an
    empty list when it contains no stream entries (the original raised
    IndexError in that case).
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip header lines up to the first stream entry.  BUG FIX: guard on
    # `lines` so a playlist with no EXT-X-STREAM-INF no longer IndexErrors.
    while lines and 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    items = []
    # Entries come in (attribute-line, url-line) pairs.
    for metadata_string, url in zip(lines[0::2], lines[1::2]):
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        md = {'url': url}
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                # partition (not split) so values containing '=' survive;
                # the original split crashed dict.update on such values.
                key, _, value = item.partition('=')
                md[key] = value
        items.append(md)
    return items
209
def parse_segment_playlist(playlisturl):
    """Fetch a media playlist and return (segment_urls, metadata).

    metadata carries the parsed EXT-X-KEY attribute dict when the stream
    is encrypted; segment URLs are made absolute against playlisturl.
    """
    text = requests.get(playlisturl).text
    assert text.startswith("#EXTM3U")
    # Splits on commas that are outside quoted strings.
    splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    urls = []
    metadata = {}
    expect_url = False
    for line in text.splitlines():
        if expect_url:
            expect_url = False
            if not line.startswith('http'):
                # Relative segment path: resolve against the playlist URL.
                line = "{}/{}".format(os.path.dirname(playlisturl), line)
            urls.append(line)
            continue
        if 'EXTINF' in line:
            # The next line holds this segment's URL.
            expect_url = True
        if "EXT-X-KEY" in line:
            attrs = line.split(':', 1)[1]  # drop the tag name
            # Magic regex split that keeps quoted values intact, then
            # build a dict from the KEY=VALUE pairs.
            parts = splitter.split(attrs)[1:-1]
            metadata["EXT-X-KEY"] = dict(
                part.split('=', 1) for part in parts if '=' in part)
    return (urls, metadata)
231
def parse_videolist():
    """Generator over every programme on svtplay's paged AJAX index.

    Yields Video records with title, description, url, thumb-url and a
    running counter ('num' out of 'total').
    """
    # The pager endpoint cannot actually page; it is queried once only to
    # learn how many pages exist.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        listing = BeautifulSoup(
            requests.get("http://www.svtplay.se/ajax/videos?sida={}".format(page)).text)
        for article in listing.findAll('article'):
            attrs = dict(article.attrs)
            video = Video()
            video['title'] = attrs['data-title']
            video['description'] = attrs['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = counter
            video['total'] = last_page * per_page
            counter += 1
            yield video
253
def remux(video, xml=None):
    """Remux the downloaded .ts into Matroska with mkvmerge.

    Optionally embeds global tags (`xml`, see mkv_metadata()) and the
    downloaded thumbnail, deletes the intermediate files, and stamps the
    result with the programme's original timestamp.  Output lands in a
    per-genre directory when video['genre'] is set.
    """
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        # str / Path works via PurePath.__rtruediv__, yielding genre/title.ts
        video['path'] = Path(video['genre'] / video['filename']).with_suffix('.mkv')
    else:
        video['path'] = video['filename'].with_suffix('.mkv')
    command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]

    if xml:
        # Side-car XML with the Matroska global tags.
        with video['filename'].with_suffix('.xml').open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(video['filename'].with_suffix('.xml'))])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f: #FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # (Subtitle muxing was commented out upstream and has been removed.)

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of intermediates.  BUG FIX: narrowed from a bare
    # `except:` so KeyboardInterrupt/SystemExit are no longer swallowed.
    for fname in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            fname.unlink()
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            ts = video['timestamp'].timestamp()
            os.utime(str(video['path']), times=(ts, ts))
        except FileNotFoundError as e:
            print(e)
292
293
def mkv_metadata(video):
    """Render a Matroska global-tags XML document for the given video.

    Only the title, description, url and genre fields are exported, each
    as a <Simple> Name/String pair under one <Tag>.
    """
    keep = ('title', 'description', 'url', 'genre')
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = doc.new_tag("Tags")
    tag = doc.new_tag("Tag")
    tags.append(tag)
    doc.append(tags)
    # Target level 50 — presumably the per-episode/movie tag level in the
    # Matroska tagging spec; confirm against the mkvmerge docs.
    targets = doc.new_tag("Targets")
    ttv = doc.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = doc.new_tag('Simple')
        name = doc.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        value = doc.new_tag('String')
        value.string = video[key]
        simple.append(value)
        tag.append(simple)
    return str(doc)
319
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if not video:
                # Scrape/download failed; nothing to remux.
                continue
            if args.no_remux:
                continue
            # BUG FIX: was `self.remux(video)` — there is no `self` at
            # module level (NameError); call the module-level remux().
            remux(video)
    # BUG FIX: this chain used `if args.mirror: ... else:`, so the url
    # branch also ran after --rss (with args.url == None) and crashed.
    elif args.mirror:
        # Track already-fetched titles as touch-files under .seen/.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record failures as touch-files under .failed/.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # BUG FIX: with --no_act, `video` was undefined here; also guard
            # against a failed scrape (returns False) before remuxing.
            if video and not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))