#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys
import os
import feedparser
from datetime import datetime, timezone

class Video(dict):
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def is_downloaded(self):
        raise NotImplementedError

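# A minimal usage sketch (names hypothetical): keys double as attributes,
# so v.title and v['title'] refer to the same entry.
#   v = Video(title='Example')
#   v.url = '/video/123'   # same as v['url'] = '/video/123'
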
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text, 'html.parser')
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in UTC
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
        video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

def download_from_playlist(video):
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the param string
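        # Derived from the example string above, the resulting dict looks like:
        #   {'name': 'Svenska', 'default': 'yes', 'forced': 'no',
        #    'uri': 'http://media.svt.se/.../index.m3u8', 'lang': 'sv'}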
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return False
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # the key is raw bytes, not text
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with open(video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec the default IV is the media sequence number as a
                # 16-byte big-endian integer (this assumes the sequence starts at 0).
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True

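# parse_playlist() expects an HLS master playlist. A minimal example of the
# input format (URLs hypothetical):
#
#   #EXTM3U
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000
#   http://example.com/low/index.m3u8
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000
#   http://example.com/high/index.m3u8
#
# Each EXT-X-STREAM-INF/URL pair becomes one Video() item, e.g.
#   {'PROGRAM-ID': '1', 'BANDWIDTH': '1280000', 'url': 'http://example.com/low/index.m3u8'}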
def parse_playlist(playlist):
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items

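# parse_segment_playlist() expects a media playlist for one variant, e.g.
# (URLs hypothetical):
#
#   #EXTM3U
#   #EXT-X-KEY:METHOD=AES-128,URI="https://example.com/crypt.key"
#   #EXTINF:10,
#   segment_00001.ts
#
# It returns the absolute segment URLs plus a metadata dict such as
#   {'EXT-X-KEY': {'METHOD': 'AES-128', 'URI': '"https://example.com/crypt.key"'}}
# (note the URI value keeps its quotes; they are stripped by the caller).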
def parse_segment_playlist(playlisturl):
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip the tag name
            parts = PATTERN.split(row)[1:-1]  # re split that keeps quoted commas intact
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # throw away the commas and make a dict of the pairs
    return segments, metadata

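# parse_videolist() below walks svtplay's paginated ajax listing and yields
# one Video() per <article>; illustratively (values hypothetical):
#   {'title': 'Rapport', 'description': '...', 'url': '/video/1234567',
#    'thumb-url': 'http://www.svtplay.se/.../thumb.jpg', 'num': 0, 'total': 800}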
def parse_videolist():
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text, 'html.parser')  # this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text, 'html.parser')
        for article in soup.find_all('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1

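# remux() shells out to mkvmerge. For a fully populated video dict the
# assembled command looks roughly like (paths illustrative):
#   mkvmerge -o Genre/Title.mkv --title Title --global-tags Title.xml \
#     --attachment-description Thumbnail --attachment-mime-type image/jpeg \
#     --attach-file thumbnail.jpg Title.ts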
def remux(video, xml=None):
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']), 'wb') as f:
    #                 f.write(bytes("".join(sub['download']), 'utf-8'))  # FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'], sub['lang'])])

    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)

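# mkv_metadata() renders a Matroska global-tags XML document, roughly:
#   <?xml version="1.0" encoding="utf-8"?>
#   <!DOCTYPE Tags SYSTEM "matroskatags.dtd">
#   <Tags><Tag>
#     <Targets><TargetTypeValue>50</TargetTypeValue></Targets>
#     <Simple><Name>TITLE</Name><String>...</String></Simple>
#   </Tag></Tags>
# with one <Simple> element per kept key (title, description, url, genre).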
def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)

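# Example invocations (sketch; URLs hypothetical):
#   ./svtplaydump.py -u http://www.svtplay.se/video/1234567     # one video
#   ./svtplaydump.py -m                                         # mirror everything
#   ./svtplaydump.py -r http://example.com/feed.rss --no_remux  # rss, keep .ts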
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if not video or args.no_remux:  # skip remux when scraping failed
                continue
            remux(video)
            # print(e.description)
    if args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if video and not args.no_remux:  # skip remux when scraping failed
                remux(video)
        print("Downloaded {}".format(args.url))