# Don't crash on missing file
# [svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
class Video(dict):
    """Dictionary subclass whose keys are also reachable as attributes.

    Used throughout the script as a light-weight record for video
    metadata (title, url, filename, thumb-url, ...).
    """

    def __init__(self, *args, **kwargs):
        # Accept the same arguments dict() itself accepts and use the
        # free dict.update to populate the keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment stores a key: v.title = x == v['title'] = x
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute access reads a key.  Translate a missing key into
        # AttributeError so hasattr()/getattr(..., default) behave as
        # the attribute protocol requires (the original leaked KeyError).
        try:
            return self.__getitem__(name)
        except KeyError:
            raise AttributeError(name)

    def is_downloaded(self):
        # The original did `raise("NotImplemented")`, which is a
        # TypeError at runtime (a str is not an exception); raise the
        # proper built-in exception type instead.
        raise NotImplementedError
48
def scrape_player_page(video):
    """Scrape an svtplay/oppetarkiv player page and download the video.

    `video` is a dict with at least a 'url' key ('title' is optional
    and will be scraped from the page when absent or empty).  Fills in
    title, genre, duration, timestamp, stream url and filename,
    triggers the actual download, and returns the enriched dict, or
    False when no stream could be found.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    # .get() instead of ['title']: the -u command-line branch passes a
    # dict without a 'title' key, which used to raise KeyError here.
    if not video.get('title'):
        # Fall back to the og:title meta tag; sanitize characters that
        # are unsafe in filenames.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # BUG FIX: the original referenced an undefined name `url` here
        # (NameError); the rtmp stream url lives in video['url'].
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            # feedparser yields a naive UTC time tuple; make it aware,
            # then convert to local time.
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
97
def download_from_playlist(video):
    """Download the best-quality variant of an HLS playlist to video['filename'].

    Fetches the master playlist at video['url'], picks the variant with
    the highest advertised BANDWIDTH, streams every media segment into
    a single .ts file (decrypting with AES-128-CBC when the playlist
    carries an EXT-X-KEY tag), and finally fetches the thumbnail stream
    if a 'thumb-url' key is present.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    # Highest bandwidth variant wins.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'): #if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        # NOTE(review): .text yields a str; PyCrypto's AES.new expects
        # bytes on Python 3 — confirm this path works when encrypted.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        decrypt=True
    else:
        decrypt=False
    with open("%s"%video['filename'],"wb") as ofile:
        segment=0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            # \r keeps a single updating progress line on the terminal.
            print("\r{0:.2f} MB".format(size/1024/1024),end="")
            sys.stdout.flush()
            if decrypt:
                # IV derived from the segment index, packed as four
                # native u32s.  NOTE(review): RFC 8216 specifies a
                # big-endian 128-bit media sequence number — confirm.
                iv=struct.pack("IIII",segment,0,0,0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while(True):
                try:
                    buf = ufile.read(4096)
                except (socket.error, TypeError) as e:
                    # Best-effort: abandon this file on a read error.
                    print("Error reading, skipping file")
                    print(e)
                    return
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        # Raw stream object; remux() reads it when attaching the thumbnail.
        video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
138
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of variant descriptors.

    Returns a list of Video dicts, one per EXT-X-STREAM-INF entry,
    holding that line's attributes (e.g. 'BANDWIDTH') plus a 'url' key
    for the variant playlist.  Returns False for non-M3U input and an
    empty list when the playlist contains no variant streams.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip everything before the first variant entry.  The original
    # `while not ... in playlist[0]` loop raised IndexError on any
    # playlist with no EXT-X-STREAM-INF line at all; guard that case.
    start = next((i for i, line in enumerate(lines)
                  if 'EXT-X-STREAM-INF' in line), None)
    if start is None:
        return []
    lines = lines[start:]
    items = []
    # Variants come as (metadata line, url line) pairs.
    for metadata_string, url in zip(lines[0::2], lines[1::2]):
        md = Video()
        if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
157
def parse_segment_playlist(playlisturl):
    """Fetch a media playlist and return (segment_urls, metadata).

    `segment_urls` lists every media segment URI (relative paths are
    resolved against the playlist's directory).  `metadata` maps
    "EXT-X-KEY" to a dict of that tag's attributes (METHOD, URI, ...)
    when the playlist is encrypted.
    """
    body = requests.get(playlisturl).text
    assert body.startswith("#EXTM3U")
    # Split on commas that sit outside quoted strings, keeping quotes.
    splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    urls = []
    meta = {}
    expect_url = False
    for line in body.splitlines():
        if expect_url:
            # The line following an EXTINF tag is the segment URI.
            if line.startswith('http'):
                urls.append(line)
            else:
                urls.append("{}/{}".format(os.path.dirname(playlisturl), line))
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            # Drop the tag name, split the attribute list on unquoted
            # commas, and build a dict of KEY=VALUE pairs.
            attr_parts = splitter.split(line.split(':', 1)[1])[1:-1]
            meta["EXT-X-KEY"] = dict(part.split('=', 1)
                                     for part in attr_parts if '=' in part)
    return (urls, meta)
179
def parse_videolist():
    """Generator yielding a Video for every programme on svtplay.se.

    Pages through the site's AJAX video listing; each yielded Video
    carries title, description, url, thumb-url, plus a running index
    ('num') and the estimated total ('total').
    """
    page_num = 1
    # This endpoint is only used to read the page total; the actual
    # listings come from /ajax/videos below.
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while(page_num <= page_tot):
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
            video['num'] = video_num
            # Estimate only: the last page may hold fewer than 8 items.
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1
201
def remux(video, xml=None):
    """Remux the downloaded .ts file into an .mkv via mkvmerge.

    Optionally embeds `xml` as Matroska global tags and the fetched
    thumbnail as an attachment, files the result into a genre
    subdirectory when video['genre'] is set, deletes the intermediate
    files, and stamps the .mkv with the original broadcast time.
    """
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f: #FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of intermediates.  Only swallow filesystem
    # errors; the original bare `except:` also hid KeyboardInterrupt
    # and SystemExit.
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        ts = video['timestamp'].timestamp()
        try:
            # Set both atime and mtime to the broadcast time.
            os.utime(video['path'], times=(ts, ts))
        except FileNotFoundError as e:
            # mkvmerge may have failed; report rather than crash.
            print(e)
234
235
def mkv_metadata(video):
    """Build a Matroska global-tags XML document from video metadata.

    Emits one <Simple> Name/String pair per whitelisted key
    (title, description, url, genre) and returns the XML as a string
    suitable for mkvmerge's --global-tags.
    """
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = doc.new_tag("Tags")
    tag = doc.new_tag("Tag")
    tags.append(tag)
    doc.append(tags)
    # Target type value 50 marks these as movie/episode level tags.
    targets = doc.new_tag("Targets")
    target_value = doc.new_tag("TargetTypeValue")
    target_value.string = str(50)
    targets.append(target_value)
    tag.append(targets)
    wanted = ('title', 'description', 'url', 'genre')
    for key in video:
        if key not in wanted:
            continue
        simple = doc.new_tag('Simple')
        name_tag = doc.new_tag('Name')
        name_tag.string = key.upper()
        simple.append(name_tag)
        value_tag = doc.new_tag('String')
        value_tag.string = video[key]
        simple.append(value_tag)
        tag.append(simple)
    return str(doc)
261
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    # The three modes are mutually exclusive, so chain them with elif:
    # the original fell through from the rss/mirror branches into the
    # url branch and crashed on args.url being None.
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # scrape_player_page returns False when no stream is found.
            if args.no_remux or not video:
                continue
            # BUG FIX: the original called self.remux() at module level,
            # where no `self` exists.
            remux(video)
            #print(e.description)
    elif args.mirror:
        # .seen holds one touch-file per downloaded title so a rerun
        # skips what is already mirrored.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if args.no_remux or not video:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if not args.no_act:
            # 'title' must be present (scrape_player_page reads it);
            # None makes the scraper pull the title from the page.
            video = scrape_player_page({'url': args.url, 'title': None})
            # BUG FIX: the original called remux({'title': e.title})
            # here, where `e` is undefined and the dict lacks the
            # 'filename' key remux() needs; remux the scraped video.
            if not args.no_remux and video:
                remux(video)
        print(("Downloaded {}".format(args.url)))