Don't crash on missing file
[svtplaydump.git] / svtplaydump.py
CommitLineData
84f7ef7d 1#!/usr/bin/env python3
56181f0a 2# -*- coding: utf-8 -*-
ca2553c7
MF
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
d05b6699 20# 0.4 added mirror mode.
56181f0a 21# 0.3 added apple streaming playlist parsing and decryption
ca2553c7
MF
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
d26e6919 25from bs4 import BeautifulSoup, Doctype
ca2553c7 26from subprocess import *
89a00fa0 27import re
56181f0a
MF
28from Crypto.Cipher import AES
29import struct
72beea17 30import argparse
84f7ef7d 31import requests
d05b6699 32import sys, os
c1d3d702 33import socket
1e13b6eb
MF
34import feedparser
35from datetime import datetime, timezone
d26e6919
MF
class Video(dict):
    """Dictionary of video metadata whose keys double as attributes.

    ``video['title']`` and ``video.title`` refer to the same value, and
    assigning an attribute stores a dictionary item.
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict()``."""
        super().__init__()
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        """Store attribute assignments as dictionary items."""
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        """Look up attributes that are not real attributes among the items.

        Raises AttributeError rather than KeyError for unknown names so that
        ``hasattr()`` and ``getattr(obj, name, default)`` behave normally.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def is_downloaded(self):
        """Placeholder; always raises NotImplementedError."""
        raise NotImplementedError("is_downloaded is not implemented")
48
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like object with at least a 'url' key; a relative url
    is resolved against http://www.svtplay.se. 'title', 'genre' and
    'timestamp' are filled in from the page when they are missing, and
    'duration' is always set from the player element.

    Depending on what the player JSON offers, the stream is fetched with
    rtmpdump ('dynamicStreams'), mplayer ('pathflv') or
    download_from_playlist ('video').

    Returns the updated ``video`` mapping, or False if it has no 'url' key.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in video['url']:
        flashvars_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        flashvars_url = "http://www.svt.se/%s" % json_href
    else:
        flashvars_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(flashvars_url).json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    # .get(): callers may pass a mapping without a 'title' key at all.
    if not video.get('title'):
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        video['title'] = og_title.replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        category_label = soup.find(text='Kategori:')
        if category_label:
            video['genre'] = category_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # The stream url lives in video['url']; the bare name `url` was never defined here.
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        dated_elements = soup.find_all(datetime=True)
        if dated_elements:
            xmldate_str = dated_elements[0].attrs['datetime']
            timestamp = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = timestamp.replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
97
def download_from_playlist(video):
    """Download the highest-bandwidth variant of the playlist at video['url'].

    The segments are written back to back into ``video['filename']``. If the
    segment playlist carries an EXT-X-KEY entry, each segment is decrypted
    with AES-CBC using the key fetched from its URI. When ``video`` has a
    'thumb-url' key, the raw response stream for it is stored in
    ``video['thumb']``.

    Returns None. The function returns early, without downloading, when
    parse_playlist yields nothing, and aborts mid-file on a read error.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    decrypt = "EXT-X-KEY" in metadata
    if decrypt:
        # The key is raw binary; .content keeps the bytes intact, whereas
        # .text would run them through a text decoder.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): the IV is derived from the segment counter in
                # native byte order in the first word; confirm this matches
                # what the server expects.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                try:
                    buf = ufile.read(4096)
                except (socket.error, TypeError) as e:
                    print("Error reading, skipping file")
                    print(e)
                    return
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
138
def parse_playlist(playlist):
    """Parse the text of a master playlist into a list of stream variants.

    Each variant is a Video holding the ``NAME=value`` attributes of an
    EXT-X-STREAM-INF line plus a 'url' key taken from the next non-empty
    line that is not a tag or comment.

    Returns False (after printing the text) when ``playlist`` does not start
    with "#EXTM3U", and an empty list when it contains no stream variants.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    items = []
    pending = None  # variant whose url line has not been seen yet
    for line in playlist.splitlines():
        line = line.strip()
        if not line:
            continue
        tag, _, attributes = line.partition(':')
        if 'EXT-X-STREAM-INF' in tag:
            pending = Video()
            for item in attributes.split(','):
                if '=' in item:
                    # Split once so values that contain '=' stay intact.
                    name, value = item.split('=', 1)
                    pending[name] = value
        elif pending is not None and not line.startswith('#'):
            pending['url'] = line
            items.append(pending)
            pending = None
    return items
157
2d8521d8
MF
def parse_segment_playlist(playlisturl):
    """Fetch a segment playlist and return ``(segments, metadata)``.

    ``segments`` is the list of segment urls, i.e. the line following each
    EXTINF line; entries that do not start with 'http' are joined onto the
    directory part of ``playlisturl``. ``metadata`` maps "EXT-X-KEY" to a
    dict of that line's ``NAME=value`` attributes when such a line exists
    (quoted values keep their quotes).

    Raises ValueError if the fetched text does not start with "#EXTM3U".
    """
    playlist = requests.get(playlisturl).text
    # Explicit raise instead of assert: asserts are removed when Python runs with -O.
    if not playlist.startswith("#EXTM3U"):
        raise ValueError("Not an M3U playlist: {}".format(playlisturl))
    # Splits on commas while keeping quoted strings (which may contain commas) whole.
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip first part
            parts = PATTERN.split(row)[1:-1]  # do magic re split and keep quotes
            # throw away the commas and make dict of the pairs
            metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
    return (segments, metadata)
179
def parse_videolist():
    """Generate a Video for every article on svtplay's paged video listing.

    Each yielded Video carries 'title', 'description', 'url' and 'thumb-url'
    scraped from the article, a running index in 'num', and 'total', which is
    the number of pages multiplied by a fixed eight videos per page.
    """
    per_page = 8
    # this call does not work for getting the pages, we use it for the page totals only
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    counter = 0
    for page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            entry = Video({
                'title': attributes['data-title'],
                'description': attributes['data-description'],
                'url': dict(article.find('a').attrs)['href'],
                'thumb-url': dict(article.find('img', {}).attrs)['src'],
                'num': counter,
                'total': last_page * per_page,
            })
            counter += 1
            yield entry
201
def remux(video, xml=None):
    """Remux the downloaded transport stream into a Matroska file with mkvmerge.

    The output goes to ``<genre>/<basename>.mkv`` when ``video`` has a
    'genre' (the directory is created if needed), otherwise to
    ``<basename>.mkv``; the chosen path is stored in ``video['path']``.
    ``xml``, when given, is written to a temporary file and passed as global
    tags. ``video['thumb']``, when present, is read and attached as a JPEG
    thumbnail.

    The temporary files are removed afterwards. The source file is removed
    only if mkvmerge reports success, so a failed remux does not lose the
    download. If ``video`` has a 'timestamp', it is applied to the output
    file's access and modification times.
    """
    source = video['filename']
    # Strip only a trailing '.ts'; splitting on '.ts' would cut the name at
    # the first occurrence anywhere inside it.
    basename = source[:-len('.ts')] if source.endswith('.ts') else source
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    temp_files = []  # only files created by this call get cleaned up
    if xml:
        xml_name = basename + '.xml'
        with open(xml_name, 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', xml_name])
        temp_files.append(xml_name)
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
        temp_files.append('thumbnail.jpg')
    command.append(source)
    process = Popen(command, stdout=PIPE)
    print(process.communicate()[0])
    # mkvmerge exits with 0 on success and 1 when it finished with warnings.
    if process.returncode in (0, 1):
        temp_files.append(source)
    else:
        print("mkvmerge exited with status {}, keeping {}".format(process.returncode, source))
    for fname in temp_files:
        try:
            os.unlink(fname)
        except OSError:
            # Best effort: a leftover temporary file is not worth aborting for.
            pass
    if 'timestamp' in video:
        try:
            stamp = video['timestamp'].timestamp()
            os.utime(video['path'], times=(stamp, stamp))
        except FileNotFoundError as e:
            # The output is missing when mkvmerge failed; report, don't crash.
            print(e)
234
d26e6919
MF
235
def mkv_metadata(video):
    """Return a Matroska tags XML document, as a string, for ``video``.

    The document holds a single <Tag> with a <Targets> element whose
    TargetTypeValue is 50, followed by one <Simple> name/string pair for each
    of the 'title', 'description', 'url' and 'genre' keys present in
    ``video``. Names are the upper-cased keys.
    """
    exported_keys = ('title', 'description', 'url', 'genre')
    document = BeautifulSoup(features='xml')

    def text_element(tag_name, text):
        # Create <tag_name>text</tag_name> owned by the document.
        node = document.new_tag(tag_name)
        node.string = text
        return node

    document.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    container = document.new_tag("Tags")
    entry = document.new_tag("Tag")
    container.append(entry)
    document.append(container)

    target = document.new_tag("Targets")
    target.append(text_element("TargetTypeValue", str(50)))
    entry.append(target)

    for key in video:
        if key in exported_keys:
            pair = document.new_tag('Simple')
            pair.append(text_element('Name', key.upper()))
            pair.append(text_element('String', video[key]))
            entry.append(pair)
    return str(document)
56181f0a 261
def _can_remux(video):
    """Return True if scraping succeeded and left a downloaded file to remux."""
    return bool(video) and 'filename' in video


def main():
    """Parse the command line and run exactly one of the rss, url or mirror modes."""
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        feed = feedparser.parse(args.rss)
        for entry in feed.entries:
            print("Downloading: %s" % entry.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': entry.title, 'url': entry.link})
            if args.no_remux or not _can_remux(video):
                continue
            remux(video)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            seen_marker = os.path.join('.seen', video['title'])
            if os.path.exists(seen_marker):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(seen_marker, 'w').close()  # touch
            scraped = scrape_player_page(video)
            if args.no_remux or not _can_remux(scraped):
                continue
            xml = mkv_metadata(scraped)
            remux(scraped, xml)
    else:
        if not args.no_act:
            # An empty title makes scrape_player_page take it from the page.
            video = scrape_player_page({'title': None, 'url': args.url})
            if not args.no_remux and _can_remux(video):
                remux(video)
        print("Downloaded {}".format(args.url))


if __name__ == "__main__":
    main()