]> git.frykholm.com Git - svtplaydump.git/blame - svtplaydump.py
more error handling
[svtplaydump.git] / svtplaydump.py
CommitLineData
84f7ef7d 1#!/usr/bin/env python3
56181f0a 2# -*- coding: utf-8 -*-
ca2553c7
MF
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
d05b6699 20# 0.4 added mirror mode.
56181f0a 21# 0.3 added apple streaming playlist parsing and decryption
ca2553c7
MF
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
d26e6919 25from bs4 import BeautifulSoup, Doctype
ca2553c7 26from subprocess import *
89a00fa0 27import re
56181f0a
MF
28from Crypto.Cipher import AES
29import struct
72beea17 30import argparse
84f7ef7d 31import requests
d05b6699 32import sys, os
c1d3d702 33import socket
1e13b6eb
MF
34import feedparser
35from datetime import datetime, timezone
d26e6919
MF
class Video(dict):
    """Dictionary describing one video whose keys double as attributes.

    ``video.title`` and ``video['title']`` refer to the same entry, for both
    reading and writing.
    """

    def __init__(self, *args, **kwargs):
        # Accepts everything dict() accepts: a mapping, an iterable of
        # key/value pairs and/or keyword arguments.
        super().__init__(*args, **kwargs)

    def __setattr__(self, name, value):
        # Attribute assignment stores a dictionary key instead of an
        # instance attribute.
        self[name] = value

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails.  A missing key
        # must surface as AttributeError (not KeyError) so that hasattr()
        # and getattr(obj, name, default) keep working.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def is_downloaded(self):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError("is_downloaded")
48
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like object that must contain 'url' and may contain
    'title', 'genre' and 'timestamp'.  It is updated in place with whatever
    could be scraped ('duration', 'title', 'genre', 'timestamp', 'url',
    'filename', 'category').

    Returns the updated ``video`` on success, or False when no stream could
    be found or the playlist download failed.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    # The player link carries the location of the JSON stream description.
    video_player = soup.body('a', {'data-json-href': True})[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in video['url']:
        json_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        json_url = "http://www.svt.se/%s" % json_href
    else:
        json_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(json_url).json()

    video['duration'] = video_player.attrs.get('data-length', 0)
    # Callers may omit the title entirely, so do not index it directly.
    if not video.get('title'):
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        # '|' and '/' are replaced because the title is used as a file name.
        video['title'] = og_title.replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        category_label = soup.find(text='Kategori:')
        if category_label:
            video['genre'] = category_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'

    if 'dynamicStreams' in flashvars:
        stream = flashvars['dynamicStreams'][0]
        video['url'] = stream.split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # The stream address is the url that was just extracted.
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']],
                    stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp],
                    stdout=PIPE).communicate()[0])

    if 'timestamp' not in video:
        dated = soup.find_all(datetime=True)
        if dated:
            xmldate_str = dated[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time

    if 'video' in flashvars:
        found_playlist = False
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                found_playlist = True
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not found_playlist:
            # Without an m3u8 reference neither 'url' nor 'filename' describe
            # a playlist, so there is nothing download_from_playlist can use.
            print("Could not find any streams")
            return False
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
98
def download_from_playlist(video):
    """Download the highest-bandwidth variant of an HLS playlist.

    Reads the master playlist at ``video['url']``, picks the variant with the
    largest BANDWIDTH, and writes all of its segments (decrypted when the
    segment playlist carries an EXT-X-KEY entry) to ``video['filename']``.
    If ``video`` has a 'thumb-url', the raw thumbnail stream is stored in
    ``video['thumb']``.

    Returns True on success and False when the playlist could not be parsed
    or a segment could not be read.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return False
    videourl = max(playlist, key=lambda k: int(k['BANDWIDTH']))['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)

    decrypt = "EXT-X-KEY" in metadata
    if decrypt:
        # The key is raw binary data; it must not be decoded as text.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content

    with open(video['filename'], "wb") as ofile:
        size = 0
        for segment, url in enumerate(segments):
            try:
                response = requests.get(url, stream=True)
            except Exception as err:  # keep Ctrl-C working, unlike a bare except
                print("Error reading, skipping file")  # FIXME mark file as failed
                print(err)
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): the IV is derived from the segment index in
                # native byte order; the HLS spec describes a big-endian
                # media sequence number -- confirm against real streams.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            try:
                while True:
                    try:
                        buf = response.raw.read(4096)
                    except Exception as err:
                        print("Error reading, skipping file")  # FIXME mark file as failed
                        print(err)
                        return False
                    if not buf:
                        break
                    if decrypt:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection whether or not the segment completed.
                response.close()

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
    return True
d26e6919 145
def parse_playlist(playlist):
    """Parse the text of an HLS master playlist into a list of variants.

    Each returned item is a Video holding the attributes of one
    #EXT-X-STREAM-INF line (e.g. 'BANDWIDTH', values kept as strings with any
    surrounding quotes intact) plus 'url', the URI line that follows it.

    Returns False (after printing the text) when ``playlist`` does not start
    with "#EXTM3U", and an empty list when it contains no stream entries.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    # NAME=value pairs; a quoted value may itself contain commas.
    attribute = re.compile(r'([A-Za-z0-9-]+)=("[^"]*"|[^,]*)')
    items = []
    pending = None  # variant whose URI line has not been seen yet
    for line in playlist.splitlines():
        line = line.strip()
        if line.startswith('#EXT-X-STREAM-INF'):
            pending = Video()
            # Everything after the first ':' is the attribute list.
            pending.update(attribute.findall(line.partition(':')[2]))
        elif pending is not None and line and not line.startswith('#'):
            # First non-blank, non-comment line after the tag is the URI.
            pending['url'] = line
            items.append(pending)
            pending = None
    return items
164
2d8521d8
MF
def parse_segment_playlist(playlisturl):
    """Fetch a segment playlist and return ``(segments, metadata)``.

    ``segments`` is the list of segment URLs in playlist order; entries not
    starting with 'http' are prefixed with the directory of ``playlisturl``.
    ``metadata`` holds an 'EXT-X-KEY' dict of that tag's attributes when the
    playlist contains one (quotes around values are kept).
    """
    body = requests.get(playlisturl).text
    assert body.startswith("#EXTM3U")
    splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    base = os.path.dirname(playlisturl)
    segments = []
    metadata = {}
    expect_url = False
    for line in body.splitlines():
        if expect_url:
            # The line right after an EXTINF tag is the segment location.
            expect_url = False
            if not line.startswith('http'):  # relative path
                line = "{}/{}".format(base, line)
            segments.append(line)
            continue
        expect_url = 'EXTINF' in line
        if "EXT-X-KEY" in line:
            attributes = line.split(':', 1)[1]  # drop the tag name
            # Splitting on the capture group keeps quoted values whole; the
            # first and last pieces are the empty strings around the matches.
            pieces = splitter.split(attributes)[1:-1]
            # The comma separators have no '=' and are filtered out here.
            metadata["EXT-X-KEY"] = {
                name: value
                for name, value in (piece.split('=', 1) for piece in pieces if '=' in piece)
            }
    return (segments, metadata)
186
def parse_videolist():
    """Yield a Video for every article on every svtplay.se listing page.

    Each Video carries 'title', 'description', 'url', 'thumb-url', a running
    index 'num' and 'total', an estimate computed as page count times eight
    videos per page.
    """
    # This call does not work for getting the pages; it is only used to
    # learn how many pages there are.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        page_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(page_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            entry = Video()
            entry['title'] = attributes['data-title']
            entry['description'] = attributes['data-description']
            entry['url'] = dict(article.find('a').attrs)['href']
            entry['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            entry['num'] = counter
            entry['total'] = last_page * per_page
            counter += 1
            yield entry
208
def remux(video, xml=None):
    """Remux the downloaded transport stream into a Matroska file.

    Runs mkvmerge on ``video['filename']`` and writes the result to
    ``video['path']`` (inside a directory named after ``video['genre']`` when
    a genre is set).  ``xml``, if given, is written to a temporary file and
    passed as global tags; ``video['thumb']``, if present, is attached as a
    thumbnail.  Temporary files are removed afterwards; the source file is
    removed only when mkvmerge did not report an error.  If the video has a
    'timestamp', it is applied to the output file's access/modification time.
    """
    filename = video['filename']
    # Strip only a trailing '.ts'; a '.ts' in the middle of the name must not
    # truncate it.
    basename = filename[:-len('.ts')] if filename.endswith('.ts') else filename
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(filename)
    process = Popen(command, stdout=PIPE)
    print(process.communicate()[0])

    leftovers = [basename + '.xml', 'thumbnail.jpg']
    # mkvmerge exits with 0 on success, 1 on warnings and 2 on errors; keep
    # the downloaded stream when the remux failed so no data is lost.
    if process.returncode < 2:
        leftovers.insert(0, filename)
    else:
        print("mkvmerge failed with exit code {}, keeping {}".format(process.returncode, filename))
    for fname in leftovers:
        try:
            os.unlink(fname)
        except OSError:
            # Best effort: the temporary files only exist in some modes.
            pass
    if 'timestamp' in video:
        stamp = video['timestamp'].timestamp()
        try:
            os.utime(video['path'], times=(stamp, stamp))
        except FileNotFoundError as e:
            print(e)
1e111d91 241
d26e6919
MF
242
def mkv_metadata(video):
    """Return a Matroska tags XML document (as a string) for ``video``.

    The document has one Tag with TargetTypeValue 50 and a Simple name/string
    pair for each of the keys 'title', 'description', 'url' and 'genre'
    present in ``video``, with the key name upper-cased.
    """
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))

    def text_element(tag_name, text):
        # Build <tag_name>text</tag_name>.
        node = doc.new_tag(tag_name)
        node.string = text
        return node

    tags_node = doc.new_tag("Tags")
    tag_node = doc.new_tag("Tag")
    tags_node.append(tag_node)
    doc.append(tags_node)

    targets = doc.new_tag("Targets")
    targets.append(text_element("TargetTypeValue", "50"))
    tag_node.append(targets)

    wanted = ('title', 'description', 'url', 'genre')
    for key in video:
        if key in wanted:
            simple = doc.new_tag('Simple')
            simple.append(text_element('Name', key.upper()))
            simple.append(text_element('String', video[key]))
            tag_node.append(simple)
    return str(doc)
56181f0a 268
if __name__ == "__main__":
    # Command line entry point: exactly one of --rss, --url or --mirror
    # selects what to download.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        feed = feedparser.parse(args.rss)
        for entry in feed.entries:
            print("Downloading: %s" % entry.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': entry.title, 'url': entry.link})
            # A failed scrape returns False; there is nothing to remux then.
            if not video or args.no_remux:
                continue
            remux(video)
    elif args.mirror:
        # '.seen' holds one empty marker file per video already attempted.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Remember failures separately so they can be inspected later.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if args.no_act:
            print("Would download {}".format(args.url))
        else:
            # An empty title makes the scraper take it from the page itself.
            video = scrape_player_page({'title': '', 'url': args.url})
            if not video:
                sys.exit("Could not download {}".format(args.url))
            if not args.no_remux:
                remux(video)
            print("Downloaded {}".format(args.url))