#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys
import os
import socket
import feedparser
from datetime import datetime, timezone


class Video(dict):
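    """Dict subclass with attribute-style access, used to carry video metadata."""
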
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)
    def is_downloaded(self):
        raise NotImplementedError


def scrape_player_page(video):
    """
    Scrape the player page for metadata and stream URLs, then download the video.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text, 'html.parser')
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not video.get('title'):  # .get avoids a KeyError when no title was passed in
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
        video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video


def download_from_playlist(video):
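    """Download the stream in video['url']: pick the highest-bandwidth
    variant from the master playlist and fetch all of its segments,
    decrypting them when the playlist carries an EXT-X-KEY.

    Returns True on success, False on read errors, and None when the
    master playlist cannot be parsed.
    """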
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # raw bytes, not text: the AES key must not be decoded
        decrypt = True
    else:
        decrypt = False
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except:
                print("Error reading, skipping file")  # FIXME mark file as failed
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec, when EXT-X-KEY carries no IV attribute the IV
                # defaults to the media sequence number as a 128-bit big-endian integer.
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                try:
                    buf = ufile.read(4096)
                except:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
    return True


def parse_playlist(playlist):
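    """Parse an HLS master playlist and return the variant streams as a
    list of Video dicts holding the EXT-X-STREAM-INF attributes (such as
    BANDWIDTH) plus the variant 'url'. Returns False if the input is not
    an m3u8 playlist.
    """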
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items


def parse_segment_playlist(playlisturl):
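    """Fetch a variant playlist and return (segments, metadata): the list
    of absolute segment URLs and a metadata dict holding the parsed
    EXT-X-KEY attributes when the stream is encrypted.
    """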
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip the tag name
            parts = PATTERN.split(row)[1:-1]  # split on commas while keeping quoted strings intact
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # drop the commas and build a dict of the pairs
    return segments, metadata


def parse_videolist():
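    """Page through the svtplay.se ajax video index and yield one Video
    per article, annotated with its running number and the estimated total.
    """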
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text, 'html.parser')  # this call does not work for fetching pages; we only use it for the page total
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text, 'html.parser')
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1


def remux(video, xml=None):
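    """Remux the downloaded .ts file into an .mkv with mkvmerge, attaching
    global tags and a thumbnail when available, then delete the
    intermediate files and set the file's mtime from the video timestamp.
    """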
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)


def mkv_metadata(video):
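    """Build a Matroska global-tags XML document from the video's title,
    description, url and genre, for use with mkvmerge --global-tags.
    """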
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all videos in the RSS feed")
    group.add_argument("-u", "--url", help="Download the video at the given URL")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if args.no_remux:
                continue
            remux(video)
            # print(e.description)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if not args.no_remux:
                remux(video)
        print("Downloaded {}".format(args.url))