modification date
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
class Video(dict):
    """Dictionary of video metadata whose keys double as attributes.

    ``video.title`` and ``video['title']`` read and write the same entry.
    """

    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. A missing key must
        # surface as AttributeError so that hasattr() and getattr(obj, name,
        # default) behave as expected.
        try:
            return self.__getitem__(name)
        except KeyError:
            raise AttributeError(name) from None

    def is_downloaded(self):
        """Placeholder; always raises NotImplementedError."""
        raise NotImplementedError("is_downloaded")
48
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    video is a dict-like object with at least a 'url' key (absolute, or a
    path relative to http://www.svtplay.se). 'title', 'genre' and
    'timestamp' are filled in from the page when they are missing.

    Depending on what the player JSON advertises, the stream is dumped with
    rtmpdump ('dynamicStreams'), mplayer ('pathflv') or fetched segment by
    segment through download_from_playlist ('video' with an m3u8 reference).

    Returns the updated video mapping, or False if it has no 'url' key.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in video['url']:
        flashvars_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        flashvars_url = "http://www.svt.se/%s" % json_href
    else:
        flashvars_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(flashvars_url).json()

    video['duration'] = video_player.attrs.get('data-length', 0)
    # .get(): callers may pass a mapping without any 'title' key at all.
    if not video.get('title'):
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        video['title'] = og_title.replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        category_label = soup.find(text='Kategori:')
        if category_label:
            video['genre'] = category_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'

    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # The stream address is video['url']; there is no bare `url` name in
        # this function.
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])

    if 'timestamp' not in video:
        dated = soup.find_all(datetime=True)
        if dated:
            xmldate_str = dated[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time

    if 'video' in flashvars:
        found_playlist = False
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                found_playlist = True
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        # Without an m3u8 reference video['url'] still points at the HTML
        # page and 'filename' is unset, so there is nothing to download.
        if found_playlist:
            download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
97
def download_from_playlist(video):
    """Download the highest-bandwidth HLS variant of video into video['filename'].

    video['url'] must point at a master playlist. The variant with the
    largest BANDWIDTH is chosen, its segments are fetched in order and
    appended to the output file, decrypting with AES-128-CBC when the
    segment playlist carries an EXT-X-KEY whose METHOD is not NONE.

    If video has a 'thumb-url' key, a streaming file-like object for the
    thumbnail is stored under video['thumb'].

    Returns None. On a read error the function prints the error and returns
    early, leaving a partial output file and no thumbnail.
    """
    # Function-scope import so the block does not depend on the file's
    # import section.
    from urllib.parse import urljoin

    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    # urljoin leaves absolute URLs alone and resolves relative ones properly
    # (os.path.dirname breaks on query strings that contain '/').
    videourl = urljoin(video['url'], videourl)
    segments, metadata = parse_segment_playlist(videourl)

    key_info = metadata.get("EXT-X-KEY")
    decrypt = bool(key_info) and key_info.get('METHOD') != 'NONE'
    key = None
    explicit_iv = None
    if decrypt:
        key_url = urljoin(videourl, key_info['URI'].strip('"'))
        # The key is 16 raw bytes; .text would run it through a text decoder.
        key = requests.get(key_url).content
        if 'IV' in key_info:
            # Hexadecimal attribute such as 0x0123...; int() accepts the prefix.
            explicit_iv = int(key_info['IV'].strip('"'), 16).to_bytes(16, 'big')

    with open("%s" % video['filename'], "wb") as ofile:
        size = 0
        for segment, url in enumerate(segments):
            response = requests.get(url, stream=True)
            ufile = response.raw
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                if explicit_iv is not None:
                    iv = explicit_iv
                else:
                    # RFC 8216: without an IV attribute the IV is the segment's
                    # sequence number as a 128-bit big-endian integer.
                    # NOTE(review): numbering is assumed to start at 0 because
                    # the segment parser does not report EXT-X-MEDIA-SEQUENCE.
                    iv = segment.to_bytes(16, 'big')
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            try:
                while True:
                    try:
                        buf = ufile.read(4096)
                    except (socket.error, TypeError) as e:
                        print("Error reading, skipping file")
                        print(e)
                        return
                    if not buf:
                        break
                    if decrypt:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection; one is opened per segment.
                response.close()

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
138
def parse_playlist(playlist):
    """Parse the text of an HLS master playlist into a list of variants.

    playlist is the playlist body as a string. If it does not start with
    "#EXTM3U" the text is printed and False is returned.

    Otherwise returns a list of Video objects, one per EXT-X-STREAM-INF
    entry, holding the entry's KEY=VALUE attributes (as strings) plus a
    'url' key with the URI line that follows it. The list is empty when the
    playlist has no EXT-X-STREAM-INF entries.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    items = []
    pending = None  # most recent EXT-X-STREAM-INF line still waiting for its URI
    for line in playlist.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('#EXT-X-STREAM-INF'):
            pending = line
            continue
        if line.startswith('#') or pending is None:
            # Other tags/comments, or a URI that belongs to no variant.
            continue
        md = Video()
        attributes = pending.partition(':')[2]
        for item in attributes.split(','):
            if '=' in item:
                # Split once so values that contain '=' stay intact.
                md.update([item.split('=', 1)])
        md['url'] = line
        items.append(md)
        pending = None
    return items
157
def parse_segment_playlist(playlisturl):
    """Fetch an HLS media playlist and list its segment URLs.

    playlisturl is the URL of the media (segment) playlist.

    Returns a tuple (segments, metadata): segments is the list of absolute
    segment URLs in playlist order; metadata is a dict that, when the
    playlist has an EXT-X-KEY tag, maps "EXT-X-KEY" to a dict of that tag's
    attributes (quoted values keep their quotes). If several EXT-X-KEY tags
    are present the last one wins.

    Raises ValueError if the response does not start with "#EXTM3U".
    """
    # Function-scope import so the block does not depend on the file's
    # import section.
    from urllib.parse import urljoin

    playlist = requests.get(playlisturl).text
    if not playlist.startswith("#EXTM3U"):
        # Explicit check instead of assert, which is stripped under -O.
        raise ValueError("not an M3U playlist: {}".format(playlisturl))
    # One attribute per match; commas inside quoted values do not split.
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    next_is_url = False
    for row in playlist.splitlines():
        row = row.strip()
        if not row:
            continue
        if next_is_url and not row.startswith('#'):
            # urljoin keeps absolute URLs and resolves relative ones against
            # the playlist location.
            segments.append(urljoin(playlisturl, row))
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if row.startswith("#EXT-X-KEY:"):
            attributes = row.split(':', 1)[1]  # skip the tag name
            metadata["EXT-X-KEY"] = dict(
                part.split('=', 1)
                for part in PATTERN.findall(attributes)
                if '=' in part
            )
    return (segments, metadata)
179
def parse_videolist():
    """Yield a Video for every article on the svtplay.se video listing pages.

    Each yielded Video carries 'title', 'description', 'url', 'thumb-url',
    a running 'num' counter and 'total', which is the page count multiplied
    by a fixed figure of 8 videos per page.
    """
    videos_per_page = 8
    # this call does not work for getting the pages, we use it for the page totals only
    pager_html = requests.get("http://www.svtplay.se/ajax/videospager").text
    pager_link = BeautifulSoup(pager_html).find('a', {'data-currentpage': True})
    page_tot = int(pager_link.attrs['data-lastpage'])

    video_num = 0
    for page_num in range(1, page_tot + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            meta = dict(article.attrs)
            link_attrs = dict(article.find('a').attrs)
            image_attrs = dict(article.find('img', {}).attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = link_attrs['href']
            video['thumb-url'] = image_attrs['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
201
def remux(video, xml=None):
    """Remux the downloaded transport stream into a Matroska file with mkvmerge.

    video must have 'filename' (the downloaded file) and 'title'. If it has
    'genre', the output goes into a directory of that name, created when
    needed. 'thumb' (a readable file-like object) is attached as a JPEG
    thumbnail, and 'timestamp' (a datetime) is applied as the output file's
    access and modification time.

    xml, when given, is written to a sidecar file and passed to mkvmerge as
    global tags.

    The output location is stored in video['path']. The source file and the
    temporary xml/thumbnail files are removed afterwards.
    """
    filename = video['filename']
    # Strip only a trailing ".ts"; splitting on '.ts' would cut a title that
    # contains ".ts" in the middle.
    basename = filename[:-len('.ts')] if filename.endswith('.ts') else filename
    if 'genre' in video:
        # exist_ok avoids the check-then-create race.
        os.makedirs(video['genre'], exist_ok=True)
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            # Best-effort cleanup: the xml/thumbnail files may never have
            # been created.
            pass
    if 'timestamp' in video:
        stamp = video['timestamp'].timestamp()
        os.utime(video['path'], times=(stamp, stamp))
231
232
def mkv_metadata(video):
    """Build the tags XML document for video and return it as a string.

    Only the 'title', 'description', 'url' and 'genre' entries of video are
    emitted, each as a <Simple> element with an upper-cased <Name> and the
    value in <String>, in the order the keys appear in video.
    """
    wanted = ('title', 'description', 'url', 'genre')

    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))

    # <Tags><Tag>...</Tag></Tags> skeleton.
    tags_el = doc.new_tag("Tags")
    tag_el = doc.new_tag("Tag")
    tags_el.append(tag_el)
    doc.append(tags_el)

    # <Targets><TargetTypeValue>50</TargetTypeValue></Targets>
    target_type = doc.new_tag("TargetTypeValue")
    target_type.string = str(50)
    targets_el = doc.new_tag("Targets")
    targets_el.append(target_type)
    tag_el.append(targets_el)

    for field in video:
        if field in wanted:
            name_el = doc.new_tag('Name')
            name_el.string = field.upper()
            value_el = doc.new_tag('String')
            value_el.string = video[field]
            simple_el = doc.new_tag('Simple')
            simple_el.append(name_el)
            simple_el.append(value_el)
            tag_el.append(simple_el)
    return str(doc)
258
if __name__ == "__main__":
    # Command-line entry point: exactly one of --rss, --url or --mirror
    # selects what to download.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    # The three modes are mutually exclusive, so this is one if/elif/else
    # chain; the url branch must not run after rss mode.
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # Scraping may return False, and only playlist downloads set
            # 'filename', which remux needs.
            if args.no_remux or not video or 'filename' not in video:
                continue
            remux(video)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if args.no_remux or not video or 'filename' not in video:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if not args.no_act:
            # 'title': None makes the scraper take the title from the page.
            video = scrape_player_page({'title': None, 'url': args.url})
            if not args.no_remux and video and 'filename' in video:
                remux(video)
        print(("Downloaded {}".format(args.url)))