39bc9ff8edba82f6a0250e6bbd706c1daad319ba
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33
class Video(dict):
    """A dict whose keys are also readable/writable as attributes.

    Used throughout the script as a lightweight record for video
    metadata (title, url, filename, ...).
    """

    def __init__(self, *args, **kwargs):
        # Delegate to dict.update so both Video(mapping) and
        # Video(key=value) construction forms work.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute writes become item writes: v.title = x <=> v['title'] = x
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads fall back to item reads; a missing key raises
        # KeyError (only called when normal attribute lookup fails).
        return self.__getitem__(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise("NotImplemented")`, which is a
        # TypeError in Python 3 (exceptions must derive from BaseException).
        raise NotImplementedError
46
def scrape_player_page(video):
    """Scrape an svtplay/oppetarkiv player page and download the stream.

    ``video`` is a mapping containing at least 'url' (absolute, or a
    path relative to svtplay.se); 'title' and 'genre' are filled in from
    the page when missing.  Returns the enriched mapping, or False when
    no stream could be found.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    # The player element carries a JSON endpoint with the stream info.
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    # BUG FIX: use .get() — the -u/--url entry point passes {'url': ...}
    # without a 'title' key, which made video['title'] raise KeyError.
    if not video.get('title'):
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # BUG FIX: the original referenced an undefined name `url` here;
        # the rtmp address is the one just stored in video['url'].
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'video' in flashvars:
        # Prefer the HLS (m3u8) reference and download it segment-wise.
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
90
def download_from_playlist(video):
    """Download every segment of the HLS stream at video['url'].

    Picks the highest-bandwidth variant from the master playlist, writes
    the concatenated segments to video['filename'], decrypting with
    AES-128-CBC when the segment playlist carries an EXT-X-KEY, and
    stashes the thumbnail stream in video['thumb'] when available.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    # Highest advertised bandwidth wins.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        # BUG FIX: fetch the key as raw bytes (.content, not .text) —
        # AES.new() in Python 3 rejects a str key.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        decrypt = True
    else:
        decrypt = False
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            # end='' keeps the carriage return effective so the progress
            # counter overwrites itself instead of printing one line per
            # segment (BUG FIX: the original print appended a newline).
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end='')
            sys.stdout.flush()
            if decrypt:
                # BUG FIX: per the HLS spec (RFC 8216) the implicit IV is
                # the media sequence number as a 128-bit BIG-endian value;
                # the original packed it native-endian with the counter in
                # the wrong (first) word.
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                buf = ufile.read(4096)
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
126
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of variant-stream dicts.

    Each dict holds the EXT-X-STREAM-INF attributes (e.g. 'BANDWIDTH')
    plus 'url', the variant playlist location.  Returns False when the
    text is not an M3U playlist, and an empty list when it contains no
    variant entries (the original crashed with IndexError in that case).
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Drop everything before the first variant entry; guard against a
    # playlist that has none at all.
    while lines and 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    items = []
    # Entries come in (metadata line, url line) pairs.
    for metadata_string, url in zip(lines[0::2], lines[1::2]):
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        md = {}
        # maxsplit=1 on ':' and '=' keeps values that themselves contain
        # those characters intact (BUG FIX: the original split on every
        # occurrence).
        for item in metadata_string.split(':', 1)[1].split(','):
            if '=' in item:
                md.update([item.split('=', 1)])
        md['url'] = url
        items.append(md)
    return items
145
def parse_segment_playlist(playlisturl):
    """Fetch an HLS variant playlist and return (segments, metadata).

    ``segments`` is a list of absolute segment URLs (relative entries
    are resolved against ``playlisturl``); ``metadata`` may contain an
    "EXT-X-KEY" dict of the key-line attributes (METHOD, URI, ...).
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    # Splits on commas, but not on commas inside single or double quotes
    # (attribute values such as URI="..." may contain commas).
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url=False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            # The line after an EXTINF tag is the segment location.
            if not row.startswith('http'): #if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url=False
            continue
        if 'EXTINF' in row:
            next_is_url=True
        if "EXT-X-KEY" in row:
            row = row.split(':',1)[1] #skip first part
            parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
            metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
    return(segments, metadata)
167
def parse_videolist():
    """Yield a Video for every item in svtplay's paged AJAX listing.

    Each yielded Video carries title, description, url, thumb-url, a
    running index 'num', and 'total' (pages * 8, an estimate of the
    overall count).
    """
    # The videospager endpoint cannot serve the listings themselves; it
    # is queried once, only to learn how many pages exist.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attrs = dict(article.attrs)
            item = Video()
            item['title'] = attrs['data-title']
            item['description'] = attrs['data-description']
            item['url'] = dict(article.find('a').attrs)['href']
            item['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            item['num'] = counter
            item['total'] = last_page * per_page
            counter += 1
            yield item
189
def remux(video, xml=None):
    """Remux the downloaded .ts file into a Matroska container.

    Invokes mkvmerge, optionally embedding global tags (``xml``, a
    matroskatags document string) and the thumbnail stored in
    video['thumb'], then removes the temporary files.  When the video
    has a 'genre', the .mkv is placed in a directory of that name.
    """
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        # Sort the result into a per-genre directory.
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        # BUG FIX: the original never set video['path'] when no genre was
        # present and crashed with KeyError when building the command.
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup; some of these files may legitimately not exist
    # (narrowed from a bare except so real bugs are not swallowed).
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
215
def mkv_metadata(video):
    """Build a Matroska global-tags XML document for mkvmerge.

    Emits a matroskatags.dtd document containing one Simple tag for each
    of the video's title/description/url/genre entries, and returns it
    as a string.
    """
    wanted = ('title', 'description', 'url', 'genre')
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    # TargetTypeValue 50 scopes these tags to the movie/episode level.
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for field in video:
        if field not in wanted:
            continue
        simple = root.new_tag('Simple')
        name_tag = root.new_tag('Name')
        name_tag.string = field.upper()
        simple.append(name_tag)
        value_tag = root.new_tag('String')
        value_tag.string = video[field]
        simple.append(value_tag)
        tag.append(simple)
    return str(root)
241
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        # feedparser is only needed for RSS mode, so import lazily.
        import feedparser
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # scrape_player_page returns False when no stream was found.
            if not video or args.no_remux:
                continue
            # BUG FIX: the original called self.remux() at module level,
            # which raised NameError; call the module function directly.
            remux(video)
        # print(e.description)
    if args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if not video or args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    # BUG FIX: this branch was a bare `else:` that also ran after RSS
    # mode (with args.url == None), and its remux call referenced `e`,
    # which only exists inside the RSS loop.
    elif args.url:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if video and not args.no_remux:
                remux(video)
            print(("Downloaded {}".format(args.url)))