]> git.frykholm.com Git - svtplaydump.git/blame - svtplaydump.py
Update to use pathlib
[svtplaydump.git] / svtplaydump.py
CommitLineData
fa7d6ee2 1#!/usr/bin/env python3.4
56181f0a 2# -*- coding: utf-8 -*-
ca2553c7
MF
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
d05b6699 20# 0.4 added mirror mode.
56181f0a 21# 0.3 added apple streaming playlist parsing and decryption
ca2553c7
MF
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
d26e6919 25from bs4 import BeautifulSoup, Doctype
ca2553c7 26from subprocess import *
89a00fa0 27import re
56181f0a
MF
28from Crypto.Cipher import AES
29import struct
72beea17 30import argparse
84f7ef7d 31import requests
d05b6699 32import sys, os
c1d3d702 33import socket
1e13b6eb
MF
34import feedparser
35from datetime import datetime, timezone
fa7d6ee2
MF
36from pathlib import Path
37
d26e6919
MF
class Video(dict):
    """A dict whose keys are also readable/writable as attributes.

    Used as a lightweight record for scraped video metadata
    ('title', 'url', 'filename', ...).
    """

    def __init__(self, *args, **kwargs):
        # Use dict's free update() to set the initial keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment writes straight into the mapping.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads come from the mapping; a missing key raises
        # KeyError (not AttributeError), matching the original behavior.
        return self.__getitem__(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise("NotImplemented")`, which is a
        # TypeError in Python 3 (only exceptions can be raised).
        raise NotImplementedError
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    `video` is a dict-like with at least a 'url' key. Fills in title,
    genre, duration, timestamp etc. and triggers the actual download.
    Returns the enriched video dict on success, False on failure.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        # '|' and '/' are unsafe in filenames, so replace them.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # BUG FIX: the original referenced an undefined name `url` here and
        # concatenated str + Path ("-o"+filename). Use video['url'] and
        # str(filename) (Python 3.4 Popen does not accept Path objects).
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        # str() so the Popen argument list contains plain strings.
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            if xmldate_str:
                video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
                video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
101
def download_from_playlist(video):
    """Download the HLS stream in video['url'] into video['filename'].

    Picks the highest-bandwidth variant from the master playlist,
    decrypts AES-CBC segments when an EXT-X-KEY is present, and also
    fetches subtitle segments (cc1= query parameter) and the thumbnail.
    Returns True on success, a falsy value on any error.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    # BUG FIX: these handlers were bare `except:` clauses, which also
    # swallow KeyboardInterrupt/SystemExit; narrowed to Exception while
    # keeping the original print-and-bail behavior.
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with video['filename'].open("wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec the default IV is the segment sequence
                # number.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1
    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])
    return True
d26e6919 190
def parse_playlist(playlist):
    """Parse a master m3u8 playlist into a list of Video records.

    Each record carries the variant's attributes (BANDWIDTH, ...) plus
    its 'url'. Returns False when the text is not an m3u8 playlist.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip everything before the first variant-stream header.
    while 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    variants = []
    # Headers and urls alternate: even lines are metadata, odd are urls.
    for header, stream_url in zip(lines[0::2], lines[1::2]):
        pieces = header.split(':')
        if 'EXT-X-STREAM-INF' not in pieces[0]:
            continue
        record = Video()
        for attribute in pieces[1].split(','):
            if '=' in attribute:
                record.update([attribute.split('=')])
        record['url'] = stream_url
        variants.append(record)
    return variants
209
2d8521d8
MF
def parse_segment_playlist(playlisturl):
    """Fetch a variant m3u8 playlist and return (segment_urls, metadata).

    metadata holds the parsed EXT-X-KEY attributes when the stream is
    encrypted, otherwise it is empty.
    """
    text = requests.get(playlisturl).text
    assert text.startswith("#EXTM3U")
    # Split on commas but keep quoted strings intact.
    splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    urls = []
    meta = {}
    expect_url = False
    for line in text.splitlines():
        if expect_url:
            if not line.startswith('http'):
                # Relative segment path: resolve against the playlist url.
                line = "{}/{}".format(os.path.dirname(playlisturl), line)
            urls.append(line)
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attrs = line.split(':', 1)[1]  # drop the tag name
            pieces = splitter.split(attrs)[1:-1]  # comma-split, quotes preserved
            meta["EXT-X-KEY"] = dict([piece.split('=', 1) for piece in pieces if '=' in piece])
    return (urls, meta)
231
def parse_videolist():
    """Yield a Video record for every item in svtplay's paged ajax index."""
    # This call does not work for getting the pages themselves; it is
    # only used to read the page total.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        listing = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videos?sida={}".format(page)).text)
        for article in listing.findAll('article'):
            attrs = dict(article.attrs)
            video = Video()
            video['title'] = attrs['data-title']
            video['description'] = attrs['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = counter
            # NOTE(review): assumes every page is full; the last page may
            # hold fewer than videos_per_page items.
            video['total'] = last_page * videos_per_page
            counter += 1
            yield video
253
def remux(video, xml=None):
    """Mux video['filename'] into an mkv and clean up the intermediates.

    Sorts the result into a per-genre directory when a genre is known,
    optionally attaches matroska tag xml and a thumbnail, then removes
    the source/temp files and stamps the mkv with the video timestamp.
    """
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = Path(video['genre'] / video['filename']).with_suffix('.mkv')
    else:
        video['path'] = video['filename'].with_suffix('.mkv')
    command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]

    if xml:
        # Write the tag xml next to the source file and attach it globally.
        xml_path = video['filename'].with_suffix('.xml')
        with xml_path.open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(xml_path)])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # Subtitle muxing is disabled for now (see the FIXMEs around subtitle
    # downloading in download_from_playlist).

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup: any of these may legitimately not exist.
    for leftover in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            leftover.unlink()
        except:
            pass
    if 'timestamp' in video:
        try:
            stamp = video['timestamp'].timestamp()
            os.utime(str(video['path']), times=(stamp, stamp))
        except FileNotFoundError as e:
            print(e)
1e111d91 292
d26e6919
MF
293
def mkv_metadata(video):
    """Render the video's metadata as a matroska tags xml document.

    Only the keys in `wanted` are emitted, each as a Simple/Name/String
    triple under a single Tag. Returns the document as a string.
    """
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)  # 50 = matroska's default (movie-level) target
    targets.append(ttv)
    tag.append(targets)
    wanted = ('title', 'description', 'url', 'genre')
    for field in video:
        if field not in wanted:
            continue
        simple = root.new_tag('Simple')
        name_tag = root.new_tag('Name')
        name_tag.string = field.upper()
        simple.append(name_tag)
        value_tag = root.new_tag('String')
        value_tag.string = video[field]
        simple.append(value_tag)
        tag.append(simple)
    return str(root)
56181f0a 319
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if args.no_remux:
                continue
            # BUG FIX: the original called self.remux(video); there is no
            # `self` at module level.
            remux(video)
            # print(e.description)
    elif args.mirror:
        # BUG FIX: this was `if args.mirror:` followed by `else:`, so in
        # --rss mode the --url branch below also ran (with args.url None)
        # and crashed. The three modes are mutually exclusive.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Remember failures so they are visible afterwards.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        # --url mode: download a single video.
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # BUG FIX: remux was previously reached even with --no_act,
            # where `video` is undefined (NameError); keep it inside the
            # no_act guard.
            if not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))