# Scraper residue removed — this file is svtplaydump.py (git.frykholm.com/svtplaydump.git)
#!/usr/bin/env python3.4
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# Standard library
import argparse
import os
import re
import struct
import sys
from datetime import datetime, timezone
from pathlib import Path
from subprocess import *

# Third party
import feedparser
import requests
from bs4 import BeautifulSoup, Doctype
from Crypto.Cipher import AES
class Video(dict):
    """Dict subclass whose keys can also be read/written as attributes.

    NOTE(review): the ``class`` header line was lost in the scrape, but the
    methods unambiguously implement an attribute-style dict (they call
    self.update / self.__setitem__ / self.__getitem__).
    """

    def __init__(self, *args, **kwargs):
        # Use the free dict.update to set keys from any dict()-style arguments.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute writes become item writes: v.foo = 1  <=>  v['foo'] = 1.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads become item reads.  NOTE: a missing key raises
        # KeyError here rather than AttributeError, which can confuse hasattr().
        return self.__getitem__(name)

    def is_downloaded(self):
        # Fixed: the original did `raise ("NotImplemented")`, which in
        # Python 3 raises TypeError (strings are not exceptions).
        raise NotImplementedError
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    `video` is a dict-like with at least 'url'.  On success the enriched
    dict is returned; on failure a falsy value.

    NOTE(review): else-branches and return glue were lost in the scrape and
    have been reconstructed — verify against upstream.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        # Fixed: "-o" + filename concatenated str + Path (TypeError); subprocess
        # on Python 3.4 also does not accept Path objects as argv entries.
        print(Popen(["rtmpdump", "-o" + str(filename), "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        # str(filename): subprocess argv entries must be strings on py3.4.
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", str(filename), rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        # feedparser's private W3C-DTF parser returns a UTC time tuple.
        video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
        video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    elif 'url' not in video:
        print("Could not find any streams")
        return False
    return video
def download_from_playlist(video):
    """Download video['url'] (an HLS master playlist) into video['filename'].

    Also fetches WebVTT subtitle segments advertised via the 'cc1=' query
    parameter and, when the variant playlist carries an EXT-X-KEY, decrypts
    each AES-CBC segment.  Returns True on success, falsy on error.

    NOTE(review): the try/except scaffolding was lost in the scrape and has
    been reconstructed; the best-effort skip-and-continue error style of the
    original is preserved (narrowed from bare `except:` to `except Exception`).
    """
    params = requests.utils.urlparse(video['url']).query
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [
            dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return False
    # Pick the highest-bandwidth variant.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with video['filename'].open("wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): RFC 8216 specifies the implicit IV as the
                # 128-bit *big-endian* media sequence number; the original's
                # native-endian "IIII" with the counter in the first word is
                # kept as-is — confirm it matches the server's segments.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC,
                                        iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1
    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])
    return True
def parse_playlist(playlist):
    """Parse a master M3U playlist string.

    Returns a list of dicts, one per EXT-X-STREAM-INF record, holding the
    record's attributes (e.g. 'BANDWIDTH') plus the stream's 'url'; returns
    False when the text is not an M3U playlist.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    # Drop header/comment lines until the first stream record.
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    streams = []
    # Records come in (metadata line, url line) pairs.
    for metadata_string, url in zip(playlist[0::2], playlist[1::2]):
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        md = dict()
        for item in metadata_string.split(':')[1].split(','):
            # Guard: attribute-less fragments (e.g. a quoted CODECS tail after
            # the naive comma split) would make dict.update raise ValueError.
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        streams.append(md)
    return streams
def parse_segment_playlist(playlisturl):
    """Fetch an HLS variant playlist and return (segment_urls, metadata).

    `metadata` carries the parsed EXT-X-KEY attribute dict when the playlist
    is encrypted.  NOTE(review): the EXTINF/next-line bookkeeping was lost in
    the scrape and has been reconstructed.
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    # Split on commas, but not on commas inside single or double quotes.
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    next_is_url = False
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            # The line following an EXTINF record is the segment URL.
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip first part
            parts = PATTERN.split(row)[1:-1]  # do magic re split and keep quotes
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if
                                          '=' in part])  # throw away the commas and make dict of the pairs
    return segments, metadata
def parse_videolist():
    """Generator over every video on svtplay's paged AJAX video list.

    Yields Video records with title/description/url/thumb-url/num/total.
    NOTE(review): the page-size constant and the counter initialisations were
    lost in the scrape and have been reconstructed.
    """
    video_num = 0
    soup = BeautifulSoup(requests.get(
        "http://www.svtplay.se/ajax/videospager").text)  # this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8  # presumably 8 articles per AJAX page — TODO confirm
    page_num = 1
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1
def remux(video, xml=None):
    """Mux video['filename'] into an .mkv, then clean up.

    Optionally embeds `xml` as Matroska global tags and video['thumb'] as a
    jpeg attachment, files the result under a genre directory when known,
    deletes the intermediate files, and stamps the mkv's mtime with the
    broadcast timestamp.  NOTE(review): branch scaffolding was lost in the
    scrape and has been reconstructed.
    """
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        # str / Path works via PurePath.__rtruediv__.
        video['path'] = Path(video['genre'] / video['filename']).with_suffix('.mkv')
    else:
        video['path'] = video['filename'].with_suffix('.mkv')
    command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]
    if xml:
        with video['filename'].with_suffix('.xml').open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(video['filename'].with_suffix('.xml'))])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Remove the intermediates; the .xml/.jpg may legitimately not exist.
    for fname in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            fname.unlink()
        except FileNotFoundError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(str(video['path']), times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)
def mkv_metadata(video):
    """Build a Matroska global-tags XML document for `video`.

    Only the keys in `keep` are emitted as Simple/Name/String elements under
    a single Tag.  Returns the document as a string for mkvmerge
    --global-tags.  NOTE(review): the tree-assembly glue (append calls,
    TargetTypeValue content) was lost in the scrape and has been
    reconstructed — 50 is the Matroska "movie"-level target; confirm.
    """
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)
if __name__ == "__main__":
    # CLI: exactly one of --rss / --url / --mirror selects the mode.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
                        action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
    args = parser.parse_args()

    # NOTE(review): dropped lines reconstructed; if/elif/else keeps the three
    # mutually-exclusive modes from falling through into each other.
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if video and not args.no_remux:
                remux(video)
        # print(e.description)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))
            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record the failure so a later run can inspect/retry it.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:  # args.url
        video = scrape_player_page({'url': args.url})
        if not args.no_remux:
            remux(video)
        print(("Downloaded {}".format(args.url)))