]> git.frykholm.com Git - svtplaydump.git/blame - svtplaydump.py
Force ipv4
[svtplaydump.git] / svtplaydump.py
CommitLineData
fa7d6ee2 1#!/usr/bin/env python3.4
56181f0a 2# -*- coding: utf-8 -*-
ca2553c7
MF
3#
4# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program. If not, see <http://www.gnu.org/licenses/>
18#
19# Changelog:
d05b6699 20# 0.4 added mirror mode.
56181f0a 21# 0.3 added apple streaming playlist parsing and decryption
ca2553c7
MF
22# 0.2 added python 2.4 urlparse compatibility
23# 0.1 initial release
24
d26e6919 25from bs4 import BeautifulSoup, Doctype
ca2553c7 26from subprocess import *
89a00fa0 27import re
56181f0a
MF
28from Crypto.Cipher import AES
29import struct
72beea17 30import argparse
84f7ef7d 31import requests
d05b6699 32import sys, os
1e13b6eb
MF
33import feedparser
34from datetime import datetime, timezone
fa7d6ee2
MF
35from pathlib import Path
36
3d7ac34a 37
d26e6919
MF
class Video(dict):
    """A dict subclass whose keys are also readable/writable as attributes.

    Used throughout the script as a lightweight record for per-video
    metadata (title, url, filename, ...).
    """

    def __init__(self, *args, **kwargs):
        # Use the free dict.update to set the initial keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment writes straight into the mapping.
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute reads come from the mapping (raises KeyError when missing,
        # matching the subscript behaviour callers rely on).
        return self.__getitem__(name)

    def is_downloaded(self):
        # BUG FIX: the original did `raise ("NotImplemented")`, which raises
        # TypeError ("exceptions must derive from BaseException") instead of
        # signalling "not implemented".
        raise NotImplementedError
50
d26e6919
MF
51
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    Fills in title/genre/duration metadata from the player page, resolves the
    flashvars JSON, downloads the stream (rtmp, flv or HLS), and returns the
    enriched video dict — or False when no stream could be fetched.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    # The JSON endpoint differs between oppetarkiv and svtplay/svt.
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get(
            "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get(
                "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = Path(video['title']).with_suffix(".mp4")
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = Path(video['title']).with_suffix(".flv")
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video and soup.find_all(datetime=True):
        xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
        if xmldate_str:
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = Path(video['title']).with_suffix('.ts')
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        # BUG FIX: the original called download_from_playlist() unconditionally
        # and only afterwards checked `'url' not in video` — which can never be
        # true since 'url' is set at the top of this function. When no m3u8
        # reference exists, video['filename'] is unset and video['url'] still
        # points at the HTML page, so check for a found stream first.
        if 'filename' not in video:
            print("Could not find any streams")
            return False
        if not download_from_playlist(video):
            return False
    return video
3d7ac34a 104
def download_from_playlist(video):
    """Download the HLS stream at video['url'] into video['filename'].

    Picks the highest-bandwidth variant from the master playlist, optionally
    AES-128-CBC-decrypts the segments, collects WebVTT subtitle chunks into
    video['subs'], and fetches the thumbnail when 'thumb-url' is present.
    Returns True on success; False (or None when no playlist was found) on
    failure.
    """
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [
            dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            # BUG FIX: the key must be the raw bytes; `.text` decodes it as
            # (mojibake) unicode and AES.new() requires a bytes key.
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with video['filename'].open("wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): RFC 8216 derives the implicit IV as the
                # big-endian media sequence number; this packs native-endian
                # with the counter in the low word — confirm against streams.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC,
                                        iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
d26e6919 195
3d7ac34a 196
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of Video records.

    Each item carries the EXT-X-STREAM-INF attributes (e.g. BANDWIDTH) plus
    the variant's 'url'. Returns False when the text is not an M3U playlist.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    # BUG FIX: the original looped on `playlist[0]` until it found an
    # EXT-X-STREAM-INF tag and raised IndexError on playlists without one;
    # guard against running off the end (an empty result is returned instead).
    while playlist and 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    # Variants come as (attribute-line, url-line) pairs.
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
215
56181f0a 216
2d8521d8
MF
def parse_segment_playlist(playlisturl):
    """Fetch a variant playlist and return (segment_urls, metadata).

    metadata carries the parsed EXT-X-KEY attributes when the stream is
    encrypted; relative segment paths are resolved against playlisturl.
    """
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    expect_url = False
    for line in playlist.splitlines():
        if expect_url:
            # The line after EXTINF is the segment URI; resolve relative paths.
            if not line.startswith('http'):
                line = "{}/{}".format(os.path.dirname(playlisturl), line)
            segments.append(line)
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attr_text = line.split(':', 1)[1]  # drop the tag name
            # Comma-split that respects quoted values, keeping the quotes.
            parts = PATTERN.split(attr_text)[1:-1]
            metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
    return segments, metadata
239
84f7ef7d 240
def parse_videolist():
    """Yield a Video record for every article on svtplay's paged video list."""
    # This pager call does not serve the pages themselves; it is only used to
    # read the total page count.
    soup = BeautifulSoup(requests.get(
        "http://www.svtplay.se/ajax/videospager").text)
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    for page_num in range(1, page_tot + 1):
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        page = BeautifulSoup(requests.get(base_url).text)
        for article in page.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
263
3d7ac34a 264
def remux(video, xml=None):
    """Mux the downloaded stream into an .mkv via mkvmerge, attaching optional
    global tags (xml) and thumbnail, then clean up intermediates and restore
    the broadcast timestamp on the result."""
    if 'genre' in video:
        # Sort the result into a per-genre directory.
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        target = Path(video['genre'] / video['filename']).with_suffix('.mkv')
    else:
        target = video['filename'].with_suffix('.mkv')
    video['path'] = target
    command = ["mkvmerge", "-o", str(target), '--title', video['title']]

    if xml:
        sidecar = video['filename'].with_suffix('.xml')
        with sidecar.open('w') as f:
            f.write(xml)
        command.extend(['--global-tags', str(sidecar)])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # TODO: muxing the downloaded .vtt subtitles is not implemented yet.

    command.append(str(video['filename']))
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of the intermediates; missing files are fine.
    for leftover in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
        try:
            leftover.unlink()
        except:
            pass
    if 'timestamp' in video:
        ts = video['timestamp'].timestamp()
        try:
            os.utime(str(video['path']), times=(ts, ts))
        except FileNotFoundError as e:
            print(e)
1e111d91 303
3d7ac34a 304
d26e6919
MF
def mkv_metadata(video):
    """Build a Matroska XML tags document from selected fields of *video*."""
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    # Only these keys are exported as Simple tags.
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for field in video:
        if field not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = field.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[field]
        simple.append(sstring)
        tag.append(simple)
    return str(root)
56181f0a 330
3d7ac34a 331
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
                        action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # BUG FIX: scrape_player_page returns False on failure; the
            # original passed that straight into remux().
            if not video or args.no_remux:
                continue
            remux(video)
        # print(e.description)
    elif args.mirror:
        # BUG FIX: this was `if args.mirror: ... else: ...`, so an --rss run
        # also fell through into the --url branch below with args.url == None
        # and crashed; elif keeps the three modes mutually exclusive.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Record the failure so the mirror run does not retry forever.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # Only remux when the scrape actually produced a video record.
            if video and not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))