git.frykholm.com Git - svtplaydump.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/env python3.4
	2	# -- coding: utf-8 --
	3	#
	4	# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
	5	#
	6	# This program is free software: you can redistribute it and/or modify
	7	# it under the terms of the GNU General Public License as published by
	8	# the Free Software Foundation, either version 3 of the License, or
	9	# (at your option) any later version.
	10	#
	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
	17	# along with this program. If not, see <http://www.gnu.org/licenses/>
	18	#
	19	# Changelog:
	20	# 0.4 added mirror mode.
	21	# 0.3 added apple streaming playlist parsing and decryption
	22	# 0.2 added python 2.4 urlparse compatibility
	23	# 0.1 initial release
	24
	25	from bs4 import BeautifulSoup, Doctype
	26	from subprocess import *
	27	import re
	28	from Crypto.Cipher import AES
	29	import struct
	30	import argparse
	31	import requests
	32	import sys, os
	33	import feedparser
	34	from datetime import datetime, timezone
	35	from pathlib import Path
	36
	37
	38	class Video(dict):
	39	def __init__(self, args, *kwargs):
	40	self.update(dict(args, *kwargs)) # use the free update to set keys
	41
	42	def __setattr__(self, name, value):
	43	return self.__setitem__(name, value)
	44
	45	def __getattr__(self, name):
	46	return self.__getitem__(name)
	47
	48	def is_downloaded(self):
	49	raise ("NotImplemented")
	50
	51
	52	def scrape_player_page(video):
	53	"""
	54	Try to scrape the site for video and download.
	55	"""
	56	if not video['url'].startswith('http'):
	57	video['url'] = "http://www.svtplay.se" + video['url']
	58	soup = BeautifulSoup(requests.get(video['url']).text)
	59	video_player = soup.body('a', {'data-json-href': True})[0]
	60	if 'oppetarkiv.se' in video['url']:
	61	flashvars = requests.get(
	62	"http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
	63	else:
	64	if video_player.attrs['data-json-href'].startswith("/wd"):
	65	flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
	66	else:
	67	flashvars = requests.get(
	68	"http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
	69	video['duration'] = video_player.attrs.get('data-length', 0)
	70	if not 'title' in video:
	71	video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('\|', '_').replace('/', '_')
	72	if 'genre' not in video:
	73	if soup.find(text='Kategori:'):
	74	video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
	75	else:
	76	video['genre'] = 'Ingen Genre'
	77	if 'dynamicStreams' in flashvars:
	78	video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
	79	filename = Path(video['title']).with_suffix(".mp4")
	80	print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
	81	if 'pathflv' in flashvars:
	82	rtmp = flashvars['pathflv'][0]
	83	filename = Path(video['title']).with_suffix(".flv")
	84	print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
	85	if not 'timestamp' in video and soup.find_all(datetime=True):
	86	xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
	87	if xmldate_str:
	88	video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6]) # naive in utc
	89	video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None) # convert to local time
	90	if 'video' in flashvars:
	91	for reference in flashvars['video']['videoReferences']:
	92	if 'm3u8' in reference['url']:
	93	video['url'] = reference['url']
	94	video['filename'] = Path(video['title']).with_suffix('.ts')
	95	if 'statistics' in flashvars:
	96	video['category'] = flashvars['statistics']['category']
	97	if not download_from_playlist(video):
	98	return False
	99	if 'url' not in video:
	100	print("Could not find any streams")
	101	return False
	102	return video
	103
	104
	105	def download_from_playlist(video):
	106	params = requests.utils.urlparse(video['url']).query
	107	print(params)
	108	if 'cc1=' in params: # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
	109	video['subs'] = [
	110	dict([k.split('=') for k in params.split('cc1=')[1].split('~')])] # make a dict from the paramstring
	111	try:
	112	req = requests.get(video['url']).text
	113	except:
	114	print("Error reading, skipping file")
	115	print(sys.exc_info()[1])
	116	return False
	117	if 'subs' in video:
	118	try:
	119	segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
	120	except:
	121	print("Error reading, skipping subtitle")
	122	print(sys.exc_info()[1])
	123	segments = [] # ugly FIXME
	124	video['subs'][0]['download'] = []
	125	for segment in segments:
	126	if not segment.startswith('http'):
	127	segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
	128	try:
	129	video['subs'][0]['download'].append(requests.get(segment).text)
	130	except:
	131	print("Error reading, skipping subtitle")
	132	print(sys.exc_info()[1])
	133	break
	134	playlist = parse_playlist(req)
	135	if not playlist:
	136	return
	137	videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
	138	if not videourl.startswith('http'): # if relative path
	139	videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
	140	segments, metadata = parse_segment_playlist(videourl)
	141	if "EXT-X-KEY" in metadata:
	142	try:
	143	key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
	144	except:
	145	print("Error reading, skipping file")
	146	print(sys.exc_info()[1])
	147	return False
	148	decrypt = True
	149	else:
	150	decrypt = False
	151	with video['filename'].open("wb") as ofile:
	152	segment = 0
	153	size = 0
	154	for url in segments:
	155	try:
	156	ufile = requests.get(url, stream=True).raw
	157	except:
	158	print("Error reading, skipping file")
	159	print(sys.exc_info()[1])
	160	return False
	161	print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
	162	sys.stdout.flush()
	163	if decrypt:
	164	iv = struct.pack("IIII", segment, 0, 0, 0)
	165	try:
	166	decryptor = AES.new(key, AES.MODE_CBC,
	167	iv) # ValueError: AES key must be either 16, 24, or 32 bytes long
	168	except ValueError as e:
	169	print("Error using decryption key. Skipping")
	170	print(e)
	171	return False
	172	while True:
	173	try:
	174	buf = ufile.read(4096)
	175	except:
	176	print("Error reading, skipping file")
	177	print(sys.exc_info()[1])
	178	return False
	179	if not buf:
	180	break
	181	if decrypt:
	182	buf = decryptor.decrypt(buf)
	183	ofile.write(buf)
	184	size += len(buf)
	185	segment += 1
	186
	187	if 'thumb-url' in video:
	188	try:
	189	video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
	190	except:
	191	print("Error reading thumbnail") # FIXME mark file as failed
	192	print(sys.exc_info()[1])
	193
	194	return True
	195
	196
	197	def parse_playlist(playlist):
	198	if not playlist.startswith("#EXTM3U"):
	199	print(playlist)
	200	return False
	201	playlist = playlist.splitlines()
	202	while not 'EXT-X-STREAM-INF' in playlist[0]:
	203	playlist = playlist[1:]
	204	items = []
	205	for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
	206	md = Video()
	207	if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
	208	continue
	209	for item in metadata_string.split(':')[1].split(','):
	210	if '=' in item:
	211	md.update([item.split('='), ])
	212	md['url'] = url
	213	items.append(md)
	214	return items
	215
	216
	217	def parse_segment_playlist(playlisturl):
	218	playlist = requests.get(playlisturl).text
	219	assert playlist.startswith("#EXTM3U")
	220	PATTERN = re.compile(r'''((?:[^,"']\|"[^"]"\|'[^']')+)''')
	221	segments = []
	222	next_is_url = False
	223	metadata = {}
	224	for row in playlist.splitlines():
	225	if next_is_url:
	226	if not row.startswith('http'): # if relative path
	227	row = "{}/{}".format(os.path.dirname(playlisturl), row)
	228	segments.append(row)
	229	next_is_url = False
	230	continue
	231	if 'EXTINF' in row:
	232	next_is_url = True
	233	if "EXT-X-KEY" in row:
	234	row = row.split(':', 1)[1] # skip first part
	235	parts = PATTERN.split(row)[1:-1] # do magic re split and keep quotes
	236	metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if
	237	'=' in part]) # throw away the commas and make dict of the pairs
	238	return segments, metadata
	239
	240
	241	def parse_videolist():
	242	page_num = 1
	243	soup = BeautifulSoup(requests.get(
	244	"http://www.svtplay.se/ajax/videospager").text) # this call does not work for getting the pages, we use it for the page totals only
	245	page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
	246	videos_per_page = 8
	247	video_num = 0
	248	while page_num <= page_tot:
	249	base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
	250	soup = BeautifulSoup(requests.get(base_url).text)
	251	for article in soup.findAll('article'):
	252	meta = dict(article.attrs)
	253	video = Video()
	254	video['title'] = meta['data-title']
	255	video['description'] = meta['data-description']
	256	video['url'] = dict(article.find('a').attrs)['href']
	257	video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
	258	video['num'] = video_num
	259	video['total'] = page_tot * videos_per_page
	260	video_num += 1
	261	yield video
	262	page_num += 1
	263
	264
	265	def remux(video, xml=None):
	266	if 'genre' in video:
	267	if not os.path.exists(video['genre']):
	268	os.mkdir(video['genre'])
	269	video['path'] = Path(video['genre'] / video['filename']).with_suffix('.mkv')
	270	else:
	271	video['path'] = video['filename'].with_suffix('.mkv')
	272	command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]
	273
	274	if xml:
	275	with video['filename'].with_suffix('.xml').open('w') as f:
	276	f.write(xml)
	277	command.extend(['--global-tags', str(video['filename'].with_suffix('.xml'))])
	278	if 'thumb' in video:
	279	with open('thumbnail.jpg', 'wb') as f: # FIXME use title instead for many downloaders
	280	f.write(video['thumb'].read())
	281	command.extend(['--attachment-description', "Thumbnail",
	282	'--attachment-mime-type', 'image/jpeg',
	283	'--attach-file', 'thumbnail.jpg'])
	284	# if 'subs' in video:
	285	# for sub in video['subs']:
	286	# if 'download' in sub:
	287	# with open("{}.vtt".format(sub['lang']),'wb') as f:
	288	# f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
	289	# command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
	290
	291	command.append(str(video['filename']))
	292	print(Popen(command, stdout=PIPE).communicate()[0])
	293	for fname in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
	294	try:
	295	fname.unlink()
	296	except:
	297	pass
	298	if 'timestamp' in video:
	299	try:
	300	os.utime(str(video['path']), times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
	301	except FileNotFoundError as e:
	302	print(e)
	303
	304
	305	def mkv_metadata(video):
	306	root = BeautifulSoup(features='xml')
	307	root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
	308	tags = root.new_tag("Tags")
	309	tag = root.new_tag("Tag")
	310	tags.append(tag)
	311	root.append(tags)
	312	keep = ('title', 'description', 'url', 'genre')
	313	targets = root.new_tag("Targets")
	314	ttv = root.new_tag("TargetTypeValue")
	315	ttv.string = str(50)
	316	targets.append(ttv)
	317	tag.append(targets)
	318	for key in video:
	319	if not key in keep:
	320	continue
	321	simple = root.new_tag('Simple')
	322	name = root.new_tag('Name')
	323	name.string = key.upper()
	324	simple.append(name)
	325	sstring = root.new_tag('String')
	326	sstring.string = video[key]
	327	simple.append(sstring)
	328	tag.append(simple)
	329	return str(root)
	330
	331
	332	if __name__ == "__main__":
	333	parser = argparse.ArgumentParser()
	334	group = parser.add_mutually_exclusive_group(required=True)
	335	group.add_argument("-r", "--rss", help="Download all files in rss")
	336	group.add_argument("-u", "--url", help="Download video in url")
	337	group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
	338	parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
	339	action="store_true")
	340	parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
	341
	342	args = parser.parse_args()
	343	if args.rss:
	344	d = feedparser.parse(args.rss)
	345	for e in d.entries:
	346	print(("Downloading: %s" % e.title))
	347	if args.no_act:
	348	continue
	349	video = scrape_player_page({'title': e.title, 'url': e.link})
	350	if args.no_remux:
	351	continue
	352	remux(video)
	353	# print(e.description)
	354	if args.mirror:
	355	if not os.path.exists('.seen'):
	356	os.mkdir('.seen')
	357	for video in parse_videolist():
	358	video['title'] = video['title'].replace('/', '_')
	359	print(video['title'] + '.mkv')
	360	print("{} of {}".format(video['num'], video['total']))
	361
	362	if os.path.exists(os.path.join('.seen', video['title'])):
	363	print("Skipping")
	364	continue
	365	print("Downloading...")
	366	if args.no_act:
	367	continue
	368	open(os.path.join('.seen', video['title']), 'w').close() # touch
	369	ret = scrape_player_page(video)
	370	if not ret:
	371	if not os.path.exists('.failed'):
	372	os.mkdir('.failed')
	373	open(os.path.join('.failed', video['title']), 'w').close() # touch
	374	continue
	375	video = ret
	376	if args.no_remux:
	377	continue
	378	xml = mkv_metadata(video)
	379	remux(video, xml)
	380
	381	else:
	382	if not args.no_act:
	383	video = scrape_player_page({'url': args.url})
	384	if not args.no_remux:
	385	remux(video)
	386	print(("Downloaded {}".format(args.url)))