#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys
import os
import feedparser
from datetime import datetime, timezone

class Video(dict):
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def is_downloaded(self):
        raise NotImplementedError

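# A minimal usage sketch (names hypothetical): keys double as attributes,
# so v.title and v['title'] refer to the same entry.
#   v = Video(title='Example')
#   v.url = '/video/123'   # same as v['url'] = '/video/123'
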
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text, 'html.parser')
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if 'title' not in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in UTC
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
        video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

def download_from_playlist(video):
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the param string
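        # Derived from the example string above, the resulting dict looks like:
        #   {'name': 'Svenska', 'default': 'yes', 'forced': 'no',
        #    'uri': 'http://media.svt.se/.../index.m3u8', 'lang': 'sv'}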
    try:
        req = requests.get(video['url']).text
    except Exception:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except Exception:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = []  # ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except Exception:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return False
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # the key is raw bytes, not text
        except Exception:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        decrypt = True
    else:
        decrypt = False
    with open(video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec the default IV is the media sequence number as a
                # 16-byte big-endian integer (this assumes the sequence starts at 0).
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
                except ValueError as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
        except Exception:
            print("Error reading thumbnail")  # FIXME mark file as failed
            print(sys.exc_info()[1])

    return True

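# parse_playlist() expects an HLS master playlist. A minimal example of the
# input format (URLs hypothetical):
#
#   #EXTM3U
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000
#   http://example.com/low/index.m3u8
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000
#   http://example.com/high/index.m3u8
#
# Each EXT-X-STREAM-INF/URL pair becomes one Video() item, e.g.
#   {'PROGRAM-ID': '1', 'BANDWIDTH': '1280000', 'url': 'http://example.com/low/index.m3u8'}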
def parse_playlist(playlist):
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items

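# parse_segment_playlist() expects a media playlist for one variant, e.g.
# (URLs hypothetical):
#
#   #EXTM3U
#   #EXT-X-KEY:METHOD=AES-128,URI="https://example.com/crypt.key"
#   #EXTINF:10,
#   segment_00001.ts
#
# It returns the absolute segment URLs plus a metadata dict such as
#   {'EXT-X-KEY': {'METHOD': 'AES-128', 'URI': '"https://example.com/crypt.key"'}}
# (note the URI value keeps its quotes; they are stripped by the caller).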
def parse_segment_playlist(playlisturl):
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip the tag name
            parts = PATTERN.split(row)[1:-1]  # re split that keeps quoted commas intact
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # throw away the commas and make a dict of the pairs
    return segments, metadata

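# parse_videolist() below walks svtplay's paginated ajax listing and yields
# one Video() per <article>; illustratively (values hypothetical):
#   {'title': 'Rapport', 'description': '...', 'url': '/video/1234567',
#    'thumb-url': 'http://www.svtplay.se/.../thumb.jpg', 'num': 0, 'total': 800}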
def parse_videolist():
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text, 'html.parser')  # this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text, 'html.parser')
        for article in soup.find_all('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1

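# remux() shells out to mkvmerge. For a fully populated video dict the
# assembled command looks roughly like (paths illustrative):
#   mkvmerge -o Genre/Title.mkv --title Title --global-tags Title.xml \
#     --attachment-description Thumbnail --attachment-mime-type image/jpeg \
#     --attach-file thumbnail.jpg Title.ts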
def remux(video, xml=None):
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']), 'wb') as f:
    #                 f.write(bytes("".join(sub['download']), 'utf-8'))  # FIXME
    #             command.extend(['--language 0:{} {}.vtt'.format(sub['lang'], sub['lang'])])

    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)

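# mkv_metadata() renders a Matroska global-tags XML document, roughly:
#   <?xml version="1.0" encoding="utf-8"?>
#   <!DOCTYPE Tags SYSTEM "matroskatags.dtd">
#   <Tags><Tag>
#     <Targets><TargetTypeValue>50</TargetTypeValue></Targets>
#     <Simple><Name>TITLE</Name><String>...</String></Simple>
#   </Tag></Tags>
# with one <Simple> element per kept key (title, description, url, genre).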
def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)

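# Example invocations (sketch; URLs hypothetical):
#   ./svtplaydump.py -u http://www.svtplay.se/video/1234567     # one video
#   ./svtplaydump.py -m                                         # mirror everything
#   ./svtplaydump.py -r http://example.com/feed.rss --no_remux  # rss, keep .ts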
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if not video or args.no_remux:  # skip remux when scraping failed
                continue
            remux(video)
            # print(e.description)
    if args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if video and not args.no_remux:  # skip remux when scraping failed
                remux(video)
        print("Downloaded {}".format(args.url))