#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys, os
import socket
import feedparser
from datetime import datetime, timezone

class Video(dict):
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def is_downloaded(self):
        raise NotImplementedError

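# Illustrative only (not part of the original flow): Video maps attribute
# access onto dict keys, so the two spellings below are interchangeable:
#
#     v = Video(title='Rapport')
#     v.genre = 'Nyheter'      # same as v['genre'] = 'Nyheter'
#     assert v.genre == v['genre']
#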
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    Returns the enriched video dict on success, False otherwise.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not video['title']:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

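# download_from_playlist fetches the HLS master playlist behind video['url'],
# picks the highest-BANDWIDTH variant, then streams every media segment into
# video['filename'], decrypting on the fly when the playlist advertises an
# EXT-X-KEY. A minimal sketch of the same idea, assuming an unencrypted
# playlist and the hypothetical name master_url:
#
#     master = requests.get(master_url).text
#     variant_url = sorted(parse_playlist(master),
#                          key=lambda k: int(k['BANDWIDTH']))[-1]['url']
#     segments, metadata = parse_segment_playlist(variant_url)
#     with open('out.ts', 'wb') as f:
#         for seg in segments:
#             f.write(requests.get(seg).content)
#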
def download_from_playlist(video):
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # AES key must be bytes, not str
        decrypt = True
    else:
        decrypt = False
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except Exception:
                print("Error reading, skipping file")  # FIXME mark file as failed
                print(sys.exc_info()[1])
                return False
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # per the HLS spec, the default IV is the segment's media
                # sequence number as a 16-byte big-endian integer
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                try:
                    buf = ufile.read(4096)
                except Exception:
                    print("Error reading, skipping file")  # FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
    return True

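# parse_playlist expects an HLS master playlist, i.e. alternating
# EXT-X-STREAM-INF/URL line pairs such as (illustrative example):
#
#     #EXTM3U
#     #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1000000,RESOLUTION=640x360
#     http://example.com/medium.m3u8
#     #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2500000,RESOLUTION=1280x720
#     http://example.com/high.m3u8
#
# It returns one Video per variant, with the attribute pairs (BANDWIDTH, ...)
# as keys plus the variant's 'url'.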
def parse_playlist(playlist):
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items

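# parse_segment_playlist expects the chosen variant's media playlist, e.g.
# (illustrative example):
#
#     #EXTM3U
#     #EXT-X-KEY:METHOD=AES-128,URI="https://example.com/key.bin"
#     #EXTINF:10.0,
#     segment0.ts
#     #EXTINF:10.0,
#     segment1.ts
#
# It returns the absolute segment URLs plus the parsed EXT-X-KEY attributes.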
def parse_segment_playlist(playlisturl):
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip the tag name
            parts = PATTERN.split(row)[1:-1]  # regex split on commas that respects quoted strings
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # drop the comma separators and build a dict of the pairs
    return segments, metadata

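# parse_videolist is a generator; a typical consumer looks like the mirror
# loop in __main__ below:
#
#     for video in parse_videolist():
#         print(video['title'], video['url'])
#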
def parse_videolist():
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)  # this call cannot fetch the pages themselves; we only use it for the page total
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1

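# remux shells out to mkvmerge. For a downloaded "Rapport.ts" with genre
# "Nyheter", global tags and a thumbnail, the assembled command ends up
# roughly as (illustrative):
#
#     mkvmerge -o Nyheter/Rapport.mkv --title Rapport \
#         --global-tags Rapport.xml \
#         --attachment-description Thumbnail \
#         --attachment-mime-type image/jpeg --attach-file thumbnail.jpg \
#         Rapport.ts
#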
def remux(video, xml=None):
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME derive the name from the title so parallel downloaders don't clobber each other
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)


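# mkv_metadata builds a Matroska global-tags XML document for mkvmerge;
# for a video carrying only a title, the output looks roughly like
# (illustrative):
#
#     <?xml version="1.0" encoding="utf-8"?>
#     <!DOCTYPE Tags SYSTEM "matroskatags.dtd">
#     <Tags><Tag>
#       <Targets><TargetTypeValue>50</TargetTypeValue></Targets>
#       <Simple><Name>TITLE</Name><String>Rapport</String></Simple>
#     </Tag></Tags>
#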
def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)

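# Command-line usage (one of -r/-u/-m is required; URLs are illustrative):
#
#     svtplaydump.py -u http://www.svtplay.se/video/12345   # single video
#     svtplaydump.py -r http://example.com/feed.rss         # every rss entry
#     svtplaydump.py -m --no_act                            # dry-run mirror
#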
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if not video or args.no_remux:
                continue
            remux(video)
            # print(e.description)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url, 'title': None})
            if video and not args.no_remux:
                remux(video)
        print("Downloaded {}".format(args.url))