#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added Apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys
import os
import socket
import feedparser
from datetime import datetime, timezone

class Video(dict):
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def is_downloaded(self):
        raise NotImplementedError

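# Illustrative usage of Video (not part of the original script): keys double as
# attributes, so the two access styles below are interchangeable.
#
#   v = Video(title='Rapport')
#   v.url = 'http://www.svtplay.se/video/1234567'   # same as v['url'] = ...
#   assert v.title == v['title']
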
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not video.get('title'):
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive, in UTC
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

def download_from_playlist(video):
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # binary AES-128 key
        decrypt = True
    else:
        decrypt = False
    with open(video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # Per the HLS spec the default IV is the media sequence number as a
                # 128-bit big-endian integer (assumed here to start at 0).
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                try:
                    buf = ufile.read(4096)
                except (socket.error, TypeError) as e:
                    print("Error reading, skipping file")
                    print(e)
                    return
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw

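# For reference (illustrative, not fetched from SVT): parse_playlist() expects an
# HLS master playlist roughly like the one below, and returns one dict per
# EXT-X-STREAM-INF/URL pair, keyed by the attributes (BANDWIDTH, RESOLUTION, ...):
#
#   #EXTM3U
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1000000,RESOLUTION=640x360
#   variant_1000k.m3u8
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2500000,RESOLUTION=1280x720
#   variant_2500k.m3u8
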
def parse_playlist(playlist):
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items

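# For reference (illustrative): parse_segment_playlist() walks a media playlist of
# roughly this shape, collecting the segment URLs and the EXT-X-KEY attributes used
# for decryption in download_from_playlist():
#
#   #EXTM3U
#   #EXT-X-KEY:METHOD=AES-128,URI="https://example.org/crypt.key"
#   #EXTINF:10,
#   segment_0.ts
#   #EXTINF:10,
#   segment_1.ts
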
def parse_segment_playlist(playlisturl):
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip the tag name
            parts = PATTERN.split(row)[1:-1]  # split on commas but keep quoted strings intact
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # drop the commas, keep the KEY=VALUE pairs
    return segments, metadata

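# For reference (illustrative; the exact SVT markup may differ): parse_videolist()
# expects each <article> on the ajax listing page to carry attributes like the ones
# read below, e.g.
#
#   <article data-title="..." data-description="...">
#     <a href="/video/...">...</a>
#     <img src="http://.../thumbnail.jpg"/>
#   </article>
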
def parse_videolist():
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)  # this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.find_all('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1

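# For reference (illustrative): with a thumbnail and an XML tags file present,
# remux() ends up running an mkvmerge command along these lines:
#
#   mkvmerge -o Genre/Title.mkv --title Title --global-tags Title.xml \
#     --attachment-description Thumbnail --attachment-mime-type image/jpeg \
#     --attach-file thumbnail.jpg Title.ts
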
def remux(video, xml=None):
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)

def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)

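# For reference (illustrative): mkv_metadata() produces a Matroska tags document
# roughly like the following, which remux() feeds to mkvmerge via --global-tags:
#
#   <?xml version="1.0" encoding="utf-8"?>
#   <!DOCTYPE Tags SYSTEM "matroskatags.dtd">
#   <Tags>
#     <Tag>
#       <Targets><TargetTypeValue>50</TargetTypeValue></Targets>
#       <Simple><Name>TITLE</Name><String>Rapport</String></Simple>
#       ...
#     </Tag>
#   </Tags>
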
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if args.no_remux:
                continue
            remux(video)
            # print(e.description)
    elif args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    elif args.url:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if not args.no_remux:
                remux(video)
        print("Downloaded {}".format(args.url))
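
# Example invocations (illustrative; the script name and feed URL are assumptions):
#
#   ./svtplaydump.py -u http://www.svtplay.se/video/1234567    # download a single video
#   ./svtplaydump.py -m --no_remux                             # mirror everything, keep the raw .ts files
#   ./svtplaydump.py -r http://www.svtplay.se/rss.xml -n       # dry-run an RSS feed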