[svtplaydump.git] / svtplaydump.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#   
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#   
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from BeautifulSoup import BeautifulSoup
from subprocess import *
import re
import json
from Crypto.Cipher import AES
import struct
import argparse
import feedparser 
try:
    import urlparse
except ImportError:
    pass
import urllib2
try:
    import urllib2.urlparse as urlparse
except ImportError:
    pass
import sys, os

def main(url, title):
    """
    Try to scrape the site for video and download.

    url   -- address of the video page; anything not starting with 'http'
             is treated as a path on http://www.svtplay.se
    title -- base name for the output file; when empty, the og:title of
             the page is used with '|' and '/' replaced by '_'

    Returns a dict with 'title' and 'duration' plus, for whatever was
    downloaded, 'url', 'filename' and (when the JSON carries statistics)
    'category'. Returns None when no downloadable stream was found.
    """
    if not url.startswith('http'):
        url = "http://www.svtplay.se" + url
    video = {}
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
    video_player = soup.body('a',{'data-json-href':True})[0]
    json_href = video_player.attrMap['data-json-href']
    if json_href.startswith("/wd"):
        flashvars = json.loads(urllib2.urlopen("http://www.svt.se/%s"%json_href).read())
    else:
        flashvars = json.loads(urllib2.urlopen("http://www.svtplay.se/%s"%json_href+"?output=json").read())
    video['duration'] = video_player.attrMap.get('data-length',0)
    video['title'] = title
    if not title:
        video['title'] = soup.find('meta',{'property':'og:title'}).attrMap['content'].replace('|','_').replace('/','_')

    # Every stream type present is downloaded; this flag remembers whether
    # at least one of them produced a file, so the "not found" message is
    # only printed when nothing at all was available.
    downloaded = False
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
        video['filename'] = video['title']+".mp4"
        # rtmpdump must be pointed at the stream address extracted above,
        # not at the address of the web page.
        print(Popen(["rtmpdump",u"-o"+video['filename'],"-r", video['url']], stdout=PIPE).communicate()[0])
        downloaded = True
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        video['filename'] = video['title']+".flv"
        print(Popen(["mplayer","-dumpstream","-dumpfile",video['filename'], rtmp], stdout=PIPE).communicate()[0])
        downloaded = True
    if 'video' in flashvars:
        has_playlist = False
        for reference in flashvars['video']['videoReferences']:
            if reference['url'].endswith("m3u8"):
                video['url']=reference['url']
                video['filename'] = video['title']+'.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
                has_playlist = True
        # Only start the segment download when an m3u8 reference really
        # exists; otherwise video['url'] is missing or belongs to another
        # stream type.
        if has_playlist:
            download_from_playlist(video)
            downloaded = True
    if not downloaded:
        print("Could not find any streams")
        return
    return video

def download_from_playlist(video):
    """
    Download all segments of the highest-bandwidth variant of an HLS
    stream and concatenate them into one file.

    video -- dict with 'url' (address of the master playlist) and
             'filename' (path of the output file)

    When the segment playlist carries an EXT-X-KEY tag with a method
    other than NONE, the key is fetched from its URI and every segment is
    decrypted with AES in CBC mode.

    Raises ValueError when the master playlist yields no variant streams.
    """
    playlist = parse_playlist(urllib2.urlopen(video['url']).read())
    if not playlist:
        # parse_playlist returns False for non-M3U data and [] when no
        # variant is listed; both would otherwise fail below with an
        # unhelpful TypeError/IndexError.
        raise ValueError("No variant streams found in playlist %s" % video['url'])
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    segments, metadata = parse_segment_playlist(urllib2.urlopen(videourl).read())

    key_info = metadata.get("EXT-X-KEY")
    # METHOD=NONE means the segments are not encrypted (and no URI is given).
    decrypt = key_info is not None and key_info.get('METHOD') != 'NONE'
    explicit_iv = None
    if decrypt:
        key = urllib2.urlopen(key_info['URI'].strip('"')).read()
        if 'IV' in key_info:
            # An IV attribute is a 128-bit hexadecimal number ("0x...") and
            # takes precedence over the sequence-number derived IV.
            import binascii
            hex_iv = key_info['IV'].strip('"')
            if hex_iv[:2].lower() == '0x':
                hex_iv = hex_iv[2:]
            explicit_iv = binascii.unhexlify(hex_iv.zfill(32))

    # Segments are binary data, so the file must be opened in binary mode.
    with open("%s"%video['filename'],"wb") as ofile:
        size = 0
        for segment, url in enumerate(segments):
            ufile = urllib2.urlopen(url)
            try:
                sys.stdout.write("\r{} MB".format(size//1024//1024))
                sys.stdout.flush()
                if decrypt:
                    if explicit_iv is not None:
                        iv = explicit_iv
                    else:
                        # Without an IV attribute the IV is the media
                        # sequence number as a big-endian 128-bit integer,
                        # zero padded on the left.
                        # NOTE(review): numbering starts at 0 because
                        # parse_segment_playlist does not report
                        # EXT-X-MEDIA-SEQUENCE - confirm for live streams.
                        iv = struct.pack(">IIII",0,0,0,segment)
                    decryptor = AES.new(key, AES.MODE_CBC, iv)
                while True:
                    # 1024 is a multiple of the 16 byte AES block size.
                    buf = ufile.read(1024)
                    if not buf:
                        break
                    if decrypt:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection even when reading or decrypting fails.
                ufile.close()

def parse_playlist(playlist):
    """
    Parse an HLS master playlist into a list of variant streams.

    playlist -- the text of the playlist

    Returns a list with one dict per EXT-X-STREAM-INF tag. Each dict maps
    the attribute names of the tag (e.g. 'BANDWIDTH') to their values as
    strings, quoted values keeping their quotes, plus 'url' for the
    address on the line following the tag.

    When the text does not start with "#EXTM3U" it is printed and False
    is returned.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    # One attribute is a run of characters that are not commas, where
    # quoted strings count as a unit - so commas inside quotes (as in
    # CODECS="avc1.4d401f,mp4a.40.2") do not split the value.
    attribute = re.compile(r'''(?:[^,"']|"[^"]*"|'[^']*')+''')
    items = []
    md = None  # attributes of a stream tag still waiting for its url line
    for line in playlist.splitlines()[1:]:
        line = line.strip()
        if not line:
            continue
        if line.startswith('#EXT-X-STREAM-INF'):
            md = {}
            attributes = line.split(':', 1)[1] if ':' in line else ''
            for item in attribute.findall(attributes):
                if '=' in item:
                    # Split on the first '=' only; values may contain more.
                    name, value = item.strip().split('=', 1)
                    md[name] = value
        elif line.startswith('#'):
            # Other tags and comments carry no variant information.
            continue
        elif md is not None:
            md['url'] = line
            items.append(md)
            md = None
    return items

def parse_segment_playlist(playlist):
    """
    Parse an HLS media (segment) playlist.

    playlist -- the text of the playlist; must start with "#EXTM3U"

    Returns a tuple (segments, metadata): segments is the list of lines
    that directly follow an EXTINF line, metadata is a dict that holds,
    under "EXT-X-KEY", the attributes of the last EXT-X-KEY line as a
    dict of strings (quoted values keep their quotes).
    """
    assert playlist.startswith("#EXTM3U")
    # Matches one attribute: any run without commas, quoted strings
    # (which may contain commas) being consumed as a whole.
    attribute_re = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segment_urls = []
    tags = {}
    expecting_url = False
    for line in playlist.splitlines():
        if expecting_url:
            # The line after EXTINF is taken verbatim as the segment address.
            segment_urls.append(line)
            expecting_url = False
        else:
            expecting_url = 'EXTINF' in line
            if "EXT-X-KEY" in line:
                # Everything after the first ':' is the attribute list.
                attribute_list = line.split(':', 1)[1]
                pairs = (
                    attribute.split('=', 1)
                    for attribute in attribute_re.findall(attribute_list)
                    if '=' in attribute
                )
                tags["EXT-X-KEY"] = dict(pairs)
    return (segment_urls, tags)
def parse_videolist():
    """
    Fetch the video listing from svtplay.se and describe every entry.

    Returns a list of dicts, one per <article> element of the page, with
    'title' and 'description' taken from the article's data-title and
    data-description attributes and 'url' from the href of the first
    link inside the article.
    """
    listing_html = urllib2.urlopen("http://www.svtplay.se/ajax/videos?antal=100").read()
    listing = BeautifulSoup(listing_html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    entries = []
    for article in listing.findAll('article'):
        attributes = dict(article.attrs)
        # Values are looked up in the order title, description, link.
        entries.append({
            'title': attributes['data-title'],
            'description': attributes['data-description'],
            'url': dict(article.find('a').attrs)['href'],
        })
    return entries

if __name__ == "__main__":
    # Command line entry point. Exactly one of --rss, --url and --mirror
    # has to be given; the three modes are handled by one if/elif/else
    # chain so that only the selected mode runs.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    args = parser.parse_args()
    if args.rss:
        # The feed address is the value of --rss; --url is never set in
        # this mode because the options are mutually exclusive.
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s"%e.title)
            # '/' cannot be part of a file name, same replacement as in mirror mode.
            video = main(e.link, e.title.replace('/','_'))
            # main() returns a dict (or None), the file name is one of its entries.
            filename = video.get('filename') if video else None
            if not filename:
                continue
            print(Popen(["avconv","-i",filename,"-vcodec","copy","-acodec","copy", filename+'.mkv'], stdout=PIPE).communicate()[0])
    elif args.mirror:
        for video in parse_videolist():
            video['title'] = video['title'].replace('/','_')
            target = video['title']+'.mkv'
            if os.path.exists(target):
                print(target+" Skipping")
                continue
            print(target+" Downloading...")
            ret = main(video['url'], video['title'])
            print(ret)
            if not ret:
                # Nothing was downloaded, so there is nothing to convert.
                continue
            source = ret.get('filename', video['title']+'.ts')
            print(Popen(["avconv","-i",source,"-vcodec","copy","-acodec","copy", target], stdout=PIPE).communicate()[0])
            try:
                os.unlink(source)
            except OSError as err:
                # A left-over intermediate file must not stop the mirroring.
                print("Could not remove %s: %s" % (source, err))
    else:
        video = main(args.url, None)
        if video:
            # %-formatting copes with non-ASCII unicode titles, which
            # str.format on a byte string does not under Python 2.
            print("Downloaded %s" % video['title'])
Commit	Line	Data
ca2553c7	1	#!/usr/bin/env python
56181f0a	2	# -- coding: utf-8 --
ca2553c7 MF	3	#
	4	# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
	5	#
	6	# This program is free software: you can redistribute it and/or modify
	7	# it under the terms of the GNU General Public License as published by
	8	# the Free Software Foundation, either version 3 of the License, or
	9	# (at your option) any later version.
	10	#
	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
	17	# along with this program. If not, see <http://www.gnu.org/licenses/>
	18	#
	19	# Changelog:
d05b6699	20	# 0.4 added mirror mode.
56181f0a	21	# 0.3 added apple streaming playlist parsing and decryption
ca2553c7 MF	22	# 0.2 added python 2.4 urlparse compatibility
	23	# 0.1 initial release
	24
	25	from BeautifulSoup import BeautifulSoup
	26	from subprocess import *
89a00fa0 MF	27	import re
89a00fa0 MF	28	import json
56181f0a MF	29	from Crypto.Cipher import AES
56181f0a MF	30	import struct
72beea17 MF	31	import argparse
72beea17 MF	32	import feedparser
ca2553c7 MF	33	try:
	34	import urlparse
	35	except ImportError:
	36	pass
	37	import urllib2
	38	try:
	39	import urllib2.urlparse as urlparse
	40	except ImportError:
	41	pass
d05b6699	42	import sys, os
ca2553c7	43
d05b6699 MF	44	def main(url, title):
	45	"""
	46	Try to scrape the site for video and download.
	47	"""
	48	if not url.startswith('http'):
	49	url = "http://www.svtplay.se" + url
	50	video = {}
72beea17	51	page = urllib2.urlopen(url).read()
a7502370	52	soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
d05b6699	53	video_player = soup.body('a',{'data-json-href':True})[0]
d3ebb57d MF	54	if video_player.attrMap['data-json-href'].startswith("/wd"):
	55	flashvars = json.loads(urllib2.urlopen("http://www.svt.se/%s"%video_player.attrMap['data-json-href']).read())
	56	else:
	57	flashvars = json.loads(urllib2.urlopen("http://www.svtplay.se/%s"%video_player.attrMap['data-json-href']+"?output=json").read())
	58	video['duration'] = video_player.attrMap.get('data-length',0)
d05b6699 MF	59	video['title'] = title
	60	if not title:
	61	video['title'] = soup.find('meta',{'property':'og:title'}).attrMap['content'].replace('\|','_').replace('/','_')
ca2553c7	62	if 'dynamicStreams' in flashvars:
d05b6699 MF	63	video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
d05b6699 MF	64	filename = video['title']+".mp4"
ca2553c7 MF	65	print Popen(["rtmpdump",u"-o"+filename,"-r", url], stdout=PIPE).communicate()[0]
	66	if 'pathflv' in flashvars:
	67	rtmp = flashvars['pathflv'][0]
d05b6699	68	filename = video['title']+".flv"
ca2553c7	69	print Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0]
89a00fa0	70	if 'video' in flashvars:
56181f0a MF	71	for reference in flashvars['video']['videoReferences']:
56181f0a MF	72	if reference['url'].endswith("m3u8"):
d05b6699 MF	73	video['url']=reference['url']
	74	video['filename'] = video['title']+'.ts'
	75	if 'statistics' in flashvars:
	76	video['category'] = flashvars['statistics']['category']
	77	download_from_playlist(video)
ca2553c7 MF	78	else:
	79	print "Could not find any streams"
	80	return
d05b6699 MF	81	return video
	82
	83	def download_from_playlist(video):
	84	playlist = parse_playlist(urllib2.urlopen(video['url']).read())
56181f0a MF	85	videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
	86	segments, metadata = parse_segment_playlist(urllib2.urlopen(videourl).read())
	87	if "EXT-X-KEY" in metadata:
	88	key = urllib2.urlopen(metadata["EXT-X-KEY"]['URI'].strip('"')).read()
	89	decrypt=True
	90	else:
	91	decrypt=False
d05b6699	92	with open("%s"%video['filename'],"w") as ofile:
56181f0a	93	segment=0
72beea17	94	size = 0
56181f0a	95	for url in segments:
56181f0a	96	ufile = urllib2.urlopen(url)
72beea17 MF	97	print "\r{} MB".format(size/1024/1024),
72beea17 MF	98	sys.stdout.flush()
56181f0a MF	99	if decrypt:
	100	iv=struct.pack("IIII",segment,0,0,0)
	101	decryptor = AES.new(key, AES.MODE_CBC, iv)
	102	while(True):
	103	buf = ufile.read(1024)
	104	if buf:
	105	if decrypt:
	106	buf = decryptor.decrypt(buf)
	107	ofile.write(buf)
72beea17	108	size += len(buf)
56181f0a MF	109	else:
	110	ufile.close()
	111	break
	112	segment += 1
	113
	114	def parse_playlist(playlist):
d05b6699 MF	115	if not playlist.startswith("#EXTM3U"):
	116	print playlist
	117	return False
56181f0a MF	118	playlist = playlist.splitlines()[1:]
	119	items=[]
	120	for (metadata_string,url) in zip(playlist[0::2], playlist[1::2]):
	121	md = dict()
	122	assert 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]
	123	for item in metadata_string.split(':')[1].split(','):
	124	if '=' in item:
	125	md.update([item.split('='),])
	126	md['url']=url
	127	items.append(md)
	128	return items
	129
	130	def parse_segment_playlist(playlist):
	131	assert playlist.startswith("#EXTM3U")
	132	PATTERN = re.compile(r'''((?:[^,"']\|"[^"]"\|'[^']')+)''')
	133	segments = []
	134	next_is_url=False
	135	metadata = {}
	136	for row in playlist.splitlines():
	137	if next_is_url:
	138	segments.append(row)
	139	next_is_url=False
	140	continue
	141	if 'EXTINF' in row:
	142	next_is_url=True
	143	if "EXT-X-KEY" in row:
	144	row = row.split(':',1)[1] #skip first part
d05b6699	145	parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
56181f0a MF	146	metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
56181f0a MF	147	return(segments, metadata)
d05b6699 MF	148	def parse_videolist():
	149	page = urllib2.urlopen("http://www.svtplay.se/ajax/videos?antal=100").read()
	150	soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
	151	videos = []
	152	for article in soup.findAll('article'):
	153	meta = dict(article.attrs)
	154	video = {}
	155	video['title'] = meta['data-title']
	156	video['description'] = meta['data-description']
	157	video['url'] = dict(article.find('a').attrs)['href']
	158	videos.append(video)
	159	return videos
56181f0a	160
ca2553c7	161	if __name__ == "__main__":
72beea17	162	parser = argparse.ArgumentParser()
1ad04c01 MF	163	group = parser.add_mutually_exclusive_group(required=True)
	164	group.add_argument("-r", "--rss", help="Download all files in rss")
	165	group.add_argument("-u", "--url", help="Download video in url")
	166	group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
72beea17	167	args = parser.parse_args()
d05b6699	168	if args.rss:
72beea17 MF	169	d = feedparser.parse(args.url)
	170	for e in d.entries:
	171	print("Downloading: %s"%e.title)
d05b6699	172	filename = main(e.link, e.title)
72beea17 MF	173	print Popen(["avconv","-i",filename,"-vcodec","copy","-acodec","copy", filename+'.mkv'], stdout=PIPE).communicate()[0]
72beea17 MF	174	#print(e.description)
d05b6699 MF	175	if args.mirror:
	176	for video in parse_videolist():
	177	video['title'] = video['title'].replace('/','_')
	178	print video['title']+'.mkv',
	179	if os.path.exists(video['title']+'.mkv'):
	180	print "Skipping"
	181	continue
	182	print("Downloading...")
	183	ret = main(video['url'], video['title'])
d3ebb57d	184	print ret
d05b6699 MF	185	print Popen(["avconv","-i",video['title']+'.ts',"-vcodec","copy","-acodec","copy", video['title']+'.mkv'], stdout=PIPE).communicate()[0]
	186	try:
	187	os.unlink(video['title']+'.ts')
	188	except:
	189	import pdb;pdb.set_trace()
72beea17	190	else:
d05b6699 MF	191	video = main(args.url, None)
d05b6699 MF	192	print("Downloaded {}".format(video['title']))