[svtplaydump.git] / svtplaydump.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#   
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#   
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup
from subprocess import *
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys, os

def scrape_player_page(url, title):
    """
    Try to scrape the site for video and download.

    url   -- absolute player page URL, or a path that is resolved against
             http://www.svtplay.se.
    title -- title used to build the output file name; when falsy the
             page's og:title meta tag is used instead ('|' and '/' are
             replaced with '_').

    Returns a dict describing the video ('title', 'duration', 'url' and,
    for playlist downloads, 'filename' and possibly 'category'), or False
    when no stream could be found.
    """
    if not url.startswith('http'):
        url = "http://www.svtplay.se" + url
    video = {}
    soup = BeautifulSoup(requests.get(url).text)
    # The player link carries the location of the JSON stream description.
    player_links = soup.body('a',{'data-json-href':True}) if soup.body else []
    if not player_links:
        print("Could not find any streams")
        return False
    video_player = player_links[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in url:
        flashvars = requests.get("http://www.oppetarkiv.se/%s"%json_href+"?output=json").json()
    elif json_href.startswith("/wd"):
        flashvars = requests.get("http://www.svt.se/%s"%json_href).json()
    else:
        flashvars = requests.get("http://www.svtplay.se/%s"%json_href+"?output=json").json()
    video['duration'] = video_player.attrs.get('data-length',0)
    video['title'] = title
    if not title:
        video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('|','_').replace('/','_')
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
        filename = video['title']+".mp4"
        # Hand rtmpdump the extracted stream URL, not the player page URL.
        print(Popen(["rtmpdump","-o"+filename,"-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        # Record the stream so the download is not reported as "no streams".
        video['url'] = rtmp
        filename = video['title']+".flv"
        print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
    if 'video' in flashvars:
        for reference in flashvars['video'].get('videoReferences', []):
            if 'm3u8' in reference['url']:
                video['url']=reference['url']
                video['filename'] = video['title']+'.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        # 'filename' is only set when an m3u8 reference was found; without
        # it there is no playlist to download.
        if 'filename' in video:
            download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

def download_from_playlist(video):
    """
    Download every segment of the highest-bandwidth stream into one file.

    video -- dict with 'url' (the master playlist URL) and 'filename'
             (the output path, opened for binary writing).

    Returns None. Nothing is written when the master playlist cannot be
    parsed or lists no streams. Segments are AES-128-CBC decrypted when
    the segment playlist carries an EXT-X-KEY with a key URI.
    """
    # Local import keeps this block self-contained; urljoin resolves
    # relative references the way a URL (not a file path) should be.
    from urllib.parse import urljoin

    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k.get('BANDWIDTH', 0)))[-1]['url']
    videourl = urljoin(video['url'], videourl)
    segments, metadata = parse_segment_playlist(videourl)

    key = None
    explicit_iv = None
    key_attrs = metadata.get("EXT-X-KEY")
    # METHOD=NONE means the segments are not encrypted and there is no URI.
    if key_attrs and key_attrs.get('METHOD', '').upper() != 'NONE' and 'URI' in key_attrs:
        key_url = urljoin(videourl, key_attrs['URI'].strip('"'))
        # The key is raw binary; decoding it as text would corrupt it.
        key = requests.get(key_url).content
        if 'IV' in key_attrs:
            iv_hex = key_attrs['IV'].strip('"')
            if iv_hex[:2].lower() == '0x':
                iv_hex = iv_hex[2:]
            explicit_iv = bytes.fromhex(iv_hex.zfill(32))
    # Present only if the segment playlist parser recorded it; defaults to 0.
    first_sequence = int(metadata.get("EXT-X-MEDIA-SEQUENCE", 0))

    with open(video['filename'],"wb") as ofile:
        size = 0
        for offset, url in enumerate(segments):
            print("\r{} MB".format(size/1024/1024))
            sys.stdout.flush()
            decryptor = None
            if key is not None:
                if explicit_iv is not None:
                    iv = explicit_iv
                else:
                    # Without an IV attribute the IV is the segment's media
                    # sequence number as a big-endian 128-bit integer.
                    iv = struct.pack(">QQ", 0, first_sequence + offset)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            response = requests.get(url, stream=True)
            try:
                while True:
                    # 4096 is a multiple of the 16-byte AES block size.
                    buf = response.raw.read(4096)
                    if not buf:
                        break
                    if decryptor is not None:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Streamed responses hold their connection until closed.
                response.close()

def parse_playlist(playlist):
    """
    Parse the text of a master (variant) M3U8 playlist.

    playlist -- the playlist contents as a string.

    Returns False (after printing the text) when it does not start with
    "#EXTM3U". Otherwise returns a list with one dict per
    EXT-X-STREAM-INF entry: the tag's attributes as strings (quoted
    values keep their quotes) plus 'url', the URI line that follows the
    tag. The list is empty when the playlist declares no streams.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    # NAME=value pairs; a quoted value may itself contain commas.
    attribute = re.compile(r'''\s*([^=,]+)=("[^"]*"|[^,]*)''')
    items = []
    pending = None  # attributes of a STREAM-INF tag still waiting for its URI
    for line in playlist.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('#EXT-X-STREAM-INF'):
            attributes = line.partition(':')[2]
            pending = {name.strip(): value for name, value in attribute.findall(attributes)}
            continue
        if line.startswith('#'):
            # Any other tag or comment; it does not consume the pending entry.
            continue
        if pending is not None:
            pending['url'] = line
            items.append(pending)
            pending = None
    return items

def parse_segment_playlist(playlisturl):
    """
    Fetch a media (segment) playlist and list its segment URLs.

    playlisturl -- URL of the media playlist; relative segment URIs are
                   resolved against it.

    Returns (segments, metadata): segments is the list of absolute
    segment URLs in playlist order; metadata maps "EXT-X-KEY" to a dict
    of that tag's attributes (quoted values keep their quotes) and
    "EXT-X-MEDIA-SEQUENCE" to its value as a string, each only when the
    tag is present.

    Raises ValueError when the response does not start with "#EXTM3U".
    """
    # Local import keeps this block self-contained; urljoin resolves
    # relative references the way a URL (not a file path) should be.
    from urllib.parse import urljoin

    playlist = requests.get(playlisturl).text
    if not playlist.startswith("#EXTM3U"):
        # Raise rather than assert: asserts are stripped under "python -O".
        raise ValueError("Not an M3U8 playlist: {}".format(playlisturl))
    # Matches one attribute, keeping commas that sit inside quotes.
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url=False
    metadata = {}
    for row in playlist.splitlines():
        row = row.strip()
        if not row:
            continue
        if row.startswith('#'):
            if row.startswith('#EXTINF'):
                next_is_url=True
            elif row.startswith('#EXT-X-KEY'):
                attributes = row.partition(':')[2] #skip the tag name
                pairs = (part.split('=',1) for part in PATTERN.findall(attributes) if '=' in part)
                metadata["EXT-X-KEY"] = {name.strip(): value for name, value in pairs}
            elif row.startswith('#EXT-X-MEDIA-SEQUENCE:'):
                metadata["EXT-X-MEDIA-SEQUENCE"] = row.partition(':')[2].strip()
            # Tags never count as the URI that follows an EXTINF.
            continue
        if next_is_url:
            segments.append(urljoin(playlisturl, row))
            next_is_url=False
    return(segments, metadata)

def parse_videolist():
    """
    Generate one dict per video listed on the svtplay.se video pages.

    Each dict holds 'title', 'description', 'url', 'thumb-url', 'num'
    (a running zero-based index) and 'total' (number of pages times 8,
    i.e. an estimate rather than an exact count).
    """
    # This endpoint does not serve the listing itself; it is only used to
    # read the number of the last page.
    pager_html = requests.get("http://www.svtplay.se/ajax/videospager").text
    pager = BeautifulSoup(pager_html)
    last_page = int(pager.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for current_page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(current_page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            entry = {
                'title': attributes['data-title'],
                'description': attributes['data-description'],
                'url': dict(article.find('a').attrs)['href'],
                'thumb-url': dict(article.find('img',{}).attrs)['src'],
                'num': counter,
                'total': last_page * per_page,
            }
            counter += 1
            yield entry

def remux(video):
    """
    Remux a downloaded file into Matroska (.mkv) with avconv.

    video -- dict whose 'filename' names the file to convert. A falsy
             value or a dict without 'filename' is reported and skipped.

    The streams are copied, not re-encoded. The output name is the input
    name with a trailing ".ts" replaced by ".mkv". The input file is
    deleted only when avconv exits successfully. Returns None.
    """
    filename = video.get('filename') if video else None
    if not filename:
        print("Nothing to remux")
        return
    # Strip only a trailing ".ts"; a ".ts" inside the title must survive.
    basename = filename[:-len('.ts')] if filename.endswith('.ts') else filename
    command = ["avconv","-i",filename,"-vcodec","copy","-acodec","copy", basename+'.mkv']
    try:
        process = Popen(command, stdout=PIPE)
    except OSError as error:
        # Typically avconv is not installed; keep the download.
        print("Could not run avconv: {}".format(error))
        return
    print(process.communicate()[0])
    if process.returncode != 0:
        # Keep the source so a failed conversion does not lose the download.
        print("avconv failed with exit code {}, keeping {}".format(process.returncode, filename))
        return
    try:
        os.unlink(filename)
    except OSError as error:
        # Cleanup is best-effort; the remuxed file already exists.
        print("Could not remove {}: {}".format(filename, error))

# Command line entry point: exactly one of --rss, --url or --mirror selects
# what to download; --no_act only prints, --no_remux keeps the raw download.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        # Imported here so feedparser is only needed for RSS mode.
        import feedparser
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s"%e.title))
            if args.no_act:
                continue
            video = scrape_player_page(e.link, e.title)
            # Only playlist downloads set 'filename'; nothing else can be remuxed.
            if video and 'filename' in video and not args.no_remux:
                remux(video)
    elif args.mirror:
        for entry in parse_videolist():
            entry['title'] = entry['title'].replace('/','_')
            print(entry['title']+'.mkv')
            print("{} of {}".format(entry['num'], entry['total']))
            if os.path.exists(entry['title']+'.mkv'):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            video = scrape_player_page(entry['url'], entry['title'])
            if video and 'filename' in video and not args.no_remux:
                remux(video)
    else:
        if args.no_act:
            print(("Would download {}".format(args.url)))
        else:
            video = scrape_player_page(args.url, None)
            if video:
                if 'filename' in video and not args.no_remux:
                    remux(video)
                print(("Downloaded {}".format(args.url)))
Commit	Line	Data
84f7ef7d	1	#!/usr/bin/env python3
56181f0a	2	# -- coding: utf-8 --
ca2553c7 MF	3	#
	4	# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
	5	#
	6	# This program is free software: you can redistribute it and/or modify
	7	# it under the terms of the GNU General Public License as published by
	8	# the Free Software Foundation, either version 3 of the License, or
	9	# (at your option) any later version.
	10	#
	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
	17	# along with this program. If not, see <http://www.gnu.org/licenses/>
	18	#
	19	# Changelog:
d05b6699	20	# 0.4 added mirror mode.
56181f0a	21	# 0.3 added apple streaming playlist parsing and decryption
ca2553c7 MF	22	# 0.2 added python 2.4 urlparse compatibility
	23	# 0.1 initial release
	24
84f7ef7d	25	from bs4 import BeautifulSoup
ca2553c7	26	from subprocess import *
89a00fa0	27	import re
56181f0a MF	28	from Crypto.Cipher import AES
56181f0a MF	29	import struct
72beea17	30	import argparse
84f7ef7d	31	import requests
d05b6699	32	import sys, os
ca2553c7	33
5b0549b5	34	def scrape_player_page(url, title):
d05b6699 MF	35	"""
	36	Try to scrape the site for video and download.
	37	"""
	38	if not url.startswith('http'):
	39	url = "http://www.svtplay.se" + url
	40	video = {}
84f7ef7d	41	soup = BeautifulSoup(requests.get(url).text)
d05b6699	42	video_player = soup.body('a',{'data-json-href':True})[0]
7370a42e MF	43	if 'oppetarkiv.se' in url:
7370a42e MF	44	flashvars = requests.get("http://www.oppetarkiv.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
d3ebb57d	45	else:
7370a42e MF	46	if video_player.attrs['data-json-href'].startswith("/wd"):
	47	flashvars = requests.get("http://www.svt.se/%s"%video_player.attrs['data-json-href']).json()
	48	else:
	49	flashvars = requests.get("http://www.svtplay.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
84f7ef7d	50	video['duration'] = video_player.attrs.get('data-length',0)
d05b6699 MF	51	video['title'] = title
d05b6699 MF	52	if not title:
84f7ef7d	53	video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('\|','_').replace('/','_')
ca2553c7	54	if 'dynamicStreams' in flashvars:
d05b6699 MF	55	video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
d05b6699 MF	56	filename = video['title']+".mp4"
84f7ef7d	57	print(Popen(["rtmpdump","-o"+filename,"-r", url], stdout=PIPE).communicate()[0])
ca2553c7 MF	58	if 'pathflv' in flashvars:
ca2553c7 MF	59	rtmp = flashvars['pathflv'][0]
d05b6699	60	filename = video['title']+".flv"
84f7ef7d	61	print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
89a00fa0	62	if 'video' in flashvars:
56181f0a	63	for reference in flashvars['video']['videoReferences']:
2d8521d8	64	if 'm3u8' in reference['url']:
d05b6699 MF	65	video['url']=reference['url']
	66	video['filename'] = video['title']+'.ts'
	67	if 'statistics' in flashvars:
	68	video['category'] = flashvars['statistics']['category']
	69	download_from_playlist(video)
2d8521d8	70	if not 'url' in video:
84f7ef7d	71	print("Could not find any streams")
2d8521d8	72	return False
d05b6699 MF	73	return video
	74
	75	def download_from_playlist(video):
84f7ef7d MF	76	playlist = parse_playlist(requests.get(video['url']).text)
	77	if not playlist:
	78	return
56181f0a	79	videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
2d8521d8 MF	80	if not videourl.startswith('http'): #if relative path
	81	videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
	82	segments, metadata = parse_segment_playlist(videourl)
56181f0a	83	if "EXT-X-KEY" in metadata:
84f7ef7d	84	key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
56181f0a MF	85	decrypt=True
	86	else:
	87	decrypt=False
84f7ef7d	88	with open("%s"%video['filename'],"wb") as ofile:
56181f0a	89	segment=0
72beea17	90	size = 0
56181f0a	91	for url in segments:
84f7ef7d MF	92	ufile = requests.get(url, stream=True).raw
84f7ef7d MF	93	print("\r{} MB".format(size/1024/1024))
72beea17	94	sys.stdout.flush()
56181f0a MF	95	if decrypt:
	96	iv=struct.pack("IIII",segment,0,0,0)
	97	decryptor = AES.new(key, AES.MODE_CBC, iv)
	98	while(True):
84f7ef7d MF	99	buf = ufile.read(4096)
84f7ef7d MF	100	if not buf:
56181f0a	101	break
84f7ef7d MF	102	if decrypt:
	103	buf = decryptor.decrypt(buf)
	104	ofile.write(buf)
	105	size += len(buf)
56181f0a MF	106	segment += 1
	107
	108	def parse_playlist(playlist):
d05b6699	109	if not playlist.startswith("#EXTM3U"):
84f7ef7d	110	print(playlist)
d05b6699	111	return False
2d8521d8 MF	112	playlist = playlist.splitlines()
	113	while not 'EXT-X-STREAM-INF' in playlist[0]:
	114	playlist = playlist[1:]
56181f0a MF	115	items=[]
	116	for (metadata_string,url) in zip(playlist[0::2], playlist[1::2]):
	117	md = dict()
2d8521d8 MF	118	if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
2d8521d8 MF	119	continue
56181f0a MF	120	for item in metadata_string.split(':')[1].split(','):
	121	if '=' in item:
	122	md.update([item.split('='),])
	123	md['url']=url
	124	items.append(md)
	125	return items
	126
2d8521d8 MF	127	def parse_segment_playlist(playlisturl):
2d8521d8 MF	128	playlist = requests.get(playlisturl).text
56181f0a MF	129	assert playlist.startswith("#EXTM3U")
	130	PATTERN = re.compile(r'''((?:[^,"']\|"[^"]"\|'[^']')+)''')
	131	segments = []
	132	next_is_url=False
	133	metadata = {}
	134	for row in playlist.splitlines():
	135	if next_is_url:
2d8521d8 MF	136	if not row.startswith('http'): #if relative path
2d8521d8 MF	137	row = "{}/{}".format(os.path.dirname(playlisturl), row)
56181f0a MF	138	segments.append(row)
	139	next_is_url=False
	140	continue
	141	if 'EXTINF' in row:
	142	next_is_url=True
	143	if "EXT-X-KEY" in row:
	144	row = row.split(':',1)[1] #skip first part
d05b6699	145	parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
56181f0a	146	metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
84f7ef7d MF	147	return(segments, metadata)
84f7ef7d MF	148
d05b6699	149	def parse_videolist():
5b0549b5	150	page_num = 1
84f7ef7d MF	151	soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
84f7ef7d MF	152	page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
5b0549b5 MF	153	videos_per_page = 8
	154	video_num = 0
	155	while(page_num <= page_tot):
	156	base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
84f7ef7d	157	soup = BeautifulSoup(requests.get(base_url).text)
5b0549b5 MF	158	for article in soup.findAll('article'):
	159	meta = dict(article.attrs)
	160	video = {}
	161	video['title'] = meta['data-title']
	162	video['description'] = meta['data-description']
	163	video['url'] = dict(article.find('a').attrs)['href']
	164	video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
	165	video['num'] = video_num
	166	video['total'] = page_tot * videos_per_page
	167	video_num += 1
	168	yield video
	169	page_num += 1
	170
2d8521d8 MF	171	def remux(video):
	172	basename = video['filename'].split('.ts')[0]
	173	print(Popen(["avconv","-i",video['filename'],"-vcodec","copy","-acodec","copy", basename+'.mkv'], stdout=PIPE).communicate()[0])
	174	try:
	175	os.unlink(video['filename'])
	176	except:
	177	pass
56181f0a	178
ca2553c7	179	if __name__ == "__main__":
72beea17	180	parser = argparse.ArgumentParser()
1ad04c01 MF	181	group = parser.add_mutually_exclusive_group(required=True)
	182	group.add_argument("-r", "--rss", help="Download all files in rss")
	183	group.add_argument("-u", "--url", help="Download video in url")
	184	group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
5b0549b5	185	parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
2d8521d8 MF	186	parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
2d8521d8 MF	187
72beea17	188	args = parser.parse_args()
d05b6699	189	if args.rss:
84f7ef7d	190	import feedparser
5b0549b5	191	d = feedparser.parse(args.rss)
72beea17	192	for e in d.entries:
84f7ef7d	193	print(("Downloading: %s"%e.title))
5b0549b5 MF	194	if args.no_act:
	195	continue
	196	filename = scrape_player_page(e.link, e.title)
2d8521d8 MF	197	if args.no_remux:
	198	continue
	199	self.remux({'title':e.title})
72beea17	200	#print(e.description)
d05b6699 MF	201	if args.mirror:
	202	for video in parse_videolist():
	203	video['title'] = video['title'].replace('/','_')
84f7ef7d MF	204	print(video['title']+'.mkv')
84f7ef7d MF	205	print("{} of {}".format(video['num'], video['total']))
d05b6699	206	if os.path.exists(video['title']+'.mkv'):
84f7ef7d	207	print("Skipping")
d05b6699 MF	208	continue
d05b6699 MF	209	print("Downloading...")
5b0549b5 MF	210	if args.no_act:
5b0549b5 MF	211	continue
2d8521d8 MF	212	video = scrape_player_page(video['url'], video['title'])
	213	if args.no_remux:
	214	continue
	215	remux(video)
72beea17	216	else:
5b0549b5 MF	217	if not args.no_act:
5b0549b5 MF	218	video = scrape_player_page(args.url, None)
2d8521d8 MF	219	if not args.no_remux:
2d8521d8 MF	220	remux({'title':e.title})
84f7ef7d	221	print(("Downloaded {}".format(args.url)))