[svtplaydump.git] / svtplaydump.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#   
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#   
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup
from subprocess import *
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys, os

def scrape_player_page(url, title):
    """
    Scrape a player page for its video streams and download them.

    url   -- absolute page URL, or a path that is appended to
             http://www.svtplay.se
    title -- base name for the output file; when empty the page's
             og:title is used, with '|' and '/' replaced by '_'

    Every stream type found in the player JSON is downloaded:
    'dynamicStreams' via rtmpdump (.mp4), 'pathflv' via mplayer (.flv) and
    an m3u8 video reference via download_from_playlist (.ts).

    Returns the dict describing the video ('title', 'duration', and when
    available 'url', 'filename', 'category'), or None when no known stream
    type was found.
    """
    if not url.startswith('http'):
        url = "http://www.svtplay.se" + url
    video = {}
    soup = BeautifulSoup(requests.get(url).text)
    video_player = soup.body('a',{'data-json-href':True})[0]
    json_href = video_player.attrs['data-json-href']
    if 'oppetarkiv.se' in url:
        json_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        json_url = "http://www.svt.se/%s" % json_href
    else:
        json_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(json_url).json()
    video['duration'] = video_player.attrs.get('data-length',0)
    video['title'] = title
    if not title:
        video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('|','_').replace('/','_')

    # Track success across all branches so that a completed RTMP/FLV
    # download is not reported as "no streams" just because the JSON has no
    # 'video' entry.
    found_stream = False
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
        filename = video['title']+".mp4"
        # rtmpdump needs the stream URL extracted above, not the page URL.
        print(Popen(["rtmpdump","-o"+filename,"-r", video['url']], stdout=PIPE).communicate()[0])
        found_stream = True
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title']+".flv"
        print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
        found_stream = True
    if 'video' in flashvars:
        # Keep the last m3u8 reference, as the original loop did.
        playlist_url = None
        for reference in flashvars['video']['videoReferences']:
            if reference['url'].endswith("m3u8"):
                playlist_url = reference['url']
        # Only start a playlist download when an m3u8 reference exists;
        # otherwise video['url'] would be missing or point at a non-playlist.
        if playlist_url is not None:
            video['url'] = playlist_url
            video['filename'] = video['title']+'.ts'
            if 'statistics' in flashvars:
                video['category'] = flashvars['statistics']['category']
            download_from_playlist(video)
            found_stream = True
    if not found_stream:
        print("Could not find any streams")
        return
    return video

def download_from_playlist(video):
    """
    Download the highest-bandwidth variant of an HLS stream into one file.

    video -- dict with 'url' (the master playlist URL) and 'filename'
             (the path the concatenated segments are written to)

    The master playlist is parsed with parse_playlist and the chosen variant
    with parse_segment_playlist. When the variant carries an EXT-X-KEY tag
    with a method other than NONE, every segment is decrypted with AES-CBC
    and its PKCS7 padding is removed before it is written.

    Returns None; nothing is written when the master playlist cannot be
    parsed or lists no variants.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import urljoin

    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    best = sorted(playlist, key=lambda k: int(k.get('BANDWIDTH', 0)))[-1]
    # Playlist entries may be relative; urljoin leaves absolute URLs as is.
    videourl = urljoin(video['url'], best['url'])
    segments, metadata = parse_segment_playlist(requests.get(videourl).text)

    key_info = metadata.get("EXT-X-KEY")
    decrypt = key_info is not None and key_info.get('METHOD', 'AES-128') != 'NONE'
    key = None
    explicit_iv = None
    if decrypt:
        key_url = urljoin(videourl, key_info['URI'].strip('"'))
        # The key is raw binary; .text would decode it and corrupt it.
        key = requests.get(key_url).content
        if 'IV' in key_info:
            # The IV attribute is a hexadecimal 128-bit integer ("0x...").
            explicit_iv = int(key_info['IV'].strip('"'), 16).to_bytes(16, 'big')
    # Without an IV attribute the IV is the segment's media sequence number.
    first_sequence = int(metadata.get("EXT-X-MEDIA-SEQUENCE", 0))

    with open("%s"%video['filename'],"wb") as ofile:
        size = 0
        for sequence, url in enumerate(segments, first_sequence):
            print("\r{:.1f} MB".format(size/1024/1024), end="")
            sys.stdout.flush()
            # Fetch the whole segment: this releases the connection and lets
            # the padding be stripped from the true end of the segment.
            data = requests.get(urljoin(videourl, url)).content
            if decrypt:
                if explicit_iv is not None:
                    iv = explicit_iv
                else:
                    # Big-endian sequence number, left-padded to 16 bytes.
                    iv = sequence.to_bytes(16, 'big')
                data = AES.new(key, AES.MODE_CBC, iv).decrypt(data)
                padding = data[-1] if data else 0
                if 1 <= padding <= 16:
                    data = data[:-padding]
            ofile.write(data)
            size += len(data)
        # Finish the in-place progress line.
        print("\r{:.1f} MB".format(size/1024/1024))

def parse_playlist(playlist):
    """
    Parse an HLS master playlist into a list of variant descriptions.

    playlist -- the playlist text

    Returns a list with one dict per #EXT-X-STREAM-INF entry. Each dict holds
    the tag's attributes as strings (quoted values keep their quotes) plus
    'url', the first non-tag line following the tag. When the text does not
    start with "#EXTM3U" it is printed and False is returned.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    # NAME=value pairs where the value may be a quoted string with commas.
    attribute = re.compile(r'''([^=,\s]+)=("[^"]*"|'[^']*'|[^,]*)''')
    items = []
    pending = None
    for line in playlist.splitlines()[1:]:
        line = line.strip()
        if not line:
            continue
        if line.startswith("#EXT-X-STREAM-INF:"):
            pending = dict(attribute.findall(line.partition(':')[2]))
            continue
        if line.startswith('#'):
            # Other tags and comments carry no variant URL; skipping them
            # keeps tag/URL pairing intact.
            continue
        if pending is not None:
            pending['url'] = line
            items.append(pending)
            pending = None
    return items

def parse_segment_playlist(playlist):
    """
    Parse an HLS media (segment) playlist.

    playlist -- the playlist text

    Returns (segments, metadata). segments is the list of segment URIs in
    playlist order: the first non-tag line after each #EXTINF tag. metadata
    may contain:
      "EXT-X-KEY"            -- dict of the tag's attributes as strings,
                                quoted values keep their quotes; when the tag
                                occurs more than once the last one wins
      "EXT-X-MEDIA-SEQUENCE" -- the sequence number of the first segment (int)

    Raises ValueError when the text does not start with "#EXTM3U".
    """
    if not playlist.startswith("#EXTM3U"):
        raise ValueError("not an M3U8 playlist")
    # One attribute per match; commas inside quoted values do not split.
    attribute_pattern = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    metadata = {}
    next_is_url = False
    for row in playlist.splitlines():
        row = row.strip()
        if not row:
            # A blank line after #EXTINF is not a segment URI.
            continue
        if not row.startswith('#'):
            if next_is_url:
                segments.append(row)
                next_is_url = False
            continue
        if row.startswith("#EXTINF"):
            next_is_url = True
        elif row.startswith("#EXT-X-KEY:"):
            parts = attribute_pattern.findall(row.split(':', 1)[1])
            metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
        elif row.startswith("#EXT-X-MEDIA-SEQUENCE:"):
            metadata["EXT-X-MEDIA-SEQUENCE"] = int(row.split(':', 1)[1].strip())
    return (segments, metadata)

def parse_videolist():
    """
    Yield one dict per video listed on svtplay.se, page by page.

    Each dict has 'title', 'description', 'url', 'thumb-url', 'num' (a
    running index starting at 0) and 'total' (last page number times 8).
    """
    # The pager endpoint does not return the listings themselves; it is only
    # queried for the number of the last page.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)
    last_page = int(pager.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for current_page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(current_page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            entry = {
                'title': attributes['data-title'],
                'description': attributes['data-description'],
                'url': dict(article.find('a').attrs)['href'],
                'thumb-url': dict(article.find('img',{}).attrs)['src'],
                'num': counter,
                'total': last_page * per_page,
            }
            counter += 1
            yield entry


# Command line entry point: exactly one of --rss, --url or --mirror selects
# what to download; --no_act only prints what would be done.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    args = parser.parse_args()
    if args.rss:
        import feedparser
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s"%e.title))
            if args.no_act:
                continue
            # The title becomes a file name, so it must not contain '/'.
            video = scrape_player_page(e.link, e.title.replace('/','_'))
            # scrape_player_page returns a dict (or None), not a file name;
            # only convert when a downloaded file is recorded in it.
            if not video or 'filename' not in video:
                print("Nothing to convert for %s"%e.title)
                continue
            filename = video['filename']
            print(Popen(["avconv","-i",filename,"-vcodec","copy","-acodec","copy", filename+'.mkv'], stdout=PIPE).communicate()[0])
    # elif: the three modes are mutually exclusive, so --rss must not fall
    # through to the single-URL branch below (args.url is None there).
    elif args.mirror:
        for video in parse_videolist():
            video['title'] = video['title'].replace('/','_')
            mkv_name = video['title']+'.mkv'
            ts_name = video['title']+'.ts'
            print(mkv_name)
            print("{} of {}".format(video['num'], video['total']))
            if os.path.exists(mkv_name):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            ret = scrape_player_page(video['url'], video['title'])
            print(ret)
            if not os.path.exists(ts_name):
                # No transport stream was produced, so avconv has no input.
                print("No transport stream was downloaded, skipping conversion")
                continue
            print(Popen(["avconv","-i",ts_name,"-vcodec","copy","-acodec","copy", mkv_name], stdout=PIPE).communicate()[0])
            try:
                os.unlink(ts_name)
            except OSError as err:
                # A leftover .ts file is not fatal; report it and carry on.
                print("Could not remove {}: {}".format(ts_name, err))
    else:
        if args.no_act:
            print(("Would download {}".format(args.url)))
        else:
            video = scrape_player_page(args.url, None)
            if video:
                print(("Downloaded {}".format(args.url)))
            else:
                print(("Failed to download {}".format(args.url)))
Commit	Line	Data
84f7ef7d	1	#!/usr/bin/env python3
56181f0a	2	# -- coding: utf-8 --
ca2553c7 MF	3	#
	4	# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
	5	#
	6	# This program is free software: you can redistribute it and/or modify
	7	# it under the terms of the GNU General Public License as published by
	8	# the Free Software Foundation, either version 3 of the License, or
	9	# (at your option) any later version.
	10	#
	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
	17	# along with this program. If not, see <http://www.gnu.org/licenses/>
	18	#
	19	# Changelog:
d05b6699	20	# 0.4 added mirror mode.
56181f0a	21	# 0.3 added apple streaming playlist parsing and decryption
ca2553c7 MF	22	# 0.2 added python 2.4 urlparse compatibility
	23	# 0.1 initial release
	24
84f7ef7d	25	from bs4 import BeautifulSoup
ca2553c7	26	from subprocess import *
89a00fa0	27	import re
56181f0a MF	28	from Crypto.Cipher import AES
56181f0a MF	29	import struct
72beea17	30	import argparse
84f7ef7d	31	import requests
d05b6699	32	import sys, os
ca2553c7	33
5b0549b5	34	def scrape_player_page(url, title):
d05b6699 MF	35	"""
	36	Try to scrape the site for video and download.
	37	"""
	38	if not url.startswith('http'):
	39	url = "http://www.svtplay.se" + url
	40	video = {}
84f7ef7d	41	soup = BeautifulSoup(requests.get(url).text)
d05b6699	42	video_player = soup.body('a',{'data-json-href':True})[0]
7370a42e MF	43	if 'oppetarkiv.se' in url:
7370a42e MF	44	flashvars = requests.get("http://www.oppetarkiv.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
d3ebb57d	45	else:
7370a42e MF	46	if video_player.attrs['data-json-href'].startswith("/wd"):
	47	flashvars = requests.get("http://www.svt.se/%s"%video_player.attrs['data-json-href']).json()
	48	else:
	49	flashvars = requests.get("http://www.svtplay.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
84f7ef7d	50	video['duration'] = video_player.attrs.get('data-length',0)
d05b6699 MF	51	video['title'] = title
d05b6699 MF	52	if not title:
84f7ef7d	53	video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('\|','_').replace('/','_')
ca2553c7	54	if 'dynamicStreams' in flashvars:
d05b6699 MF	55	video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
d05b6699 MF	56	filename = video['title']+".mp4"
84f7ef7d	57	print(Popen(["rtmpdump","-o"+filename,"-r", url], stdout=PIPE).communicate()[0])
ca2553c7 MF	58	if 'pathflv' in flashvars:
ca2553c7 MF	59	rtmp = flashvars['pathflv'][0]
d05b6699	60	filename = video['title']+".flv"
84f7ef7d	61	print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
89a00fa0	62	if 'video' in flashvars:
56181f0a MF	63	for reference in flashvars['video']['videoReferences']:
56181f0a MF	64	if reference['url'].endswith("m3u8"):
d05b6699 MF	65	video['url']=reference['url']
	66	video['filename'] = video['title']+'.ts'
	67	if 'statistics' in flashvars:
	68	video['category'] = flashvars['statistics']['category']
	69	download_from_playlist(video)
ca2553c7	70	else:
84f7ef7d	71	print("Could not find any streams")
ca2553c7	72	return
d05b6699 MF	73	return video
	74
	75	def download_from_playlist(video):
84f7ef7d MF	76	playlist = parse_playlist(requests.get(video['url']).text)
	77	if not playlist:
	78	return
56181f0a	79	videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
84f7ef7d	80	segments, metadata = parse_segment_playlist(requests.get(videourl).text)
56181f0a	81	if "EXT-X-KEY" in metadata:
84f7ef7d	82	key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
56181f0a MF	83	decrypt=True
	84	else:
	85	decrypt=False
84f7ef7d	86	with open("%s"%video['filename'],"wb") as ofile:
56181f0a	87	segment=0
72beea17	88	size = 0
56181f0a	89	for url in segments:
84f7ef7d MF	90	ufile = requests.get(url, stream=True).raw
84f7ef7d MF	91	print("\r{} MB".format(size/1024/1024))
72beea17	92	sys.stdout.flush()
56181f0a MF	93	if decrypt:
	94	iv=struct.pack("IIII",segment,0,0,0)
	95	decryptor = AES.new(key, AES.MODE_CBC, iv)
	96	while(True):
84f7ef7d MF	97	buf = ufile.read(4096)
84f7ef7d MF	98	if not buf:
56181f0a	99	break
84f7ef7d MF	100	if decrypt:
	101	buf = decryptor.decrypt(buf)
	102	ofile.write(buf)
	103	size += len(buf)
56181f0a MF	104	segment += 1
	105
	106	def parse_playlist(playlist):
d05b6699	107	if not playlist.startswith("#EXTM3U"):
84f7ef7d	108	print(playlist)
d05b6699	109	return False
56181f0a MF	110	playlist = playlist.splitlines()[1:]
	111	items=[]
	112	for (metadata_string,url) in zip(playlist[0::2], playlist[1::2]):
	113	md = dict()
	114	assert 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]
	115	for item in metadata_string.split(':')[1].split(','):
	116	if '=' in item:
	117	md.update([item.split('='),])
	118	md['url']=url
	119	items.append(md)
	120	return items
	121
	122	def parse_segment_playlist(playlist):
	123	assert playlist.startswith("#EXTM3U")
	124	PATTERN = re.compile(r'''((?:[^,"']\|"[^"]"\|'[^']')+)''')
	125	segments = []
	126	next_is_url=False
	127	metadata = {}
	128	for row in playlist.splitlines():
	129	if next_is_url:
	130	segments.append(row)
	131	next_is_url=False
	132	continue
	133	if 'EXTINF' in row:
	134	next_is_url=True
	135	if "EXT-X-KEY" in row:
	136	row = row.split(':',1)[1] #skip first part
d05b6699	137	parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
56181f0a	138	metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
84f7ef7d MF	139	return(segments, metadata)
84f7ef7d MF	140
d05b6699	141	def parse_videolist():
5b0549b5	142	page_num = 1
84f7ef7d MF	143	soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
84f7ef7d MF	144	page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
5b0549b5 MF	145	videos_per_page = 8
	146	video_num = 0
	147	while(page_num <= page_tot):
	148	base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
84f7ef7d	149	soup = BeautifulSoup(requests.get(base_url).text)
5b0549b5 MF	150	for article in soup.findAll('article'):
	151	meta = dict(article.attrs)
	152	video = {}
	153	video['title'] = meta['data-title']
	154	video['description'] = meta['data-description']
	155	video['url'] = dict(article.find('a').attrs)['href']
	156	video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
	157	video['num'] = video_num
	158	video['total'] = page_tot * videos_per_page
	159	video_num += 1
	160	yield video
	161	page_num += 1
	162
56181f0a	163
ca2553c7	164	if __name__ == "__main__":
72beea17	165	parser = argparse.ArgumentParser()
1ad04c01 MF	166	group = parser.add_mutually_exclusive_group(required=True)
	167	group.add_argument("-r", "--rss", help="Download all files in rss")
	168	group.add_argument("-u", "--url", help="Download video in url")
	169	group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
5b0549b5	170	parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
72beea17	171	args = parser.parse_args()
d05b6699	172	if args.rss:
84f7ef7d	173	import feedparser
5b0549b5	174	d = feedparser.parse(args.rss)
72beea17	175	for e in d.entries:
84f7ef7d	176	print(("Downloading: %s"%e.title))
5b0549b5 MF	177	if args.no_act:
	178	continue
	179	filename = scrape_player_page(e.link, e.title)
84f7ef7d	180	print(Popen(["avconv","-i",filename,"-vcodec","copy","-acodec","copy", filename+'.mkv'], stdout=PIPE).communicate()[0])
72beea17	181	#print(e.description)
d05b6699 MF	182	if args.mirror:
	183	for video in parse_videolist():
	184	video['title'] = video['title'].replace('/','_')
84f7ef7d MF	185	print(video['title']+'.mkv')
84f7ef7d MF	186	print("{} of {}".format(video['num'], video['total']))
d05b6699	187	if os.path.exists(video['title']+'.mkv'):
84f7ef7d	188	print("Skipping")
d05b6699 MF	189	continue
d05b6699 MF	190	print("Downloading...")
5b0549b5 MF	191	if args.no_act:
	192	continue
	193	ret = scrape_player_page(video['url'], video['title'])
84f7ef7d MF	194	print(ret)
84f7ef7d MF	195	print(Popen(["avconv","-i",video['title']+'.ts',"-vcodec","copy","-acodec","copy", video['title']+'.mkv'], stdout=PIPE).communicate()[0])
d05b6699 MF	196	try:
	197	os.unlink(video['title']+'.ts')
	198	except:
	199	import pdb;pdb.set_trace()
72beea17	200	else:
5b0549b5 MF	201	if not args.no_act:
5b0549b5 MF	202	video = scrape_player_page(args.url, None)
84f7ef7d	203	print(("Downloaded {}".format(args.url)))