[svtplaydump.git] / svtplaydump.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#   
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#   
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup
from subprocess import *
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys, os

def scrape_player_page(url, title):
    """
    Try to scrape the site for video and download.

    url is the player page, either absolute or relative to
    http://www.svtplay.se. title is used as the stem of the output file
    name; when it is empty the page's og:title is used instead, with '|'
    and '/' replaced by '_'.

    Returns a dict with 'title' and 'duration', plus 'url' and 'filename'
    for the stream that was fetched (and 'category' when the player JSON
    carries statistics). Returns None when the page has no player link or
    the player JSON advertises no usable stream.
    """
    if not url.startswith('http'):
        url = "http://www.svtplay.se" + url
    video = {}
    soup = BeautifulSoup(requests.get(url).text)
    players = soup.body('a', {'data-json-href': True}) if soup.body is not None else []
    if not players:
        print("Could not find any streams")
        return None
    video_player = players[0]
    json_href = video_player.attrs['data-json-href']
    if json_href.startswith("/wd"):
        flashvars = requests.get("http://www.svt.se/%s" % json_href).json()
    else:
        flashvars = requests.get("http://www.svtplay.se/%s" % json_href + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    video['title'] = title
    if not title:
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        video['title'] = og_title.replace('|', '_').replace('/', '_')

    # Each stream flavour is tried independently, as before; found_stream
    # records whether any of them produced something to download.
    found_stream = False
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        video['filename'] = video['title'] + ".mp4"
        # Hand rtmpdump the stream URL, not the HTML player page URL.
        print(Popen(["rtmpdump", "-o", video['filename'], "-r", video['url']], stdout=PIPE).communicate()[0])
        found_stream = True
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        video['filename'] = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", video['filename'], rtmp], stdout=PIPE).communicate()[0])
        found_stream = True
    if 'video' in flashvars:
        playlist_url = None
        for reference in flashvars['video']['videoReferences']:
            if reference['url'].endswith("m3u8"):
                playlist_url = reference['url']
        # Only start the playlist download when an m3u8 reference exists;
        # otherwise video['url'] would be missing or point at another stream.
        if playlist_url is not None:
            video['url'] = playlist_url
            video['filename'] = video['title'] + '.ts'
            if 'statistics' in flashvars:
                video['category'] = flashvars['statistics']['category']
            download_from_playlist(video)
            found_stream = True
    if not found_stream:
        print("Could not find any streams")
        return None
    return video

def download_from_playlist(video):
    """
    Download every segment of an Apple HTTP Live Streaming playlist.

    video must contain 'url' (the master playlist) and 'filename' (the
    output file, opened for binary writing). The variant with the highest
    BANDWIDTH is selected, its segments are fetched in order and appended
    to the output file, decrypting with AES-CBC when the segment playlist
    carries an EXT-X-KEY with a URI.

    Returns None. Nothing is written when the master playlist cannot be
    parsed or lists no variants.
    """
    from urllib.parse import urljoin

    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    best_variant = sorted(playlist, key=lambda k: int(k.get('BANDWIDTH', 0)))[-1]
    # Playlist URIs may be relative to the playlist that lists them;
    # urljoin leaves absolute URLs untouched.
    videourl = urljoin(video['url'], best_variant['url'])
    segments, metadata = parse_segment_playlist(requests.get(videourl).text)

    key = None
    key_attributes = metadata.get("EXT-X-KEY", {})
    if 'URI' in key_attributes:
        key_url = urljoin(videourl, key_attributes['URI'].strip('"'))
        # The key is raw binary; .text would decode it to str, which the
        # cipher cannot use as key material.
        key = requests.get(key_url).content

    size = 0
    with open(video['filename'], "wb") as ofile:
        for segment, url in enumerate(segments):
            # end="" keeps the cursor on the same line so "\r" overwrites
            # the previous progress figure.
            print("\r{:.1f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            decryptor = None
            if key is not None:
                # NOTE(review): RFC 8216 describes the implicit IV as the
                # media sequence number in big-endian form, left-padded to
                # 16 bytes. This layout is kept from the original code;
                # confirm against a real encrypted stream before changing.
                iv = struct.pack("IIII", segment, 0, 0, 0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            response = requests.get(urljoin(videourl, url), stream=True)
            try:
                while True:
                    buf = response.raw.read(4096)
                    if not buf:
                        break
                    if decryptor is not None:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection back to the pool.
                response.close()
    # Finish the progress line.
    print("\r{:.1f} MB".format(size / 1024 / 1024))

def parse_playlist(playlist):
    """
    Parse a master (variant) m3u8 playlist.

    playlist is the playlist text. Returns a list with one dict per
    EXT-X-STREAM-INF entry, holding the entry's attributes as strings
    (quoted values keep their quotes) plus 'url', the URI on the following
    line. Blank lines and any other tags are skipped.

    When the text does not start with "#EXTM3U" it is printed and False
    is returned.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    # NAME=value pairs; a double-quoted value may contain commas, '=' and ':'.
    attribute = re.compile(r'([^=,\s]+)=("[^"]*"|[^,]*)')
    items = []
    pending = None  # attributes of a STREAM-INF tag still waiting for its URI
    for line in playlist.splitlines()[1:]:
        line = line.strip()
        if not line:
            continue
        if line.startswith('#EXT-X-STREAM-INF'):
            attribute_text = line.partition(':')[2]
            pending = dict(attribute.findall(attribute_text))
        elif line.startswith('#'):
            # Some other tag or a comment; not part of a variant entry.
            continue
        elif pending is not None:
            pending['url'] = line
            items.append(pending)
            pending = None
    return items

def parse_segment_playlist(playlist):
    """
    Parse a media (segment) m3u8 playlist.

    Returns a (segments, metadata) tuple. segments lists, in order, the
    line following each line that contains 'EXTINF'. metadata is a dict;
    when a line containing "EXT-X-KEY" is present, metadata["EXT-X-KEY"]
    maps that tag's attribute names to their values (quotes are kept).
    """
    assert playlist.startswith("#EXTM3U")
    # Splits on commas while keeping quoted strings (which may contain
    # commas) in one piece.
    attribute_splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segment_urls = []
    tag_data = {}
    expect_url = False
    for line in playlist.splitlines():
        if expect_url:
            # The line right after an EXTINF tag is taken as the segment URI.
            segment_urls.append(line)
            expect_url = False
            continue
        expect_url = 'EXTINF' in line
        if "EXT-X-KEY" in line:
            attribute_text = line.split(':', 1)[1]  # drop the tag name
            # The split yields '', field, ',', field, ..., ''; trim the
            # empty ends and let the '=' filter discard the commas.
            fields = attribute_splitter.split(attribute_text)[1:-1]
            tag_data["EXT-X-KEY"] = {
                name: value
                for name, value in (field.split('=', 1) for field in fields if '=' in field)
            }
    return (segment_urls, tag_data)

def parse_videolist():
    """
    Generate one dict per video listed on the svtplay.se video pages.

    Each dict has 'title', 'description', 'url', 'thumb-url', 'num' (a
    running zero-based counter) and 'total' (number of pages times an
    assumed 8 videos per page).
    """
    # This call does not work for getting the pages, we use it for the
    # page totals only.
    pager_html = requests.get("http://www.svtplay.se/ajax/videospager").text
    pager = BeautifulSoup(pager_html)
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    for page in range(1, last_page + 1):
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attributes = dict(article.attrs)
            entry = {
                'title': attributes['data-title'],
                'description': attributes['data-description'],
                'url': dict(article.find('a').attrs)['href'],
                'thumb-url': dict(article.find('img', {}).attrs)['src'],
                'num': counter,
                'total': last_page * per_page,
            }
            counter += 1
            yield entry


if __name__ == "__main__":
    # Command-line entry point. Exactly one of --rss, --url or --mirror
    # selects the mode; --no_act only prints what would be done.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    args = parser.parse_args()
    if args.rss:
        import feedparser
        feed = feedparser.parse(args.rss)
        for entry in feed.entries:
            print("Downloading: %s" % entry.title)
            if args.no_act:
                continue
            # '/' cannot appear in a file name; same substitution as mirror mode.
            video = scrape_player_page(entry.link, entry.title.replace('/', '_'))
            # scrape_player_page returns a dict (or None), not a file name.
            source = video.get('filename') if video else None
            if not source:
                print("Nothing to convert for %s" % entry.title)
                continue
            print(Popen(["avconv", "-i", source, "-vcodec", "copy", "-acodec", "copy", source + '.mkv'], stdout=PIPE).communicate()[0])
    elif args.mirror:
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            target = video['title'] + '.mkv'
            print(target)
            print("{} of {}".format(video['num'], video['total']))
            if os.path.exists(target):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            ret = scrape_player_page(video['url'], video['title'])
            print(ret)
            if not ret:
                continue
            source = ret.get('filename', video['title'] + '.ts')
            if not os.path.exists(source):
                print("Download failed, nothing to convert")
                continue
            converter = Popen(["avconv", "-i", source, "-vcodec", "copy", "-acodec", "copy", target], stdout=PIPE)
            print(converter.communicate()[0])
            if converter.returncode != 0:
                # Keep the downloaded file so the conversion can be retried.
                print("Conversion failed, keeping {}".format(source))
                continue
            try:
                os.unlink(source)
            except OSError as error:
                print("Could not remove {}: {}".format(source, error))
    else:
        # --url mode; elif above keeps this branch from also running after --rss.
        if args.no_act:
            print("Would download {}".format(args.url))
        elif scrape_player_page(args.url, None):
            print("Downloaded {}".format(args.url))
Commit	Line	Data
84f7ef7d	1	#!/usr/bin/env python3
56181f0a	2	# -- coding: utf-8 --
ca2553c7 MF	3	#
	4	# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
	5	#
	6	# This program is free software: you can redistribute it and/or modify
	7	# it under the terms of the GNU General Public License as published by
	8	# the Free Software Foundation, either version 3 of the License, or
	9	# (at your option) any later version.
	10	#
	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
	17	# along with this program. If not, see <http://www.gnu.org/licenses/>
	18	#
	19	# Changelog:
d05b6699	20	# 0.4 added mirror mode.
56181f0a	21	# 0.3 added apple streaming playlist parsing and decryption
ca2553c7 MF	22	# 0.2 added python 2.4 urlparse compatibility
	23	# 0.1 initial release
	24
84f7ef7d	25	from bs4 import BeautifulSoup
ca2553c7	26	from subprocess import *
89a00fa0	27	import re
56181f0a MF	28	from Crypto.Cipher import AES
56181f0a MF	29	import struct
72beea17	30	import argparse
84f7ef7d	31	import requests
d05b6699	32	import sys, os
ca2553c7	33
5b0549b5	34	def scrape_player_page(url, title):
d05b6699 MF	35	"""
	36	Try to scrape the site for video and download.
	37	"""
	38	if not url.startswith('http'):
	39	url = "http://www.svtplay.se" + url
	40	video = {}
84f7ef7d	41	soup = BeautifulSoup(requests.get(url).text)
d05b6699	42	video_player = soup.body('a',{'data-json-href':True})[0]
84f7ef7d MF	43	if video_player.attrs['data-json-href'].startswith("/wd"):
84f7ef7d MF	44	flashvars = requests.get("http://www.svt.se/%s"%video_player.attrs['data-json-href']).json()
d3ebb57d	45	else:
84f7ef7d MF	46	flashvars = requests.get("http://www.svtplay.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
84f7ef7d MF	47	video['duration'] = video_player.attrs.get('data-length',0)
d05b6699 MF	48	video['title'] = title
d05b6699 MF	49	if not title:
84f7ef7d	50	video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('\|','_').replace('/','_')
ca2553c7	51	if 'dynamicStreams' in flashvars:
d05b6699 MF	52	video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
d05b6699 MF	53	filename = video['title']+".mp4"
84f7ef7d	54	print(Popen(["rtmpdump","-o"+filename,"-r", url], stdout=PIPE).communicate()[0])
ca2553c7 MF	55	if 'pathflv' in flashvars:
ca2553c7 MF	56	rtmp = flashvars['pathflv'][0]
d05b6699	57	filename = video['title']+".flv"
84f7ef7d	58	print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
89a00fa0	59	if 'video' in flashvars:
56181f0a MF	60	for reference in flashvars['video']['videoReferences']:
56181f0a MF	61	if reference['url'].endswith("m3u8"):
d05b6699 MF	62	video['url']=reference['url']
	63	video['filename'] = video['title']+'.ts'
	64	if 'statistics' in flashvars:
	65	video['category'] = flashvars['statistics']['category']
	66	download_from_playlist(video)
ca2553c7	67	else:
84f7ef7d	68	print("Could not find any streams")
ca2553c7	69	return
d05b6699 MF	70	return video
	71
	72	def download_from_playlist(video):
84f7ef7d MF	73	playlist = parse_playlist(requests.get(video['url']).text)
	74	if not playlist:
	75	return
56181f0a	76	videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
84f7ef7d	77	segments, metadata = parse_segment_playlist(requests.get(videourl).text)
56181f0a	78	if "EXT-X-KEY" in metadata:
84f7ef7d	79	key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
56181f0a MF	80	decrypt=True
	81	else:
	82	decrypt=False
84f7ef7d	83	with open("%s"%video['filename'],"wb") as ofile:
56181f0a	84	segment=0
72beea17	85	size = 0
56181f0a	86	for url in segments:
84f7ef7d MF	87	ufile = requests.get(url, stream=True).raw
84f7ef7d MF	88	print("\r{} MB".format(size/1024/1024))
72beea17	89	sys.stdout.flush()
56181f0a MF	90	if decrypt:
	91	iv=struct.pack("IIII",segment,0,0,0)
	92	decryptor = AES.new(key, AES.MODE_CBC, iv)
	93	while(True):
84f7ef7d MF	94	buf = ufile.read(4096)
84f7ef7d MF	95	if not buf:
56181f0a	96	break
84f7ef7d MF	97	if decrypt:
	98	buf = decryptor.decrypt(buf)
	99	ofile.write(buf)
	100	size += len(buf)
56181f0a MF	101	segment += 1
	102
	103	def parse_playlist(playlist):
d05b6699	104	if not playlist.startswith("#EXTM3U"):
84f7ef7d	105	print(playlist)
d05b6699	106	return False
56181f0a MF	107	playlist = playlist.splitlines()[1:]
	108	items=[]
	109	for (metadata_string,url) in zip(playlist[0::2], playlist[1::2]):
	110	md = dict()
	111	assert 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]
	112	for item in metadata_string.split(':')[1].split(','):
	113	if '=' in item:
	114	md.update([item.split('='),])
	115	md['url']=url
	116	items.append(md)
	117	return items
	118
	119	def parse_segment_playlist(playlist):
	120	assert playlist.startswith("#EXTM3U")
	121	PATTERN = re.compile(r'''((?:[^,"']\|"[^"]"\|'[^']')+)''')
	122	segments = []
	123	next_is_url=False
	124	metadata = {}
	125	for row in playlist.splitlines():
	126	if next_is_url:
	127	segments.append(row)
	128	next_is_url=False
	129	continue
	130	if 'EXTINF' in row:
	131	next_is_url=True
	132	if "EXT-X-KEY" in row:
	133	row = row.split(':',1)[1] #skip first part
d05b6699	134	parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
56181f0a	135	metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
84f7ef7d MF	136	return(segments, metadata)
84f7ef7d MF	137
d05b6699	138	def parse_videolist():
5b0549b5	139	page_num = 1
84f7ef7d MF	140	soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
84f7ef7d MF	141	page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
5b0549b5 MF	142	videos_per_page = 8
	143	video_num = 0
	144	while(page_num <= page_tot):
	145	base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
84f7ef7d	146	soup = BeautifulSoup(requests.get(base_url).text)
5b0549b5 MF	147	for article in soup.findAll('article'):
	148	meta = dict(article.attrs)
	149	video = {}
	150	video['title'] = meta['data-title']
	151	video['description'] = meta['data-description']
	152	video['url'] = dict(article.find('a').attrs)['href']
	153	video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
	154	video['num'] = video_num
	155	video['total'] = page_tot * videos_per_page
	156	video_num += 1
	157	yield video
	158	page_num += 1
	159
56181f0a	160
ca2553c7	161	if __name__ == "__main__":
72beea17	162	parser = argparse.ArgumentParser()
1ad04c01 MF	163	group = parser.add_mutually_exclusive_group(required=True)
	164	group.add_argument("-r", "--rss", help="Download all files in rss")
	165	group.add_argument("-u", "--url", help="Download video in url")
	166	group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
5b0549b5	167	parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
72beea17	168	args = parser.parse_args()
d05b6699	169	if args.rss:
84f7ef7d	170	import feedparser
5b0549b5	171	d = feedparser.parse(args.rss)
72beea17	172	for e in d.entries:
84f7ef7d	173	print(("Downloading: %s"%e.title))
5b0549b5 MF	174	if args.no_act:
	175	continue
	176	filename = scrape_player_page(e.link, e.title)
84f7ef7d	177	print(Popen(["avconv","-i",filename,"-vcodec","copy","-acodec","copy", filename+'.mkv'], stdout=PIPE).communicate()[0])
72beea17	178	#print(e.description)
d05b6699 MF	179	if args.mirror:
	180	for video in parse_videolist():
	181	video['title'] = video['title'].replace('/','_')
84f7ef7d MF	182	print(video['title']+'.mkv')
84f7ef7d MF	183	print("{} of {}".format(video['num'], video['total']))
d05b6699	184	if os.path.exists(video['title']+'.mkv'):
84f7ef7d	185	print("Skipping")
d05b6699 MF	186	continue
d05b6699 MF	187	print("Downloading...")
5b0549b5 MF	188	if args.no_act:
	189	continue
	190	ret = scrape_player_page(video['url'], video['title'])
84f7ef7d MF	191	print(ret)
84f7ef7d MF	192	print(Popen(["avconv","-i",video['title']+'.ts',"-vcodec","copy","-acodec","copy", video['title']+'.mkv'], stdout=PIPE).communicate()[0])
d05b6699 MF	193	try:
	194	os.unlink(video['title']+'.ts')
	195	except:
	196	import pdb;pdb.set_trace()
72beea17	197	else:
5b0549b5 MF	198	if not args.no_act:
5b0549b5 MF	199	video = scrape_player_page(args.url, None)
84f7ef7d	200	print(("Downloaded {}".format(args.url)))