# Don't crash on missing file
# [svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
class Video(dict):
    """Dictionary subclass whose keys are also reachable as attributes.

    Used throughout the script as a light-weight record for video
    metadata (title, url, filename, thumb-url, ...).
    """

    def __init__(self, *args, **kwargs):
        # Accept the same arguments dict() itself accepts and use the
        # free dict.update to populate the keys.
        self.update(dict(*args, **kwargs))

    def __setattr__(self, name, value):
        # Attribute assignment stores a key: v.title = x == v['title'] = x
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # Attribute access reads a key.  Translate a missing key into
        # AttributeError so hasattr()/getattr(..., default) behave as
        # the attribute protocol requires (the original leaked KeyError).
        try:
            return self.__getitem__(name)
        except KeyError:
            raise AttributeError(name)

    def is_downloaded(self):
        # The original did `raise("NotImplemented")`, which is a
        # TypeError at runtime (a str is not an exception); raise the
        # proper built-in exception type instead.
        raise NotImplementedError
48
def scrape_player_page(video):
    """Scrape an svtplay/oppetarkiv player page and download the video.

    `video` is a dict with at least a 'url' key ('title' is optional
    and will be scraped from the page when absent or empty).  Fills in
    title, genre, duration, timestamp, stream url and filename,
    triggers the actual download, and returns the enriched dict, or
    False when no stream could be found.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    # .get() instead of ['title']: the -u command-line branch passes a
    # dict without a 'title' key, which used to raise KeyError here.
    if not video.get('title'):
        # Fall back to the og:title meta tag; sanitize characters that
        # are unsafe in filenames.
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # BUG FIX: the original referenced an undefined name `url` here
        # (NameError); the rtmp stream url lives in video['url'].
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            # feedparser yields a naive UTC time tuple; make it aware,
            # then convert to local time.
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
97
def download_from_playlist(video):
    """Download the best-quality variant of an HLS playlist to video['filename'].

    Fetches the master playlist at video['url'], picks the variant with
    the highest advertised BANDWIDTH, streams every media segment into
    a single .ts file (decrypting with AES-128-CBC when the playlist
    carries an EXT-X-KEY tag), and finally fetches the thumbnail stream
    if a 'thumb-url' key is present.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    # Highest bandwidth variant wins.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'): #if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        # NOTE(review): .text yields a str; PyCrypto's AES.new expects
        # bytes on Python 3 — confirm this path works when encrypted.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        decrypt=True
    else:
        decrypt=False
    with open("%s"%video['filename'],"wb") as ofile:
        segment=0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            # \r keeps a single updating progress line on the terminal.
            print("\r{0:.2f} MB".format(size/1024/1024),end="")
            sys.stdout.flush()
            if decrypt:
                # IV derived from the segment index, packed as four
                # native u32s.  NOTE(review): RFC 8216 specifies a
                # big-endian 128-bit media sequence number — confirm.
                iv=struct.pack("IIII",segment,0,0,0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while(True):
                try:
                    buf = ufile.read(4096)
                except (socket.error, TypeError) as e:
                    # Best-effort: abandon this file on a read error.
                    print("Error reading, skipping file")
                    print(e)
                    return
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        # Raw stream object; remux() reads it when attaching the thumbnail.
        video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
138
def parse_playlist(playlist):
    """Parse an HLS master playlist into a list of variant descriptors.

    Returns a list of Video dicts, one per EXT-X-STREAM-INF entry,
    holding that line's attributes (e.g. 'BANDWIDTH') plus a 'url' key
    for the variant playlist.  Returns False for non-M3U input and an
    empty list when the playlist contains no variant streams.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip everything before the first variant entry.  The original
    # `while not ... in playlist[0]` loop raised IndexError on any
    # playlist with no EXT-X-STREAM-INF line at all; guard that case.
    start = next((i for i, line in enumerate(lines)
                  if 'EXT-X-STREAM-INF' in line), None)
    if start is None:
        return []
    lines = lines[start:]
    items = []
    # Variants come as (metadata line, url line) pairs.
    for metadata_string, url in zip(lines[0::2], lines[1::2]):
        md = Video()
        if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items
157
def parse_segment_playlist(playlisturl):
    """Fetch a media playlist and return (segment_urls, metadata).

    `segment_urls` lists every media segment URI (relative paths are
    resolved against the playlist's directory).  `metadata` maps
    "EXT-X-KEY" to a dict of that tag's attributes (METHOD, URI, ...)
    when the playlist is encrypted.
    """
    body = requests.get(playlisturl).text
    assert body.startswith("#EXTM3U")
    # Split on commas that sit outside quoted strings, keeping quotes.
    splitter = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    urls = []
    meta = {}
    expect_url = False
    for line in body.splitlines():
        if expect_url:
            # The line following an EXTINF tag is the segment URI.
            if line.startswith('http'):
                urls.append(line)
            else:
                urls.append("{}/{}".format(os.path.dirname(playlisturl), line))
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            # Drop the tag name, split the attribute list on unquoted
            # commas, and build a dict of KEY=VALUE pairs.
            attr_parts = splitter.split(line.split(':', 1)[1])[1:-1]
            meta["EXT-X-KEY"] = dict(part.split('=', 1)
                                     for part in attr_parts if '=' in part)
    return (urls, meta)
179
def parse_videolist():
    """Generator yielding a Video for every programme on svtplay.se.

    Pages through the site's AJAX video listing; each yielded Video
    carries title, description, url, thumb-url, plus a running index
    ('num') and the estimated total ('total').
    """
    page_num = 1
    # This endpoint is only used to read the page total; the actual
    # listings come from /ajax/videos below.
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while(page_num <= page_tot):
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
            video['num'] = video_num
            # Estimate only: the last page may hold fewer than 8 items.
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1
201
def remux(video, xml=None):
    """Remux the downloaded .ts file into an .mkv via mkvmerge.

    Optionally embeds `xml` as Matroska global tags and the fetched
    thumbnail as an attachment, files the result into a genre
    subdirectory when video['genre'] is set, deletes the intermediate
    files, and stamps the .mkv with the original broadcast time.
    """
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f: #FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of intermediates.  Only swallow filesystem
    # errors; the original bare `except:` also hid KeyboardInterrupt
    # and SystemExit.
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        ts = video['timestamp'].timestamp()
        try:
            # Set both atime and mtime to the broadcast time.
            os.utime(video['path'], times=(ts, ts))
        except FileNotFoundError as e:
            # mkvmerge may have failed; report rather than crash.
            print(e)
234
235
def mkv_metadata(video):
    """Build a Matroska global-tags XML document from video metadata.

    Emits one <Simple> Name/String pair per whitelisted key
    (title, description, url, genre) and returns the XML as a string
    suitable for mkvmerge's --global-tags.
    """
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = doc.new_tag("Tags")
    tag = doc.new_tag("Tag")
    tags.append(tag)
    doc.append(tags)
    # Target type value 50 marks these as movie/episode level tags.
    targets = doc.new_tag("Targets")
    target_value = doc.new_tag("TargetTypeValue")
    target_value.string = str(50)
    targets.append(target_value)
    tag.append(targets)
    wanted = ('title', 'description', 'url', 'genre')
    for key in video:
        if key not in wanted:
            continue
        simple = doc.new_tag('Simple')
        name_tag = doc.new_tag('Name')
        name_tag.string = key.upper()
        simple.append(name_tag)
        value_tag = doc.new_tag('String')
        value_tag.string = video[key]
        simple.append(value_tag)
        tag.append(simple)
    return str(doc)
261
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    # The three modes are mutually exclusive, so chain them with elif:
    # the original fell through from the rss/mirror branches into the
    # url branch and crashed on args.url being None.
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # scrape_player_page returns False when no stream is found.
            if args.no_remux or not video:
                continue
            # BUG FIX: the original called self.remux() at module level,
            # where no `self` exists.
            remux(video)
            #print(e.description)
    elif args.mirror:
        # .seen holds one touch-file per downloaded title so a rerun
        # skips what is already mirrored.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if args.no_remux or not video:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if not args.no_act:
            # 'title' must be present (scrape_player_page reads it);
            # None makes the scraper pull the title from the page.
            video = scrape_player_page({'url': args.url, 'title': None})
            # BUG FIX: the original called remux({'title': e.title})
            # here, where `e` is undefined and the dict lacks the
            # 'filename' key remux() needs; remux the scraped video.
            if not args.no_remux and video:
                remux(video)
        print(("Downloaded {}".format(args.url)))