#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
# 0.1 initial release

from bs4 import BeautifulSoup, Doctype
from subprocess import Popen, PIPE
import re
from Crypto.Cipher import AES
import struct
import argparse
import requests
import sys
import os
import socket
import feedparser
from datetime import datetime, timezone


class Video(dict):
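    """Dict subclass that also allows attribute-style access to its keys.

    Example: v = Video(title='foo'); v.title == v['title'] == 'foo'.
    """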
    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        return self.__getitem__(name)

    def is_downloaded(self):
        raise NotImplementedError  # raising a plain string is a TypeError in python3

def scrape_player_page(video):
    """
    Try to scrape the site for video and download.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not video.get('title'):  # .get(): a bare {'url': ...} dict must not raise KeyError
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])  # was bare 'url', a NameError
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if 'timestamp' not in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        download_from_playlist(video)
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video

def download_from_playlist(video):
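    """Fetch the HLS master playlist, pick the highest-bandwidth variant and
    download all its segments into video['filename'], decrypting AES-128
    encrypted streams on the fly. Also grabs the thumbnail when one is listed.
    """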
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content  # .content: the AES key must be bytes, not str
        decrypt = True
    else:
        decrypt = False
    with open("%s" % video['filename'], "wb") as ofile:
        segment = 0
        size = 0
        for url in segments:
            ufile = requests.get(url, stream=True).raw
            print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
            sys.stdout.flush()
            if decrypt:
                # per the HLS spec the default IV is the segment's media
                # sequence number as a 128-bit big-endian integer
                iv = struct.pack(">IIII", 0, 0, 0, segment)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            while True:
                try:
                    buf = ufile.read(4096)
                except (socket.error, TypeError) as e:
                    print("Error reading, skipping file")
                    print(e)
                    return
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw

def parse_playlist(playlist):
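    """Parse an HLS master playlist into a list of Video dicts, one per
    EXT-X-STREAM-INF entry, with the stream attributes (BANDWIDTH, ...) and
    the variant 'url' as keys.
    """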
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    playlist = playlist.splitlines()
    while 'EXT-X-STREAM-INF' not in playlist[0]:
        playlist = playlist[1:]
    items = []
    for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                md.update([item.split('='), ])
        md['url'] = url
        items.append(md)
    return items

def parse_segment_playlist(playlisturl):
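    """Parse a variant playlist into (segment urls, metadata), where metadata
    holds the EXT-X-KEY attributes when the stream is encrypted.
    """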
    playlist = requests.get(playlisturl).text
    assert playlist.startswith("#EXTM3U")
    PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    segments = []
    next_is_url = False
    metadata = {}
    for row in playlist.splitlines():
        if next_is_url:
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(os.path.dirname(playlisturl), row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            row = row.split(':', 1)[1]  # skip first part
            parts = PATTERN.split(row)[1:-1]  # do magic re split and keep quotes
            metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if '=' in part])  # throw away the commas and make dict of the pairs
    return segments, metadata

def parse_videolist():
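    """Generator that pages through svtplay's AJAX video listing and yields a
    Video (title, description, url, thumb-url, num, total) per article found.
    """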
    page_num = 1
    soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)  # this call does not work for getting the pages, we use it for the page totals only
    page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    videos_per_page = 8
    video_num = 0
    while page_num <= page_tot:
        base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
        soup = BeautifulSoup(requests.get(base_url).text)
        for article in soup.findAll('article'):
            meta = dict(article.attrs)
            video = Video()
            video['title'] = meta['data-title']
            video['description'] = meta['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = video_num
            video['total'] = page_tot * videos_per_page
            video_num += 1
            yield video
        page_num += 1

def remux(video, xml=None):
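    """Remux the downloaded .ts into an .mkv (sorted into a genre directory
    when one is known), attach global tags and thumbnail, clean up the
    intermediate files, and stamp the result with the broadcast time.
    """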
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:  # the file may legitimately not exist; don't swallow other errors
            pass
    if 'timestamp' in video:
        os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))


def mkv_metadata(video):
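    """Build a Matroska global-tags XML document (one Simple tag per kept
    key: title, description, url, genre) and return it as a string.
    """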
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

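    # Example invocations (the feed URL is only an illustration):
    #   svtplaydump.py -u /video/12345                     single video, remuxed to mkv
    #   svtplaydump.py -m                                  mirror everything not yet in .seen
    #   svtplaydump.py -r http://example.com/feed.rss -n   dry run over an RSS feed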
    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print("Downloading: %s" % e.title)
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            if not video or args.no_remux:  # scrape returns False when no stream was found
                continue
            remux(video)  # was self.remux(), but there is no class here
            # print(e.description)
    if args.mirror:
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            video = scrape_player_page(video)
            if not video or args.no_remux:  # scrape returns False when no stream was found
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    elif args.url:  # a bare else would also run (and crash) in rss mode
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            if video and not args.no_remux:
                remux(video)  # was remux({'title': e.title}): 'e' does not exist in this branch
            print("Downloaded {}".format(args.url))