svtplaydump.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf-8 -*-
   3 #
   4 #   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
   5 #
   6 #   This program is free software: you can redistribute it and/or modify
   7 #   it under the terms of the GNU General Public License as published by
   8 #   the Free Software Foundation, either version 3 of the License, or
   9 #   (at your option) any later version.
  10 #
  11 #   This program is distributed in the hope that it will be useful,
  12 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #   GNU General Public License for more details.
  15 #
  16 #   You should have received a copy of the GNU General Public License
  17 #   along with this program.  If not, see <http://www.gnu.org/licenses/>
  18 #
  19 # Changelog:
  20 # 0.4 added mirror mode.
  21 # 0.3 added apple streaming playlist parsing and decryption
  22 # 0.2 added python 2.4 urlparse compatibility
  23 # 0.1 initial release
  24
  25 from bs4 import BeautifulSoup, Doctype
  26 from subprocess import *
  27 import re
  28 from Crypto.Cipher import AES
  29 import struct
  30 import argparse
  31 import requests
  32 import sys, os
  33 import socket
  34 import feedparser
  35 from datetime import datetime, timezone
  36 class Video(dict):
  37     def __init__(self, *args, **kwargs):
  38         self.update(dict(*args, **kwargs))  # use the free update to set keys
  39
  40     def __setattr__(self, name, value):
  41         return self.__setitem__(name,value)
  42
  43     def __getattr__(self, name):
  44         return self.__getitem__(name)
  45
  46     def is_downloaded(self):
  47         raise("NotImplemented")
  48
  49 def scrape_player_page(video):
  50     """
  51     Try to scrape the site for video and download.
  52     """
  53     if not video['url'].startswith('http'):
  54         video['url'] = "http://www.svtplay.se" + video['url']
  55     soup = BeautifulSoup(requests.get(video['url']).text)
  56     video_player = soup.body('a',{'data-json-href':True})[0]
  57     if 'oppetarkiv.se' in video['url']:
  58         flashvars = requests.get("http://www.oppetarkiv.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
  59     else:
  60         if video_player.attrs['data-json-href'].startswith("/wd"):
  61             flashvars = requests.get("http://www.svt.se/%s"%video_player.attrs['data-json-href']).json()
  62         else:
  63             flashvars = requests.get("http://www.svtplay.se/%s"%video_player.attrs['data-json-href']+"?output=json").json()
  64     video['duration'] = video_player.attrs.get('data-length',0)
  65     if not video['title']:
  66         video['title'] = soup.find('meta',{'property':'og:title'}).attrs['content'].replace('|','_').replace('/','_')
  67     if not 'genre' in video:
  68         if soup.find(text='Kategori:'):
  69             video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
  70         else:
  71             video['genre'] = 'Ingen Genre'
  72     if 'dynamicStreams' in flashvars:
  73         video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] +'.mp4'
  74         filename = video['title']+".mp4"
  75         print(Popen(["rtmpdump","-o"+filename,"-r", url], stdout=PIPE).communicate()[0])
  76     if 'pathflv' in flashvars:
  77         rtmp = flashvars['pathflv'][0]
  78         filename = video['title']+".flv"
  79         print(Popen(["mplayer","-dumpstream","-dumpfile",filename, rtmp], stdout=PIPE).communicate()[0])
  80     if not 'timestamp' in video:
  81         if soup.find_all(datetime=True):
  82             xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
  83             video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6]) #naive in utc
  84             video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None) #convert to local time
  85     if 'video' in flashvars:
  86         for reference in flashvars['video']['videoReferences']:
  87             if 'm3u8' in reference['url']:
  88                 video['url']=reference['url']
  89                 video['filename'] = video['title']+'.ts'
  90                 if 'statistics' in flashvars:
  91                     video['category'] = flashvars['statistics']['category']
  92         if not download_from_playlist(video):
  93             return False
  94     if not 'url' in video:
  95         print("Could not find any streams")
  96         return False
  97     return video
  98
  99 def download_from_playlist(video):
 100     playlist = parse_playlist(requests.get(video['url']).text)
 101     if not playlist:
 102         return
 103     videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
 104     if not videourl.startswith('http'): #if relative path
 105         videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
 106     segments, metadata = parse_segment_playlist(videourl)
 107     if "EXT-X-KEY" in metadata:
 108         key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
 109         decrypt=True
 110     else:
 111         decrypt=False
 112     with open("%s"%video['filename'],"wb") as ofile:
 113         segment=0
 114         size = 0
 115         for url in segments:
 116             try:
 117                 ufile = requests.get(url, stream=True).raw
 118             except:
 119                 print("Error reading, skipping file") #FIXME mark file as failed
 120                 print(sys.exc_info()[1])
 121                 return False
 122             print("\r{0:.2f} MB".format(size/1024/1024),end="")
 123             sys.stdout.flush()
 124             if decrypt:
 125                 iv=struct.pack("IIII",segment,0,0,0)
 126                 decryptor = AES.new(key, AES.MODE_CBC, iv)
 127             while(True):
 128                 try:
 129                     buf = ufile.read(4096)
 130                 except:
 131                     print("Error reading, skipping file") #FIXME mark file as failed
 132                     print(sys.exc_info()[1])
 133                     return False
 134                 if not buf:
 135                     break
 136                 if decrypt:
 137                     buf = decryptor.decrypt(buf)
 138                 ofile.write(buf)
 139                 size += len(buf)
 140             segment += 1
 141
 142     if 'thumb-url' in video:
 143         video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
 144     return True
 145
 146 def parse_playlist(playlist):
 147     if not playlist.startswith("#EXTM3U"):
 148         print(playlist)
 149         return False
 150     playlist = playlist.splitlines()
 151     while not 'EXT-X-STREAM-INF' in playlist[0]:
 152         playlist = playlist[1:]
 153     items=[]
 154     for (metadata_string,url) in zip(playlist[0::2], playlist[1::2]):
 155         md = Video()
 156         if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
 157             continue
 158         for item in metadata_string.split(':')[1].split(','):
 159             if '=' in item:
 160                 md.update([item.split('='),])
 161         md['url']=url
 162         items.append(md)
 163     return items
 164
 165 def parse_segment_playlist(playlisturl):
 166     playlist = requests.get(playlisturl).text
 167     assert playlist.startswith("#EXTM3U")
 168     PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
 169     segments = []
 170     next_is_url=False
 171     metadata = {}
 172     for row in playlist.splitlines():
 173         if next_is_url:
 174             if not row.startswith('http'): #if relative path
 175                 row = "{}/{}".format(os.path.dirname(playlisturl), row)
 176             segments.append(row)
 177             next_is_url=False
 178             continue
 179         if 'EXTINF' in row:
 180             next_is_url=True
 181         if "EXT-X-KEY" in row:
 182              row = row.split(':',1)[1] #skip first part
 183              parts = PATTERN.split(row)[1:-1] #do magic re split and keep quotes
 184              metadata["EXT-X-KEY"] = dict([part.split('=',1) for part in parts if '=' in part]) #throw away the commas and make dict of the pairs
 185     return(segments, metadata)
 186
 187 def parse_videolist():
 188     page_num = 1
 189     soup = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
 190     page_tot = int(soup.find('a',{'data-currentpage':True}).attrs['data-lastpage'])
 191     videos_per_page = 8
 192     video_num = 0
 193     while(page_num <= page_tot):
 194         base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
 195         soup = BeautifulSoup(requests.get(base_url).text)
 196         for article in soup.findAll('article'):
 197             meta = dict(article.attrs)
 198             video = Video()
 199             video['title'] = meta['data-title']
 200             video['description'] = meta['data-description']
 201             video['url'] = dict(article.find('a').attrs)['href']
 202             video['thumb-url'] = dict(article.find('img',{}).attrs)['src']
 203             video['num'] = video_num
 204             video['total'] = page_tot * videos_per_page
 205             video_num += 1
 206             yield video
 207         page_num += 1
 208
 209 def remux(video, xml=None):
 210     basename = video['filename'].split('.ts')[0]
 211     if 'genre' in video:
 212         if not os.path.exists(video['genre']):
 213             os.mkdir(video['genre'])
 214         video['path'] = os.path.join(video['genre'],basename+'.mkv')
 215     else:
 216         video['path'] = basename+'.mkv'
 217     command = ["mkvmerge","-o",video['path'], '--title',video['title']]
 218
 219     if xml:
 220         with open(basename+'.xml','w') as f:
 221             f.write(xml)
 222             command.extend(['--global-tags',basename+'.xml'])
 223     if 'thumb' in video:
 224         with open('thumbnail.jpg','wb') as f: #FIXME use title instead for many downloaders
 225             f.write(video['thumb'].read())
 226             command.extend(['--attachment-description', "Thumbnail",
 227                  '--attachment-mime-type', 'image/jpeg',
 228                  '--attach-file', 'thumbnail.jpg'])
 229     command.append(video['filename'])
 230     print(Popen(command, stdout=PIPE).communicate()[0])
 231     for fname in (video['filename'], basename+'.xml','thumbnail.jpg'):
 232         try:
 233             os.unlink(fname)
 234         except:
 235             pass
 236     if 'timestamp' in video:
 237         try:
 238             os.utime(video['path'], times=(video['timestamp'].timestamp(),video['timestamp'].timestamp()))
 239         except FileNotFoundError as e:
 240             print(e)
 241
 242
 243 def mkv_metadata(video):
 244     root = BeautifulSoup(features='xml')
 245     root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
 246     tags = root.new_tag("Tags")
 247     tag = root.new_tag("Tag")
 248     tags.append(tag)
 249     root.append(tags)
 250     keep = ('title','description', 'url','genre')
 251     targets = root.new_tag("Targets")
 252     ttv = root.new_tag("TargetTypeValue")
 253     ttv.string = str(50)
 254     targets.append(ttv)
 255     tag.append(targets)
 256     for key in video:
 257         if not key in keep:
 258             continue
 259         simple = root.new_tag('Simple')
 260         name = root.new_tag('Name')
 261         name.string=key.upper()
 262         simple.append(name)
 263         sstring = root.new_tag('String')
 264         sstring.string=video[key]
 265         simple.append(sstring)
 266         tag.append(simple)
 267     return str(root)
 268
 269 if __name__ == "__main__":
 270     parser = argparse.ArgumentParser()
 271     group = parser.add_mutually_exclusive_group(required=True)
 272     group.add_argument("-r", "--rss", help="Download all files in rss")
 273     group.add_argument("-u", "--url", help="Download video in url")
 274     group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
 275     parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
 276     parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
 277
 278     args = parser.parse_args()
 279     if args.rss:
 280         d = feedparser.parse(args.rss)
 281         for e in d.entries:
 282             print(("Downloading: %s"%e.title))
 283             if args.no_act:
 284                 continue
 285             video = scrape_player_page({'title':e.title,'url':e.link})
 286             if args.no_remux:
 287                 continue
 288             self.remux(video)
 289         #print(e.description)
 290     if args.mirror:
 291         if not os.path.exists('.seen'):
 292             os.mkdir('.seen')
 293         for video in parse_videolist():
 294             video['title'] = video['title'].replace('/','_')
 295             print(video['title']+'.mkv')
 296             print("{} of {}".format(video['num'], video['total']))
 297
 298             if os.path.exists(os.path.join('.seen',video['title'])):
 299                 print("Skipping")
 300                 continue
 301             print("Downloading...")
 302             if args.no_act:
 303                 continue
 304             open(os.path.join('.seen',video['title']),'w').close() #touch
 305             ret = scrape_player_page(video)
 306             if not ret:
 307                 if not os.path.exists('.failed'):
 308                     os.mkdir('.failed')
 309                 open(os.path.join('.failed',video['title']),'w').close() #touch
 310                 continue
 311             video = ret
 312             if args.no_remux:
 313                 continue
 314             xml = mkv_metadata(video)
 315             remux(video, xml)
 316
 317     else:
 318         if not args.no_act:
 319             video = scrape_player_page({'url':args.url})
 320         if not args.no_remux:
 321             remux({'title':e.title})
 322         print(("Downloaded {}".format(args.url)))