# (gitweb scrape residue removed here: a stray "]>", the repository
#  navigation line "git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py",
#  and the blob hash 8b4a3c25231fa4a299e6493e0596cea37dba4fcd)
#!/usr/bin/env python3.4
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
import argparse
import os
import re
import struct
import sys
from datetime import datetime, timezone
from pathlib import Path
from subprocess import *

import feedparser
import requests
from bs4 import BeautifulSoup, Doctype
from Crypto.Cipher import AES
39 def __init__ ( self
, * args
, ** kwargs
):
40 self
. update ( dict (* args
, ** kwargs
)) # use the free update to set keys
42 def __setattr__ ( self
, name
, value
):
43 return self
.__ setitem
__ ( name
, value
)
45 def __getattr__ ( self
, name
):
46 return self
.__ getitem
__ ( name
)
48 def is_downloaded ( self
):
49 raise ( "NotImplemented" )
51 def scrape_player_page ( video
):
53 Try to scrape the site for video and download.
55 if not video
[ 'url' ]. startswith ( 'http' ):
56 video
[ 'url' ] = "http://www.svtplay.se" + video
[ 'url' ]
57 soup
= BeautifulSoup ( requests
. get ( video
[ 'url' ]). text
)
58 video_player
= soup
. body ( 'a' ,{ 'data-json-href' : True })[ 0 ]
59 if 'oppetarkiv.se' in video
[ 'url' ]:
60 flashvars
= requests
. get ( "http://www.oppetarkiv.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
62 if video_player
. attrs
[ 'data-json-href' ]. startswith ( "/wd" ):
63 flashvars
= requests
. get ( "http://www.svt.se/ %s " % video_player
. attrs
[ 'data-json-href' ]). json ()
65 flashvars
= requests
. get ( "http://www.svtplay.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
66 video
[ 'duration' ] = video_player
. attrs
. get ( 'data-length' , 0 )
67 if not 'title' in video
:
68 video
[ 'title' ] = soup
. find ( 'meta' ,{ 'property' : 'og:title' }). attrs
[ 'content' ]. replace ( '|' , '_' ). replace ( '/' , '_' )
69 if not 'genre' in video
:
70 if soup
. find ( text
= 'Kategori:' ):
71 video
[ 'genre' ] = soup
. find ( text
= 'Kategori:' ). parent
. parent
. a
. text
73 video
[ 'genre' ] = 'Ingen Genre'
74 if 'dynamicStreams' in flashvars
:
75 video
[ 'url' ] = flashvars
[ 'dynamicStreams' ][ 0 ]. split ( 'url:' )[ 1 ]. split ( '.mp4,' )[ 0 ] + '.mp4'
76 filename
= Path ( video
[ 'title' ]). with_suffix ( ".mp4" )
77 print ( Popen ([ "rtmpdump" , "-o" + filename
, "-r" , url
], stdout
= PIPE
). communicate ()[ 0 ])
78 if 'pathflv' in flashvars
:
79 rtmp
= flashvars
[ 'pathflv' ][ 0 ]
80 filename
= Path ( video
[ 'title' ]). with_suffix ( ".flv" )
81 print ( Popen ([ "mplayer" , "-dumpstream" , "-dumpfile" , filename
, rtmp
], stdout
= PIPE
). communicate ()[ 0 ])
82 if not 'timestamp' in video
:
83 if soup
. find_all ( datetime
= True ):
84 xmldate_str
= soup
. find_all ( datetime
= True )[ 0 ]. attrs
[ 'datetime' ]
86 video
[ 'timestamp' ] = datetime (* feedparser
._ parse
_ date
_ w
3 dtf
( xmldate_str
)[: 6 ]) #naive in utc
87 video
[ 'timestamp' ] = video
[ 'timestamp' ]. replace ( tzinfo
= timezone
. utc
). astimezone ( tz
= None ) #convert to local time
88 if 'video' in flashvars
:
89 for reference
in flashvars
[ 'video' ][ 'videoReferences' ]:
90 if 'm3u8' in reference
[ 'url' ]:
91 video
[ 'url' ]= reference
[ 'url' ]
92 video
[ 'filename' ] = Path ( video
[ 'title' ]). with_suffix ( '.ts' )
93 if 'statistics' in flashvars
:
94 video
[ 'category' ] = flashvars
[ 'statistics' ][ 'category' ]
95 if not download_from_playlist ( video
):
97 if not 'url' in video
:
98 print ( "Could not find any streams" )
102 def download_from_playlist ( video
):
103 params
= requests
. utils
. urlparse ( video
[ 'url' ]). query
105 if 'cc1=' in params
: #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
106 video
[ 'subs' ] = [ dict ([ k
. split ( '=' ) for k
in params
. split ( 'cc1=' )[ 1 ]. split ( '~' )])] #make a dict from the paramstring
108 req
= requests
. get ( video
[ 'url' ]). text
110 print ( "Error reading, skipping file" )
111 print ( sys
. exc_info ()[ 1 ])
115 segments
= [ item
for item
in requests
. get ( video
[ 'subs' ][ 0 ][ 'uri' ]). text
. split ( ' \n ' ) if 'vtt' in item
]
117 print ( "Error reading, skipping subtitle" )
118 print ( sys
. exc_info ()[ 1 ])
119 segments
= [] #ugly FIXME
120 video
[ 'subs' ][ 0 ][ 'download' ] = []
121 for segment
in segments
:
122 if not segment
. startswith ( 'http' ):
123 segment
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'subs' ][ 0 ][ 'uri' ]), segment
)
125 video
[ 'subs' ][ 0 ][ 'download' ]. append ( requests
. get ( segment
). text
)
127 print ( "Error reading, skipping subtitle" )
128 print ( sys
. exc_info ()[ 1 ])
130 playlist
= parse_playlist ( req
)
133 videourl
= sorted ( playlist
, key
= lambda k
: int ( k
[ 'BANDWIDTH' ]))[- 1 ][ 'url' ]
134 if not videourl
. startswith ( 'http' ): #if relative path
135 videourl
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'url' ]), videourl
)
136 segments
, metadata
= parse_segment_playlist ( videourl
)
137 if "EXT-X-KEY" in metadata
:
139 key
= requests
. get ( metadata
[ "EXT-X-KEY" ][ 'URI' ]. strip ( '"' )). text
141 print ( "Error reading, skipping file" )
142 print ( sys
. exc_info ()[ 1 ])
147 with video
[ 'filename' ]. open ( "wb" ) as ofile
:
152 ufile
= requests
. get ( url
, stream
= True ). raw
154 print ( "Error reading, skipping file" )
155 print ( sys
. exc_info ()[ 1 ])
157 print ( " \r {0:.2f} MB" . format ( size
/ 1024 / 1024 ), end
= "" )
160 iv
= struct
. pack ( "IIII" , segment
, 0 , 0 , 0 )
162 decryptor
= AES
. new ( key
, AES
. MODE_CBC
, iv
) #ValueError: AES key must be either 16, 24, or 32 bytes long
163 except ( ValueError ) as e
:
164 print ( "Error using decryption key. Skipping" )
169 buf
= ufile
. read ( 4096 )
171 print ( "Error reading, skipping file" ) #FIXME mark file as failed
172 print ( sys
. exc_info ()[ 1 ])
177 buf
= decryptor
. decrypt ( buf
)
182 if 'thumb-url' in video
:
184 video
[ 'thumb' ] = requests
. get ( video
[ 'thumb-url' ], stream
= True ). raw
186 print ( "Error reading thumbnail" ) #FIXME mark file as failed
187 print ( sys
. exc_info ()[ 1 ])
191 def parse_playlist ( playlist
):
192 if not playlist
. startswith ( "#EXTM3U" ):
195 playlist
= playlist
. splitlines ()
196 while not 'EXT-X-STREAM-INF' in playlist
[ 0 ]:
197 playlist
= playlist
[ 1 :]
199 for ( metadata_string
, url
) in zip ( playlist
[ 0 :: 2 ], playlist
[ 1 :: 2 ]):
201 if not 'EXT-X-STREAM-INF' in metadata_string
. split ( ':' )[ 0 ]:
203 for item
in metadata_string
. split ( ':' )[ 1 ]. split ( ',' ):
205 md
. update ([ item
. split ( '=' ),])
210 def parse_segment_playlist ( playlisturl
):
211 playlist
= requests
. get ( playlisturl
). text
212 assert playlist
. startswith ( "#EXTM3U" )
213 PATTERN
= re
. compile ( r
'''((?:[^,"']|"[^"]*"|'[^']*')+)''' )
217 for row
in playlist
. splitlines ():
219 if not row
. startswith ( 'http' ): #if relative path
220 row
= "{}/{}" . format ( os
. path
. dirname ( playlisturl
), row
)
226 if "EXT-X-KEY" in row
:
227 row
= row
. split ( ':' , 1 )[ 1 ] #skip first part
228 parts
= PATTERN
. split ( row
)[ 1 :- 1 ] #do magic re split and keep quotes
229 metadata
[ "EXT-X-KEY" ] = dict ([ part
. split ( '=' , 1 ) for part
in parts
if '=' in part
]) #throw away the commas and make dict of the pairs
230 return ( segments
, metadata
)
232 def parse_videolist ():
234 soup
= BeautifulSoup ( requests
. get ( "http://www.svtplay.se/ajax/videospager" ). text
) #this call does not work for getting the pages, we use it for the page totals only
235 page_tot
= int ( soup
. find ( 'a' ,{ 'data-currentpage' : True }). attrs
[ 'data-lastpage' ])
238 while ( page_num
<= page_tot
):
239 base_url
= "http://www.svtplay.se/ajax/videos?sida={}" . format ( page_num
)
240 soup
= BeautifulSoup ( requests
. get ( base_url
). text
)
241 for article
in soup
. findAll ( 'article' ):
242 meta
= dict ( article
. attrs
)
244 video
[ 'title' ] = meta
[ 'data-title' ]
245 video
[ 'description' ] = meta
[ 'data-description' ]
246 video
[ 'url' ] = dict ( article
. find ( 'a' ). attrs
)[ 'href' ]
247 video
[ 'thumb-url' ] = dict ( article
. find ( 'img' ,{}). attrs
)[ 'src' ]
248 video
[ 'num' ] = video_num
249 video
[ 'total' ] = page_tot
* videos_per_page
254 def remux ( video
, xml
= None ):
256 if not os
. path
. exists ( video
[ 'genre' ]):
257 os
. mkdir ( video
[ 'genre' ])
258 video
[ 'path' ] = Path ( video
[ 'genre' ] / video
[ 'filename' ]). with_suffix ( '.mkv' )
260 video
[ 'path' ] = video
[ 'filename' ]. with_suffix ( '.mkv' )
261 command
= [ "mkvmerge" , "-o" , str ( video
[ 'path' ]), '--title' , video
[ 'title' ]]
264 with video
[ 'filename' ]. with_suffix ( '.xml' ). open ( 'w' ) as f
:
266 command
. extend ([ '--global-tags' , str ( video
[ 'filename' ]. with_suffix ( '.xml' ))])
268 with
open ( 'thumbnail.jpg' , 'wb' ) as f
: #FIXME use title instead for many downloaders
269 f
. write ( video
[ 'thumb' ]. read ())
270 command
. extend ([ '--attachment-description' , "Thumbnail" ,
271 '--attachment-mime-type' , 'image/jpeg' ,
272 '--attach-file' , 'thumbnail.jpg' ])
273 # if 'subs' in video:
274 # for sub in video['subs']:
275 # if 'download' in sub:
276 # with open("{}.vtt".format(sub['lang']),'wb') as f:
277 # f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
278 # command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
280 command
. append ( str ( video
[ 'filename' ]))
281 print ( Popen ( command
, stdout
= PIPE
). communicate ()[ 0 ])
282 for fname
in ( video
[ 'filename' ], video
[ 'filename' ]. with_suffix ( '.xml' ), Path ( 'thumbnail.jpg' )):
287 if 'timestamp' in video
:
289 os
. utime ( str ( video
[ 'path' ]), times
=( video
[ 'timestamp' ]. timestamp (), video
[ 'timestamp' ]. timestamp ()))
290 except FileNotFoundError
as e
:
294 def mkv_metadata ( video
):
295 root
= BeautifulSoup ( features
= 'xml' )
296 root
. append ( Doctype ( 'Tags SYSTEM "matroskatags.dtd"' ))
297 tags
= root
. new_tag ( "Tags" )
298 tag
= root
. new_tag ( "Tag" )
301 keep
= ( 'title' , 'description' , 'url' , 'genre' )
302 targets
= root
. new_tag ( "Targets" )
303 ttv
= root
. new_tag ( "TargetTypeValue" )
310 simple
= root
. new_tag ( 'Simple' )
311 name
= root
. new_tag ( 'Name' )
312 name
. string
= key
. upper ()
314 sstring
= root
. new_tag ( 'String' )
315 sstring
. string
= video
[ key
]
316 simple
. append ( sstring
)
320 if __name__
== "__main__" :
321 parser
= argparse
. ArgumentParser ()
322 group
= parser
. add_mutually_exclusive_group ( required
= True )
323 group
. add_argument ( "-r" , "--rss" , help = "Download all files in rss" )
324 group
. add_argument ( "-u" , "--url" , help = "Download video in url" )
325 group
. add_argument ( "-m" , "--mirror" , help = "Mirror all files" , action
= "store_true" )
326 parser
. add_argument ( "-n" , "--no_act" , help = "Just print what would be done, don't do any downloading." , action
= "store_true" )
327 parser
. add_argument ( "--no_remux" , help = "Don't remux into mkv" , action
= "store_true" )
329 args
= parser
. parse_args ()
331 d
= feedparser
. parse ( args
. rss
)
333 print (( "Downloading: %s " %e. title
))
336 video
= scrape_player_page ({ 'title' : e
. title
, 'url' : e
. link
})
340 #print(e.description)
342 if not os
. path
. exists ( '.seen' ):
344 for video
in parse_videolist ():
345 video
[ 'title' ] = video
[ 'title' ]. replace ( '/' , '_' )
346 print ( video
[ 'title' ]+ '.mkv' )
347 print ( "{} of {}" . format ( video
[ 'num' ], video
[ 'total' ]))
349 if os
. path
. exists ( os
. path
. join ( '.seen' , video
[ 'title' ])):
352 print ( "Downloading..." )
355 open ( os
. path
. join ( '.seen' , video
[ 'title' ]), 'w' ). close () #touch
356 ret
= scrape_player_page ( video
)
358 if not os
. path
. exists ( '.failed' ):
360 open ( os
. path
. join ( '.failed' , video
[ 'title' ]), 'w' ). close () #touch
365 xml
= mkv_metadata ( video
)
370 video
= scrape_player_page ({ 'url' : args
. url
})
371 if not args
. no_remux
:
373 print (( "Downloaded {}" . format ( args
. url
)))