]> git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py
Some more robustness fixes
[svtplaydump.git] / svtplaydump.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 #
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
15 #
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
18 #
19 # Changelog:
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
23 # 0.1 initial release
24
25 from bs4 import BeautifulSoup, Doctype
26 from subprocess import *
27 import re
28 from Crypto.Cipher import AES
29 import struct
30 import argparse
31 import requests
32 import sys, os
33 import socket
34 import feedparser
35 from datetime import datetime, timezone
class Video(dict):
    """A dict of video metadata that also allows attribute-style access.

    ``v.title`` is equivalent to ``v['title']`` for both reads and writes.
    """

    def __init__(self, *args, **kwargs):
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __setattr__(self, name, value):
        return self.__setitem__(name, value)

    def __getattr__(self, name):
        # NOTE: raises KeyError (not AttributeError) for missing keys,
        # matching plain dict indexing.
        return self.__getitem__(name)

    def is_downloaded(self):
        # Bug fix: the original `raise("NotImplemented")` raised a
        # TypeError in Python 3 (strings are not exceptions); raise the
        # proper built-in instead.
        raise NotImplementedError
48
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like with at least 'url' (absolute, or a path on
    svtplay.se).  Fills in title, genre, duration, stream url etc. and
    triggers the download.  Returns the enriched video dict on success,
    or False when no stream was found or the download failed.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    if 'oppetarkiv.se' in video['url']:
        flashvars = requests.get("http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    else:
        if video_player.attrs['data-json-href'].startswith("/wd"):
            flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
        else:
            flashvars = requests.get("http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
    video['duration'] = video_player.attrs.get('data-length', 0)
    if not 'title' in video:
        video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
    if not 'genre' in video:
        if soup.find(text='Kategori:'):
            video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'
    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # Bug fix: the original passed an undefined name `url` to rtmpdump
        # (NameError at runtime); the stream URL lives in video['url'].
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
    if not 'timestamp' in video:
        if soup.find_all(datetime=True):
            xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
                if 'statistics' in flashvars:
                    video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if not 'url' in video:
        print("Could not find any streams")
        return False
    return video
98
def download_from_playlist(video):
    """Download video['url'] (an HLS master playlist) into video['filename'].

    Picks the highest-BANDWIDTH variant, streams every media segment to
    disk (decrypting AES-128-CBC streams when EXT-X-KEY metadata is
    present), optionally collects WebVTT subtitle fragments announced in
    the URL query string, and attaches a raw thumbnail stream.

    Returns True on success, False on any read/decrypt error, and None
    when the playlist could not be parsed at all.
    """
    # Query string may carry subtitle info, e.g.
    # cc1=name=Svenska~default=yes~...~uri=http://...m3u8~lang=sv
    params = requests.utils.urlparse(video['url']).query
    print(params)
    if 'cc1=' in params: #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
        video['subs'] = [dict([k.split('=') for k in params.split('cc1=')[1].split('~')])] #make a dict from the paramstring
    try:
        req = requests.get(video['url']).text
    except:
        print("Error reading, skipping file")
        print(sys.exc_info()[1])
        return False
    if 'subs' in video:
        # The subtitle uri points at its own playlist; keep only the
        # .vtt fragment lines from it.
        try:
            segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
        except:
            print("Error reading, skipping subtitle")
            print(sys.exc_info()[1])
            segments = [] #ugly FIXME
        video['subs'][0]['download'] = []
        for segment in segments:
            if not segment.startswith('http'):
                # Relative fragment path: resolve against the subtitle playlist.
                segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
            try:
                video['subs'][0]['download'].append(requests.get(segment).text)
            except:
                print("Error reading, skipping subtitle")
                print(sys.exc_info()[1])
                break
    playlist = parse_playlist(req)
    if not playlist:
        return
    # Choose the highest-bandwidth variant stream.
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'): #if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)
    if "EXT-X-KEY" in metadata:
        try:
            key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
        except:
            print("Error reading, skipping file")
            print(sys.exc_info()[1])
            return False
        # NOTE(review): `key` is a str here; PyCrypto's AES.new expects
        # bytes on Python 3 — confirm this path still works.
        decrypt=True
    else:
        decrypt=False
    with open("%s"%video['filename'],"wb") as ofile:
        segment=0
        size = 0
        for url in segments:
            try:
                ufile = requests.get(url, stream=True).raw
            except:
                print("Error reading, skipping file")
                print(sys.exc_info()[1])
                return False
            # Progress indicator: rewrite the same console line.
            print("\r{0:.2f} MB".format(size/1024/1024),end="")
            sys.stdout.flush()
            if decrypt:
                # NOTE(review): IV is the native-endian packing of the
                # segment index in the first word; RFC 8216 specifies a
                # big-endian 128-bit sequence number — works for these
                # streams, but verify before reusing elsewhere.
                iv=struct.pack("IIII",segment,0,0,0)
                try:
                    decryptor = AES.new(key, AES.MODE_CBC, iv) #ValueError: AES key must be either 16, 24, or 32 bytes long
                except(ValueError) as e:
                    print("Error using decryption key. Skipping")
                    print(e)
                    return False
            while(True):
                try:
                    buf = ufile.read(4096)
                except:
                    print("Error reading, skipping file") #FIXME mark file as failed
                    print(sys.exc_info()[1])
                    return False
                if not buf:
                    break
                if decrypt:
                    buf = decryptor.decrypt(buf)
                ofile.write(buf)
                size += len(buf)
            segment += 1

    if 'thumb-url' in video:
        try:
            # Keep the raw stream; remux() reads and embeds it later.
            video['thumb'] = requests.get(video['thumb-url'],stream=True).raw
        except:
            print("Error reading thumbnail") #FIXME mark file as failed
            print(sys.exc_info()[1])

    return True
187
def parse_playlist(playlist):
    """Parse an HLS master (variant) playlist.

    ``playlist`` is the playlist text.  Returns a list of Video dicts,
    one per EXT-X-STREAM-INF entry, holding the stream attributes (e.g.
    'BANDWIDTH') plus 'url'.  Returns False when the text is not an M3U
    playlist or contains no variant entries.
    """
    if not playlist.startswith("#EXTM3U"):
        print(playlist)
        return False
    lines = playlist.splitlines()
    # Skip header lines up to the first variant entry.  Bug fix: the
    # original while-loop raised IndexError when the playlist contained
    # no EXT-X-STREAM-INF line at all; bail out cleanly instead.
    while lines and 'EXT-X-STREAM-INF' not in lines[0]:
        lines = lines[1:]
    if not lines:
        return False
    items = []
    # Variant playlists alternate metadata line / URL line.
    for (metadata_string, url) in zip(lines[0::2], lines[1::2]):
        md = Video()
        if 'EXT-X-STREAM-INF' not in metadata_string.split(':')[0]:
            continue
        for item in metadata_string.split(':')[1].split(','):
            if '=' in item:
                # Bug fix: split only on the first '=' so attribute
                # values containing '=' don't blow up dict.update.
                md.update([item.split('=', 1)])
        md['url'] = url
        items.append(md)
    return items
206
def parse_segment_playlist(playlisturl):
    """Fetch an HLS media playlist and return (segment_urls, metadata).

    ``segment_urls`` is the list of absolute media-segment URLs;
    ``metadata`` maps "EXT-X-KEY" to a dict of its attributes when the
    stream is encrypted.
    """
    text = requests.get(playlisturl).text
    assert text.startswith("#EXTM3U")
    # Splits on commas but keeps quoted substrings intact.
    quoted_split = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    base = os.path.dirname(playlisturl)
    segments = []
    metadata = {}
    expect_url = False
    for line in text.splitlines():
        if expect_url:
            # The line after EXTINF is the segment location; resolve
            # relative paths against the playlist URL.
            if not line.startswith('http'):
                line = "{}/{}".format(base, line)
            segments.append(line)
            expect_url = False
            continue
        if 'EXTINF' in line:
            expect_url = True
        if "EXT-X-KEY" in line:
            attrs = line.split(':', 1)[1]  # drop the tag name
            pieces = quoted_split.split(attrs)[1:-1]  # magic re split, quotes preserved
            # Discard the comma separators and build attribute pairs.
            metadata["EXT-X-KEY"] = dict([piece.split('=', 1) for piece in pieces if '=' in piece])
    return (segments, metadata)
228
def parse_videolist():
    """Yield a Video for every programme in svtplay's ajax listing.

    Each Video carries title, description, url, thumb-url, a running
    index 'num' and an estimated 'total' (pages * videos per page).
    """
    # This endpoint cannot be paged; it is queried once just to learn
    # the total number of pages.
    pager = BeautifulSoup(requests.get("http://www.svtplay.se/ajax/videospager").text)#this call does not work for getting the pages, we use it for the page totals only
    last_page = int(pager.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
    per_page = 8
    counter = 0
    page = 1
    while page <= last_page:
        listing_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page)
        listing = BeautifulSoup(requests.get(listing_url).text)
        for article in listing.findAll('article'):
            attrs = dict(article.attrs)
            video = Video()
            video['title'] = attrs['data-title']
            video['description'] = attrs['data-description']
            video['url'] = dict(article.find('a').attrs)['href']
            video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
            video['num'] = counter
            video['total'] = last_page * per_page
            counter += 1
            yield video
        page += 1
250
def remux(video, xml=None):
    """Remux the downloaded .ts file into a Matroska container.

    Writes <title>.mkv (under a genre subdirectory when video['genre']
    is set), embedding optional global tags (``xml``) and a thumbnail,
    then removes the intermediate files and stamps the mkv with the
    video's timestamp.
    """
    basename = video['filename'].split('.ts')[0]
    if 'genre' in video:
        if not os.path.exists(video['genre']):
            os.mkdir(video['genre'])
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    if xml:
        with open(basename + '.xml', 'w') as f:
            f.write(xml)
        command.extend(['--global-tags', basename + '.xml'])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    # if 'subs' in video:
    #     for sub in video['subs']:
    #         if 'download' in sub:
    #             with open("{}.vtt".format(sub['lang']),'wb') as f:
    #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
    #                 command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])

    command.append(video['filename'])
    print(Popen(command, stdout=PIPE).communicate()[0])
    # Best-effort cleanup of intermediates; some may not exist.
    # Bug fix: a bare `except:` also swallowed KeyboardInterrupt /
    # SystemExit — only ignore filesystem errors.
    for fname in (video['filename'], basename + '.xml', 'thumbnail.jpg'):
        try:
            os.unlink(fname)
        except OSError:
            pass
    if 'timestamp' in video:
        try:
            os.utime(video['path'], times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
        except FileNotFoundError as e:
            print(e)
291
292
def mkv_metadata(video):
    """Build a Matroska global-tags XML document from video metadata.

    Emits one <Simple> Name/String pair per key in ('title',
    'description', 'url', 'genre') that is present in ``video``.
    """
    doc = BeautifulSoup(features='xml')
    doc.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = doc.new_tag("Tags")
    tag = doc.new_tag("Tag")
    tags.append(tag)
    doc.append(tags)
    wanted = ('title', 'description', 'url', 'genre')
    targets = doc.new_tag("Targets")
    ttv = doc.new_tag("TargetTypeValue")
    ttv.string = str(50)  # Matroska target level (50 ≈ movie/episode)
    targets.append(ttv)
    tag.append(targets)
    for field in video:
        if field not in wanted:
            continue
        simple = doc.new_tag('Simple')
        name = doc.new_tag('Name')
        name.string = field.upper()
        simple.append(name)
        value = doc.new_tag('String')
        value.string = video[field]
        simple.append(value)
        tag.append(simple)
    return str(doc)
318
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s" % e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # Robustness: scrape_player_page returns False on failure;
            # don't hand that to remux().
            if not video or args.no_remux:
                continue
            # Bug fix: this is module-level script code, not a method —
            # the original `self.remux(video)` raised NameError.
            remux(video)
            # print(e.description)
    elif args.mirror:
        # Bug fix: the mirror/url branches were a separate `if ... else`,
        # so after an --rss run the else (URL) branch also executed with
        # args.url == None and crashed; chain the modes with elif.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title'] + '.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Remember failed titles so they are easy to inspect later.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)

    else:
        if not args.no_act:
            video = scrape_player_page({'url': args.url})
            # Robustness: only remux when the scrape actually succeeded.
            if video and not args.no_remux:
                remux(video)
        print(("Downloaded {}".format(args.url)))