svtplaydump.py

   1 #!/usr/bin/env python3.4
   2 # -*- coding: utf-8 -*-
   3 #
   4 #   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
   5 #
   6 #   This program is free software: you can redistribute it and/or modify
   7 #   it under the terms of the GNU General Public License as published by
   8 #   the Free Software Foundation, either version 3 of the License, or
   9 #   (at your option) any later version.
  10 #
  11 #   This program is distributed in the hope that it will be useful,
  12 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 #   GNU General Public License for more details.
  15 #
  16 #   You should have received a copy of the GNU General Public License
  17 #   along with this program.  If not, see <http://www.gnu.org/licenses/>
  18 #
  19 # Changelog:
  20 # 0.4 added mirror mode.
  21 # 0.3 added apple streaming playlist parsing and decryption
  22 # 0.2 added python 2.4 urlparse compatibility
  23 # 0.1 initial release
  24
  25 from bs4 import BeautifulSoup, Doctype
  26 from subprocess import *
  27 import re
  28 from Crypto.Cipher import AES
  29 import struct
  30 import argparse
  31 import requests
  32 import sys, os
  33 import feedparser
  34 from datetime import datetime, timezone
  35 from pathlib import Path
  36
  37
  38 class Video(dict):
  39     def __init__(self, *args, **kwargs):
  40         self.update(dict(*args, **kwargs))  # use the free update to set keys
  41
  42     def __setattr__(self, name, value):
  43         return self.__setitem__(name, value)
  44
  45     def __getattr__(self, name):
  46         return self.__getitem__(name)
  47
  48     def is_downloaded(self):
  49         raise ("NotImplemented")
  50
  51
  52 def scrape_player_page(video):
  53     """
  54     Try to scrape the site for video and download.
  55     """
  56     if not video['url'].startswith('http'):
  57         video['url'] = "http://www.svtplay.se" + video['url']
  58     soup = BeautifulSoup(requests.get(video['url']).text)
  59     video_player = soup.body('a', {'data-json-href': True})[0]
  60     if 'oppetarkiv.se' in video['url']:
  61         flashvars = requests.get(
  62                 "http://www.oppetarkiv.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
  63     else:
  64         if video_player.attrs['data-json-href'].startswith("/wd"):
  65             flashvars = requests.get("http://www.svt.se/%s" % video_player.attrs['data-json-href']).json()
  66         else:
  67             flashvars = requests.get(
  68                     "http://www.svtplay.se/%s" % video_player.attrs['data-json-href'] + "?output=json").json()
  69     video['duration'] = video_player.attrs.get('data-length', 0)
  70     if not 'title' in video:
  71         video['title'] = soup.find('meta', {'property': 'og:title'}).attrs['content'].replace('|', '_').replace('/', '_')
  72     if 'genre' not in video:
  73         if soup.find(text='Kategori:'):
  74             video['genre'] = soup.find(text='Kategori:').parent.parent.a.text
  75         else:
  76             video['genre'] = 'Ingen Genre'
  77     if 'dynamicStreams' in flashvars:
  78         video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
  79         filename = Path(video['title']).with_suffix(".mp4")
  80         print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
  81     if 'pathflv' in flashvars:
  82         rtmp = flashvars['pathflv'][0]
  83         filename = Path(video['title']).with_suffix(".flv")
  84         print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])
  85     if not 'timestamp' in video and soup.find_all(datetime=True):
  86         xmldate_str = soup.find_all(datetime=True)[0].attrs['datetime']
  87         if xmldate_str:
  88             video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
  89             video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time
  90     if 'video' in flashvars:
  91         for reference in flashvars['video']['videoReferences']:
  92             if 'm3u8' in reference['url']:
  93                 video['url'] = reference['url']
  94                 video['filename'] = Path(video['title']).with_suffix('.ts')
  95                 if 'statistics' in flashvars:
  96                     video['category'] = flashvars['statistics']['category']
  97         if not download_from_playlist(video):
  98             return False
  99     if 'url' not in video:
 100         print("Could not find any streams")
 101         return False
 102     return video
 103
 104
 105 def download_from_playlist(video):
 106     params = requests.utils.urlparse(video['url']).query
 107     print(params)
 108     if 'cc1=' in params:  # 'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
 109         video['subs'] = [
 110             dict([k.split('=') for k in params.split('cc1=')[1].split('~')])]  # make a dict from the paramstring
 111     try:
 112         req = requests.get(video['url']).text
 113     except:
 114         print("Error reading, skipping file")
 115         print(sys.exc_info()[1])
 116         return False
 117     if 'subs' in video:
 118         try:
 119             segments = [item for item in requests.get(video['subs'][0]['uri']).text.split('\n') if 'vtt' in item]
 120         except:
 121             print("Error reading, skipping subtitle")
 122             print(sys.exc_info()[1])
 123             segments = []  # ugly FIXME
 124         video['subs'][0]['download'] = []
 125         for segment in segments:
 126             if not segment.startswith('http'):
 127                 segment = "{}/{}".format(os.path.dirname(video['subs'][0]['uri']), segment)
 128             try:
 129                 video['subs'][0]['download'].append(requests.get(segment).text)
 130             except:
 131                 print("Error reading, skipping subtitle")
 132                 print(sys.exc_info()[1])
 133                 break
 134     playlist = parse_playlist(req)
 135     if not playlist:
 136         return
 137     videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
 138     if not videourl.startswith('http'):  # if relative path
 139         videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
 140     segments, metadata = parse_segment_playlist(videourl)
 141     if "EXT-X-KEY" in metadata:
 142         try:
 143             key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).text
 144         except:
 145             print("Error reading, skipping file")
 146             print(sys.exc_info()[1])
 147             return False
 148         decrypt = True
 149     else:
 150         decrypt = False
 151     with video['filename'].open("wb") as ofile:
 152         segment = 0
 153         size = 0
 154         for url in segments:
 155             try:
 156                 ufile = requests.get(url, stream=True).raw
 157             except:
 158                 print("Error reading, skipping file")
 159                 print(sys.exc_info()[1])
 160                 return False
 161             print("\r{0:.2f} MB".format(size / 1024 / 1024), end="")
 162             sys.stdout.flush()
 163             if decrypt:
 164                 iv = struct.pack("IIII", segment, 0, 0, 0)
 165                 try:
 166                     decryptor = AES.new(key, AES.MODE_CBC,
 167                                         iv)  # ValueError: AES key must be either 16, 24, or 32 bytes long
 168                 except ValueError as e:
 169                     print("Error using decryption key. Skipping")
 170                     print(e)
 171                     return False
 172             while True:
 173                 try:
 174                     buf = ufile.read(4096)
 175                 except:
 176                     print("Error reading, skipping file")
 177                     print(sys.exc_info()[1])
 178                     return False
 179                 if not buf:
 180                     break
 181                 if decrypt:
 182                     buf = decryptor.decrypt(buf)
 183                 ofile.write(buf)
 184                 size += len(buf)
 185             segment += 1
 186
 187     if 'thumb-url' in video:
 188         try:
 189             video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
 190         except:
 191             print("Error reading thumbnail")  # FIXME mark file as failed
 192             print(sys.exc_info()[1])
 193
 194     return True
 195
 196
 197 def parse_playlist(playlist):
 198     if not playlist.startswith("#EXTM3U"):
 199         print(playlist)
 200         return False
 201     playlist = playlist.splitlines()
 202     while not 'EXT-X-STREAM-INF' in playlist[0]:
 203         playlist = playlist[1:]
 204     items = []
 205     for (metadata_string, url) in zip(playlist[0::2], playlist[1::2]):
 206         md = Video()
 207         if not 'EXT-X-STREAM-INF' in metadata_string.split(':')[0]:
 208             continue
 209         for item in metadata_string.split(':')[1].split(','):
 210             if '=' in item:
 211                 md.update([item.split('='), ])
 212         md['url'] = url
 213         items.append(md)
 214     return items
 215
 216
 217 def parse_segment_playlist(playlisturl):
 218     playlist = requests.get(playlisturl).text
 219     assert playlist.startswith("#EXTM3U")
 220     PATTERN = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
 221     segments = []
 222     next_is_url = False
 223     metadata = {}
 224     for row in playlist.splitlines():
 225         if next_is_url:
 226             if not row.startswith('http'):  # if relative path
 227                 row = "{}/{}".format(os.path.dirname(playlisturl), row)
 228             segments.append(row)
 229             next_is_url = False
 230             continue
 231         if 'EXTINF' in row:
 232             next_is_url = True
 233         if "EXT-X-KEY" in row:
 234             row = row.split(':', 1)[1]  # skip first part
 235             parts = PATTERN.split(row)[1:-1]  # do magic re split and keep quotes
 236             metadata["EXT-X-KEY"] = dict([part.split('=', 1) for part in parts if
 237                                           '=' in part])  # throw away the commas and make dict of the pairs
 238     return segments, metadata
 239
 240
 241 def parse_videolist():
 242     page_num = 1
 243     soup = BeautifulSoup(requests.get(
 244             "http://www.svtplay.se/ajax/videospager").text)  # this call does not work for getting the pages, we use it for the page totals only
 245     page_tot = int(soup.find('a', {'data-currentpage': True}).attrs['data-lastpage'])
 246     videos_per_page = 8
 247     video_num = 0
 248     while page_num <= page_tot:
 249         base_url = "http://www.svtplay.se/ajax/videos?sida={}".format(page_num)
 250         soup = BeautifulSoup(requests.get(base_url).text)
 251         for article in soup.findAll('article'):
 252             meta = dict(article.attrs)
 253             video = Video()
 254             video['title'] = meta['data-title']
 255             video['description'] = meta['data-description']
 256             video['url'] = dict(article.find('a').attrs)['href']
 257             video['thumb-url'] = dict(article.find('img', {}).attrs)['src']
 258             video['num'] = video_num
 259             video['total'] = page_tot * videos_per_page
 260             video_num += 1
 261             yield video
 262         page_num += 1
 263
 264
 265 def remux(video, xml=None):
 266     if 'genre' in video:
 267         if not os.path.exists(video['genre']):
 268             os.mkdir(video['genre'])
 269         video['path'] = Path(video['genre'] / video['filename']).with_suffix('.mkv')
 270     else:
 271         video['path'] = video['filename'].with_suffix('.mkv')
 272     command = ["mkvmerge", "-o", str(video['path']), '--title', video['title']]
 273
 274     if xml:
 275         with video['filename'].with_suffix('.xml').open('w') as f:
 276             f.write(xml)
 277             command.extend(['--global-tags', str(video['filename'].with_suffix('.xml'))])
 278     if 'thumb' in video:
 279         with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
 280             f.write(video['thumb'].read())
 281             command.extend(['--attachment-description', "Thumbnail",
 282                             '--attachment-mime-type', 'image/jpeg',
 283                             '--attach-file', 'thumbnail.jpg'])
 284     # if 'subs' in video:
 285     #     for sub in video['subs']:
 286     #         if 'download' in sub:
 287     #             with open("{}.vtt".format(sub['lang']),'wb') as f:
 288     #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
 289     #                 command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
 290
 291     command.append(str(video['filename']))
 292     print(Popen(command, stdout=PIPE).communicate()[0])
 293     for fname in (video['filename'], video['filename'].with_suffix('.xml'), Path('thumbnail.jpg')):
 294         try:
 295             fname.unlink()
 296         except:
 297             pass
 298     if 'timestamp' in video:
 299         try:
 300             os.utime(str(video['path']), times=(video['timestamp'].timestamp(), video['timestamp'].timestamp()))
 301         except FileNotFoundError as e:
 302             print(e)
 303
 304
 305 def mkv_metadata(video):
 306     root = BeautifulSoup(features='xml')
 307     root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
 308     tags = root.new_tag("Tags")
 309     tag = root.new_tag("Tag")
 310     tags.append(tag)
 311     root.append(tags)
 312     keep = ('title', 'description', 'url', 'genre')
 313     targets = root.new_tag("Targets")
 314     ttv = root.new_tag("TargetTypeValue")
 315     ttv.string = str(50)
 316     targets.append(ttv)
 317     tag.append(targets)
 318     for key in video:
 319         if not key in keep:
 320             continue
 321         simple = root.new_tag('Simple')
 322         name = root.new_tag('Name')
 323         name.string = key.upper()
 324         simple.append(name)
 325         sstring = root.new_tag('String')
 326         sstring.string = video[key]
 327         simple.append(sstring)
 328         tag.append(simple)
 329     return str(root)
 330
 331
 332 if __name__ == "__main__":
 333     parser = argparse.ArgumentParser()
 334     group = parser.add_mutually_exclusive_group(required=True)
 335     group.add_argument("-r", "--rss", help="Download all files in rss")
 336     group.add_argument("-u", "--url", help="Download video in url")
 337     group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
 338     parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.",
 339                         action="store_true")
 340     parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")
 341
 342     args = parser.parse_args()
 343     if args.rss:
 344         d = feedparser.parse(args.rss)
 345         for e in d.entries:
 346             print(("Downloading: %s" % e.title))
 347             if args.no_act:
 348                 continue
 349             video = scrape_player_page({'title': e.title, 'url': e.link})
 350             if args.no_remux:
 351                 continue
 352             remux(video)
 353             # print(e.description)
 354     if args.mirror:
 355         if not os.path.exists('.seen'):
 356             os.mkdir('.seen')
 357         for video in parse_videolist():
 358             video['title'] = video['title'].replace('/', '_')
 359             print(video['title'] + '.mkv')
 360             print("{} of {}".format(video['num'], video['total']))
 361
 362             if os.path.exists(os.path.join('.seen', video['title'])):
 363                 print("Skipping")
 364                 continue
 365             print("Downloading...")
 366             if args.no_act:
 367                 continue
 368             open(os.path.join('.seen', video['title']), 'w').close()  # touch
 369             ret = scrape_player_page(video)
 370             if not ret:
 371                 if not os.path.exists('.failed'):
 372                     os.mkdir('.failed')
 373                 open(os.path.join('.failed', video['title']), 'w').close()  # touch
 374                 continue
 375             video = ret
 376             if args.no_remux:
 377                 continue
 378             xml = mkv_metadata(video)
 379             remux(video, xml)
 380
 381     else:
 382         if not args.no_act:
 383             video = scrape_player_page({'url': args.url})
 384         if not args.no_remux:
 385             remux(video)
 386         print(("Downloaded {}".format(args.url)))