]>
git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py 
   2  # -*- coding: utf-8 -*-    4  #   (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>    6  #   This program is free software: you can redistribute it and/or modify    7  #   it under the terms of the GNU General Public License as published by    8  #   the Free Software Foundation, either version 3 of the License, or    9  #   (at your option) any later version.   11  #   This program is distributed in the hope that it will be useful,   12  #   but WITHOUT ANY WARRANTY; without even the implied warranty of   13  #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   14  #   GNU General Public License for more details.   16  #   You should have received a copy of the GNU General Public License   17  #   along with this program.  If not, see <http://www.gnu.org/licenses/>   20  # 0.4 added mirror mode.   21  # 0.3 added apple streaming playlist parsing and decryption   22  # 0.2 added python 2.4 urlparse compatibility   25  from  bs4 
import  BeautifulSoup
,  Doctype
  26  from  subprocess 
import  *   28  from  Crypto
. Cipher 
import  AES
  35  from  datetime 
import  datetime
,  timezone
  37      def  __init__ ( self
, * args
, ** kwargs
):   38          self
. update ( dict (* args
, ** kwargs
))   # use the free update to set keys   40      def  __setattr__ ( self
,  name
,  value
):   41          return  self
.__ setitem
__ ( name
, value
)   43      def  __getattr__ ( self
,  name
):   44          return  self
.__ getitem
__ ( name
)   46      def  is_downloaded ( self
):   47          raise ( "NotImplemented" )   49  def  scrape_player_page ( video
):   51      Try to scrape the site for video and download.    53      if not  video
[ 'url' ]. startswith ( 'http' ):   54          video
[ 'url' ] =  "http://www.svtplay.se"  +  video
[ 'url' ]   55      soup 
=  BeautifulSoup ( requests
. get ( video
[ 'url' ]). text
)   56      video_player 
=  soup
. body ( 'a' ,{ 'data-json-href' : True })[ 0 ]   57      if  'oppetarkiv.se'  in  video
[ 'url' ]:   58          flashvars 
=  requests
. get ( "http://www.oppetarkiv.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()   60          if  video_player
. attrs
[ 'data-json-href' ]. startswith ( "/wd" ):   61              flashvars 
=  requests
. get ( "http://www.svt.se/ %s " % video_player
. attrs
[ 'data-json-href' ]). json ()   63              flashvars 
=  requests
. get ( "http://www.svtplay.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()   64      video
[ 'duration' ] =  video_player
. attrs
. get ( 'data-length' , 0 )   65      if not  'title'  in  video
:   66          video
[ 'title' ] =  soup
. find ( 'meta' ,{ 'property' : 'og:title' }). attrs
[ 'content' ]. replace ( '|' , '_' ). replace ( '/' , '_' )   67      if not  'genre'  in  video
:   68          if  soup
. find ( text
= 'Kategori:' ):   69              video
[ 'genre' ] =  soup
. find ( text
= 'Kategori:' ). parent
. parent
. a
. text
  71              video
[ 'genre' ] =  'Ingen Genre'    72      if  'dynamicStreams'  in  flashvars
:   73          video
[ 'url' ] =  flashvars
[ 'dynamicStreams' ][ 0 ]. split ( 'url:' )[ 1 ]. split ( '.mp4,' )[ 0 ] + '.mp4'   74          filename 
=  video
[ 'title' ]+ ".mp4"   75          print ( Popen ([ "rtmpdump" , "-o" + filename
, "-r" ,  url
],  stdout
= PIPE
). communicate ()[ 0 ])   76      if  'pathflv'  in  flashvars
:   77          rtmp 
=  flashvars
[ 'pathflv' ][ 0 ]   78          filename 
=  video
[ 'title' ]+ ".flv"   79          print ( Popen ([ "mplayer" , "-dumpstream" , "-dumpfile" , filename
,  rtmp
],  stdout
= PIPE
). communicate ()[ 0 ])   80      if not  'timestamp'  in  video
:   81          if  soup
. find_all ( datetime
= True ):   82              xmldate_str 
=  soup
. find_all ( datetime
= True )[ 0 ]. attrs
[ 'datetime' ]   83              video
[ 'timestamp' ] =  datetime (* feedparser
._ parse
_ date
_ w
3 dtf
( xmldate_str
)[: 6 ])  #naive in utc   84              video
[ 'timestamp' ] =  video
[ 'timestamp' ]. replace ( tzinfo
= timezone
. utc
). astimezone ( tz
= None )  #convert to local time   85      if  'video'  in  flashvars
:   86          for  reference 
in  flashvars
[ 'video' ][ 'videoReferences' ]:   87              if  'm3u8'  in  reference
[ 'url' ]:   88                  video
[ 'url' ]= reference
[ 'url' ]   89                  video
[ 'filename' ] =  video
[ 'title' ]+ '.ts'   90                  if  'statistics'  in  flashvars
:   91                      video
[ 'category' ] =  flashvars
[ 'statistics' ][ 'category' ]   92          if not  download_from_playlist ( video
):   94      if not  'url'  in  video
:   95          print ( "Could not find any streams" )   99  def  download_from_playlist ( video
):  100      params 
=  requests
. utils
. urlparse ( video
[ 'url' ]). query
 102      if  'cc1='  in  params
:   #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'  103          video
[ 'subs' ] = [ dict ([ k
. split ( '=' )  for  k 
in  params
. split ( 'cc1=' )[ 1 ]. split ( '~' )])]  #make a dict from the paramstring  105          req 
=  requests
. get ( video
[ 'url' ]). text
 107          print ( "Error reading, skipping file" )   108          print ( sys
. exc_info ()[ 1 ])  112              segments 
= [ item 
for  item 
in  requests
. get ( video
[ 'subs' ][ 0 ][ 'uri' ]). text
. split ( ' \n ' )  if  'vtt'  in  item
]  114              print ( "Error reading, skipping subtitle" )   115              print ( sys
. exc_info ()[ 1 ])  116              segments 
= []  #ugly FIXME  117          video
[ 'subs' ][ 0 ][ 'download' ] = []  118          for  segment 
in  segments
:  119              if not  segment
. startswith ( 'http' ):  120                  segment 
=  "{}/{}" . format ( os
. path
. dirname ( video
[ 'subs' ][ 0 ][ 'uri' ]),  segment
)  122                  video
[ 'subs' ][ 0 ][ 'download' ]. append ( requests
. get ( segment
). text
)  124                  print ( "Error reading, skipping subtitle" )   125                  print ( sys
. exc_info ()[ 1 ])  127      playlist 
=  parse_playlist ( req
)  130      videourl 
=  sorted ( playlist
,  key
= lambda  k
:  int ( k
[ 'BANDWIDTH' ]))[- 1 ][ 'url' ]  131      if not  videourl
. startswith ( 'http' ):  #if relative path  132          videourl 
=  "{}/{}" . format ( os
. path
. dirname ( video
[ 'url' ]),  videourl
)   133      segments
,  metadata 
=  parse_segment_playlist ( videourl
)  134      if  "EXT-X-KEY"  in  metadata
:  136              key 
=  requests
. get ( metadata
[ "EXT-X-KEY" ][ 'URI' ]. strip ( '"' )). text
 138              print ( "Error reading, skipping file" )   139              print ( sys
. exc_info ()[ 1 ])  144      with 
open ( " %s " % video
[ 'filename' ], "wb" )  as  ofile
:  149                  ufile 
=  requests
. get ( url
,  stream
= True ). raw
 151                  print ( "Error reading, skipping file" )   152                  print ( sys
. exc_info ()[ 1 ])  154              print ( " \r {0:.2f} MB" . format ( size
/ 1024 / 1024 ), end
= "" )  157                  iv
= struct
. pack ( "IIII" , segment
, 0 , 0 , 0 )  159                      decryptor 
=  AES
. new ( key
,  AES
. MODE_CBC
,  iv
)  #ValueError: AES key must be either 16, 24, or 32 bytes long  160                  except ( ValueError )  as  e
:  161                      print ( "Error using decryption key. Skipping" )  166                      buf 
=  ufile
. read ( 4096 )  168                      print ( "Error reading, skipping file" )  #FIXME mark file as failed  169                      print ( sys
. exc_info ()[ 1 ])  174                      buf 
=  decryptor
. decrypt ( buf
)  179      if  'thumb-url'  in  video
:  181              video
[ 'thumb' ] =  requests
. get ( video
[ 'thumb-url' ], stream
= True ). raw
 183              print ( "Error reading thumbnail" )  #FIXME mark file as failed  184              print ( sys
. exc_info ()[ 1 ])  188  def  parse_playlist ( playlist
):  189      if not  playlist
. startswith ( "#EXTM3U" ):  192      playlist 
=  playlist
. splitlines ()  193      while not  'EXT-X-STREAM-INF'  in  playlist
[ 0 ]:  194          playlist 
=  playlist
[ 1 :]  196      for  ( metadata_string
, url
)  in  zip ( playlist
[ 0 :: 2 ],  playlist
[ 1 :: 2 ]):  198          if not  'EXT-X-STREAM-INF'  in  metadata_string
. split ( ':' )[ 0 ]:  200          for  item 
in  metadata_string
. split ( ':' )[ 1 ]. split ( ',' ):  202                  md
. update ([ item
. split ( '=' ),])   207  def  parse_segment_playlist ( playlisturl
):  208      playlist 
=  requests
. get ( playlisturl
). text
 209      assert  playlist
. startswith ( "#EXTM3U" )  210      PATTERN 
=  re
. compile ( r
'''((?:[^,"']|"[^"]*"|'[^']*')+)''' )  214      for  row 
in  playlist
. splitlines ():  216              if not  row
. startswith ( 'http' ):  #if relative path  217                  row 
=  "{}/{}" . format ( os
. path
. dirname ( playlisturl
),  row
)   223          if  "EXT-X-KEY"  in  row
:  224               row 
=  row
. split ( ':' , 1 )[ 1 ]  #skip first part  225               parts 
=  PATTERN
. split ( row
)[ 1 :- 1 ]  #do magic re split and keep quotes  226               metadata
[ "EXT-X-KEY" ] =  dict ([ part
. split ( '=' , 1 )  for  part 
in  parts 
if  '='  in  part
])  #throw away the commas and make dict of the pairs  227      return ( segments
,  metadata
)  229  def  parse_videolist ():  231      soup 
=  BeautifulSoup ( requests
. get ( "http://www.svtplay.se/ajax/videospager" ). text
) #this call does not work for getting the pages, we use it for the page totals only  232      page_tot 
=  int ( soup
. find ( 'a' ,{ 'data-currentpage' : True }). attrs
[ 'data-lastpage' ])  235      while ( page_num 
<=  page_tot
):  236          base_url 
=  "http://www.svtplay.se/ajax/videos?sida={}" . format ( page_num
)  237          soup 
=  BeautifulSoup ( requests
. get ( base_url
). text
)  238          for  article 
in  soup
. findAll ( 'article' ):  239              meta 
=  dict ( article
. attrs
)  241              video
[ 'title' ] =  meta
[ 'data-title' ]  242              video
[ 'description' ] =  meta
[ 'data-description' ]  243              video
[ 'url' ] =  dict ( article
. find ( 'a' ). attrs
)[ 'href' ]  244              video
[ 'thumb-url' ] =  dict ( article
. find ( 'img' ,{}). attrs
)[ 'src' ]  245              video
[ 'num' ] =  video_num
 246              video
[ 'total' ] =  page_tot 
*  videos_per_page
 251  def  remux ( video
,  xml
= None ):  252      basename 
=  video
[ 'filename' ]. split ( '.ts' )[ 0 ]  254          if not  os
. path
. exists ( video
[ 'genre' ]):  255              os
. mkdir ( video
[ 'genre' ])  256          video
[ 'path' ] =  os
. path
. join ( video
[ 'genre' ], basename
+ '.mkv' )  258          video
[ 'path' ] =  basename
+ '.mkv'   259      command 
= [ "mkvmerge" , "-o" , video
[ 'path' ],  '--title' , video
[ 'title' ]]  262          with 
open ( basename
+ '.xml' , 'w' )  as  f
:  264              command
. extend ([ '--global-tags' , basename
+ '.xml' ])             266          with 
open ( 'thumbnail.jpg' , 'wb' )  as  f
:  #FIXME use title instead for many downloaders  267              f
. write ( video
[ 'thumb' ]. read ())  268              command
. extend ([ '--attachment-description' ,  "Thumbnail" ,  269                   '--attachment-mime-type' ,  'image/jpeg' ,  270                   '--attach-file' ,  'thumbnail.jpg' ])  271      # if 'subs' in video:  272      #     for sub in video['subs']:  273      #         if 'download' in sub:  274      #             with open("{}.vtt".format(sub['lang']),'wb') as f:  275      #                 f.write(bytes("".join(sub['download']),'utf-8')) #FIXME  276      #                 command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])  279      command
. append ( video
[ 'filename' ])  280      print ( Popen ( command
,  stdout
= PIPE
). communicate ()[ 0 ])  281      for  fname 
in  ( video
[ 'filename' ],  basename
+ '.xml' , 'thumbnail.jpg' ):  286      if  'timestamp'  in  video
:  288              os
. utime ( video
[ 'path' ],  times
=( video
[ 'timestamp' ]. timestamp (), video
[ 'timestamp' ]. timestamp ()))  289          except  FileNotFoundError 
as  e
:  293  def  mkv_metadata ( video
):  294      root 
=  BeautifulSoup ( features
= 'xml' )  295      root
. append ( Doctype ( 'Tags SYSTEM "matroskatags.dtd"' ))  296      tags 
=  root
. new_tag ( "Tags" )  297      tag 
=  root
. new_tag ( "Tag" )  300      keep 
= ( 'title' , 'description' ,  'url' , 'genre' )  301      targets 
=  root
. new_tag ( "Targets" )  302      ttv 
=  root
. new_tag ( "TargetTypeValue" )  309          simple 
=  root
. new_tag ( 'Simple' )  310          name 
=  root
. new_tag ( 'Name' )  311          name
. string
= key
. upper ()  313          sstring 
=  root
. new_tag ( 'String' )  314          sstring
. string
= video
[ key
]  315          simple
. append ( sstring
)  319  if  __name__ 
==  "__main__" :  320      parser 
=  argparse
. ArgumentParser ()  321      group 
=  parser
. add_mutually_exclusive_group ( required
= True )  322      group
. add_argument ( "-r" ,  "--rss" ,  help = "Download all files in rss" )  323      group
. add_argument ( "-u" ,  "--url" ,  help = "Download video in url" )  324      group
. add_argument ( "-m" ,  "--mirror" ,  help = "Mirror all files" ,  action
= "store_true" )  325      parser
. add_argument ( "-n" ,  "--no_act" ,  help = "Just print what would be done, don't do any downloading." ,  action
= "store_true" )  326      parser
. add_argument ( "--no_remux" ,  help = "Don't remux into mkv" ,  action
= "store_true" )  328      args 
=  parser
. parse_args ()  330          d 
=  feedparser
. parse ( args
. rss
)  332              print (( "Downloading:  %s " %e. title
))  335              video 
=  scrape_player_page ({ 'title' : e
. title
, 'url' : e
. link
})  339          #print(e.description)  341          if not  os
. path
. exists ( '.seen' ):  343          for  video 
in  parse_videolist ():  344              video
[ 'title' ] =  video
[ 'title' ]. replace ( '/' , '_' )  345              print ( video
[ 'title' ]+ '.mkv' )  346              print ( "{} of {}" . format ( video
[ 'num' ],  video
[ 'total' ]))  348              if  os
. path
. exists ( os
. path
. join ( '.seen' , video
[ 'title' ])):  351              print ( "Downloading..." )  354              open ( os
. path
. join ( '.seen' , video
[ 'title' ]), 'w' ). close ()  #touch  355              ret 
=  scrape_player_page ( video
)  357                  if not  os
. path
. exists ( '.failed' ):  359                  open ( os
. path
. join ( '.failed' , video
[ 'title' ]), 'w' ). close ()  #touch  364              xml 
=  mkv_metadata ( video
)  369              video 
=  scrape_player_page ({ 'url' : args
. url
})  370          if not  args
. no_remux
:  372          print (( "Downloaded {}" . format ( args
. url
)))