]>
git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py
59a44330c31db5ecfaeabcc65745f7153c885391
2 # -*- coding: utf-8 -*-
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
25 from bs4
import BeautifulSoup
, Doctype
26 from subprocess
import *
28 from Crypto
. Cipher
import AES
35 from datetime
import datetime
, timezone
def __init__(self, *args, **kwargs):
    """Populate the mapping from the given positional/keyword arguments.

    The arguments are first normalised through ``dict(...)`` (so anything
    the ``dict`` constructor accepts is accepted here) and then merged into
    this object with ``update``.
    """
    initial_items = dict(*args, **kwargs)
    # Reuse the inherited update() to set the keys.
    self.update(initial_items)
def __setattr__(self, name, value):
    """Store attribute assignments as mapping items.

    ``obj.name = value`` ends up as ``obj[name] = value``; nothing is
    written to the instance ``__dict__``. Returns ``None``.
    """
    self[name] = value
def __getattr__(self, name):
    """Resolve attribute reads through item lookup.

    ``obj.name`` returns ``obj[name]``. A missing key surfaces as the
    ``KeyError`` raised by the item lookup.
    """
    return self[name]
def is_downloaded(self):
    """Placeholder that is not implemented yet.

    NOTE(review): judging by the name this is presumably meant to report
    whether the video has already been fetched -- confirm before
    implementing.

    Raises:
        NotImplementedError: always.
    """
    # Raising a bare string is itself a TypeError on Python 3 ("exceptions
    # must derive from BaseException"); use the dedicated built-in instead.
    raise NotImplementedError("is_downloaded is not implemented")
49 def scrape_player_page ( video
):
51 Try to scrape the site for video and download.
53 if not video
[ 'url' ]. startswith ( 'http' ):
54 video
[ 'url' ] = "http://www.svtplay.se" + video
[ 'url' ]
55 soup
= BeautifulSoup ( requests
. get ( video
[ 'url' ]). text
)
56 video_player
= soup
. body ( 'a' ,{ 'data-json-href' : True })[ 0 ]
57 if 'oppetarkiv.se' in video
[ 'url' ]:
58 flashvars
= requests
. get ( "http://www.oppetarkiv.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
60 if video_player
. attrs
[ 'data-json-href' ]. startswith ( "/wd" ):
61 flashvars
= requests
. get ( "http://www.svt.se/ %s " % video_player
. attrs
[ 'data-json-href' ]). json ()
63 flashvars
= requests
. get ( "http://www.svtplay.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
64 video
[ 'duration' ] = video_player
. attrs
. get ( 'data-length' , 0 )
65 if not 'title' in video
:
66 video
[ 'title' ] = soup
. find ( 'meta' ,{ 'property' : 'og:title' }). attrs
[ 'content' ]. replace ( '|' , '_' ). replace ( '/' , '_' )
67 if not 'genre' in video
:
68 if soup
. find ( text
= 'Kategori:' ):
69 video
[ 'genre' ] = soup
. find ( text
= 'Kategori:' ). parent
. parent
. a
. text
71 video
[ 'genre' ] = 'Ingen Genre'
72 if 'dynamicStreams' in flashvars
:
73 video
[ 'url' ] = flashvars
[ 'dynamicStreams' ][ 0 ]. split ( 'url:' )[ 1 ]. split ( '.mp4,' )[ 0 ] + '.mp4'
74 filename
= video
[ 'title' ]+ ".mp4"
75 print ( Popen ([ "rtmpdump" , "-o" + filename
, "-r" , url
], stdout
= PIPE
). communicate ()[ 0 ])
76 if 'pathflv' in flashvars
:
77 rtmp
= flashvars
[ 'pathflv' ][ 0 ]
78 filename
= video
[ 'title' ]+ ".flv"
79 print ( Popen ([ "mplayer" , "-dumpstream" , "-dumpfile" , filename
, rtmp
], stdout
= PIPE
). communicate ()[ 0 ])
80 if not 'timestamp' in video
:
81 if soup
. find_all ( datetime
= True ):
82 xmldate_str
= soup
. find_all ( datetime
= True )[ 0 ]. attrs
[ 'datetime' ]
83 video
[ 'timestamp' ] = datetime (* feedparser
._ parse
_ date
_ w
3 dtf
( xmldate_str
)[: 6 ]) #naive in utc
84 video
[ 'timestamp' ] = video
[ 'timestamp' ]. replace ( tzinfo
= timezone
. utc
). astimezone ( tz
= None ) #convert to local time
85 if 'video' in flashvars
:
86 for reference
in flashvars
[ 'video' ][ 'videoReferences' ]:
87 if 'm3u8' in reference
[ 'url' ]:
88 video
[ 'url' ]= reference
[ 'url' ]
89 video
[ 'filename' ] = video
[ 'title' ]+ '.ts'
90 if 'statistics' in flashvars
:
91 video
[ 'category' ] = flashvars
[ 'statistics' ][ 'category' ]
92 if not download_from_playlist ( video
):
94 if not 'url' in video
:
95 print ( "Could not find any streams" )
99 def download_from_playlist ( video
):
100 params
= requests
. utils
. urlparse ( video
[ 'url' ]). query
102 if 'cc1=' in params
: #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
103 video
[ 'subs' ] = [ dict ([ k
. split ( '=' ) for k
in params
. split ( 'cc1=' )[ 1 ]. split ( '~' )])] #make a dict from the paramstring
105 req
= requests
. get ( video
[ 'url' ]). text
107 print ( "Error reading, skipping file" )
108 print ( sys
. exc_info ()[ 1 ])
112 segments
= [ item
for item
in requests
. get ( video
[ 'subs' ][ 0 ][ 'uri' ]). text
. split ( ' \n ' ) if 'vtt' in item
]
114 print ( "Error reading, skipping subtitle" )
115 print ( sys
. exc_info ()[ 1 ])
116 segments
= [] #ugly FIXME
117 video
[ 'subs' ][ 0 ][ 'download' ] = []
118 for segment
in segments
:
119 if not segment
. startswith ( 'http' ):
120 segment
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'subs' ][ 0 ][ 'uri' ]), segment
)
122 video
[ 'subs' ][ 0 ][ 'download' ]. append ( requests
. get ( segment
). text
)
124 print ( "Error reading, skipping subtitle" )
125 print ( sys
. exc_info ()[ 1 ])
127 playlist
= parse_playlist ( req
)
130 videourl
= sorted ( playlist
, key
= lambda k
: int ( k
[ 'BANDWIDTH' ]))[- 1 ][ 'url' ]
131 if not videourl
. startswith ( 'http' ): #if relative path
132 videourl
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'url' ]), videourl
)
133 segments
, metadata
= parse_segment_playlist ( videourl
)
134 if "EXT-X-KEY" in metadata
:
136 key
= requests
. get ( metadata
[ "EXT-X-KEY" ][ 'URI' ]. strip ( '"' )). text
138 print ( "Error reading, skipping file" )
139 print ( sys
. exc_info ()[ 1 ])
144 with
open ( " %s " % video
[ 'filename' ], "wb" ) as ofile
:
149 ufile
= requests
. get ( url
, stream
= True ). raw
151 print ( "Error reading, skipping file" )
152 print ( sys
. exc_info ()[ 1 ])
154 print ( " \r {0:.2f} MB" . format ( size
/ 1024 / 1024 ), end
= "" )
157 iv
= struct
. pack ( "IIII" , segment
, 0 , 0 , 0 )
159 decryptor
= AES
. new ( key
, AES
. MODE_CBC
, iv
) #ValueError: AES key must be either 16, 24, or 32 bytes long
160 except ( ValueError ) as e
:
161 print ( "Error using decryption key. Skipping" )
166 buf
= ufile
. read ( 4096 )
168 print ( "Error reading, skipping file" ) #FIXME mark file as failed
169 print ( sys
. exc_info ()[ 1 ])
174 buf
= decryptor
. decrypt ( buf
)
179 if 'thumb-url' in video
:
181 video
[ 'thumb' ] = requests
. get ( video
[ 'thumb-url' ], stream
= True ). raw
183 print ( "Error reading thumbnail" ) #FIXME mark file as failed
184 print ( sys
. exc_info ()[ 1 ])
188 def parse_playlist ( playlist
):
189 if not playlist
. startswith ( "#EXTM3U" ):
192 playlist
= playlist
. splitlines ()
193 while not 'EXT-X-STREAM-INF' in playlist
[ 0 ]:
194 playlist
= playlist
[ 1 :]
196 for ( metadata_string
, url
) in zip ( playlist
[ 0 :: 2 ], playlist
[ 1 :: 2 ]):
198 if not 'EXT-X-STREAM-INF' in metadata_string
. split ( ':' )[ 0 ]:
200 for item
in metadata_string
. split ( ':' )[ 1 ]. split ( ',' ):
202 md
. update ([ item
. split ( '=' ),])
207 def parse_segment_playlist ( playlisturl
):
208 playlist
= requests
. get ( playlisturl
). text
209 assert playlist
. startswith ( "#EXTM3U" )
210 PATTERN
= re
. compile ( r
'''((?:[^,"']|"[^"]*"|'[^']*')+)''' )
214 for row
in playlist
. splitlines ():
216 if not row
. startswith ( 'http' ): #if relative path
217 row
= "{}/{}" . format ( os
. path
. dirname ( playlisturl
), row
)
223 if "EXT-X-KEY" in row
:
224 row
= row
. split ( ':' , 1 )[ 1 ] #skip first part
225 parts
= PATTERN
. split ( row
)[ 1 :- 1 ] #do magic re split and keep quotes
226 metadata
[ "EXT-X-KEY" ] = dict ([ part
. split ( '=' , 1 ) for part
in parts
if '=' in part
]) #throw away the commas and make dict of the pairs
227 return ( segments
, metadata
)
229 def parse_videolist ():
231 soup
= BeautifulSoup ( requests
. get ( "http://www.svtplay.se/ajax/videospager" ). text
) #this call does not work for getting the pages, we use it for the page totals only
232 page_tot
= int ( soup
. find ( 'a' ,{ 'data-currentpage' : True }). attrs
[ 'data-lastpage' ])
235 while ( page_num
<= page_tot
):
236 base_url
= "http://www.svtplay.se/ajax/videos?sida={}" . format ( page_num
)
237 soup
= BeautifulSoup ( requests
. get ( base_url
). text
)
238 for article
in soup
. findAll ( 'article' ):
239 meta
= dict ( article
. attrs
)
241 video
[ 'title' ] = meta
[ 'data-title' ]
242 video
[ 'description' ] = meta
[ 'data-description' ]
243 video
[ 'url' ] = dict ( article
. find ( 'a' ). attrs
)[ 'href' ]
244 video
[ 'thumb-url' ] = dict ( article
. find ( 'img' ,{}). attrs
)[ 'src' ]
245 video
[ 'num' ] = video_num
246 video
[ 'total' ] = page_tot
* videos_per_page
251 def remux ( video
, xml
= None ):
252 basename
= video
[ 'filename' ]. split ( '.ts' )[ 0 ]
254 if not os
. path
. exists ( video
[ 'genre' ]):
255 os
. mkdir ( video
[ 'genre' ])
256 video
[ 'path' ] = os
. path
. join ( video
[ 'genre' ], basename
+ '.mkv' )
258 video
[ 'path' ] = basename
+ '.mkv'
259 command
= [ "mkvmerge" , "-o" , video
[ 'path' ], '--title' , video
[ 'title' ]]
262 with
open ( basename
+ '.xml' , 'w' ) as f
:
264 command
. extend ([ '--global-tags' , basename
+ '.xml' ])
266 with
open ( 'thumbnail.jpg' , 'wb' ) as f
: #FIXME use title instead for many downloaders
267 f
. write ( video
[ 'thumb' ]. read ())
268 command
. extend ([ '--attachment-description' , "Thumbnail" ,
269 '--attachment-mime-type' , 'image/jpeg' ,
270 '--attach-file' , 'thumbnail.jpg' ])
271 # if 'subs' in video:
272 # for sub in video['subs']:
273 # if 'download' in sub:
274 # with open("{}.vtt".format(sub['lang']),'wb') as f:
275 # f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
276 # command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
279 command
. append ( video
[ 'filename' ])
280 print ( Popen ( command
, stdout
= PIPE
). communicate ()[ 0 ])
281 for fname
in ( video
[ 'filename' ], basename
+ '.xml' , 'thumbnail.jpg' ):
286 if 'timestamp' in video
:
288 os
. utime ( video
[ 'path' ], times
=( video
[ 'timestamp' ]. timestamp (), video
[ 'timestamp' ]. timestamp ()))
289 except FileNotFoundError
as e
:
293 def mkv_metadata ( video
):
294 root
= BeautifulSoup ( features
= 'xml' )
295 root
. append ( Doctype ( 'Tags SYSTEM "matroskatags.dtd"' ))
296 tags
= root
. new_tag ( "Tags" )
297 tag
= root
. new_tag ( "Tag" )
300 keep
= ( 'title' , 'description' , 'url' , 'genre' )
301 targets
= root
. new_tag ( "Targets" )
302 ttv
= root
. new_tag ( "TargetTypeValue" )
309 simple
= root
. new_tag ( 'Simple' )
310 name
= root
. new_tag ( 'Name' )
311 name
. string
= key
. upper ()
313 sstring
= root
. new_tag ( 'String' )
314 sstring
. string
= video
[ key
]
315 simple
. append ( sstring
)
319 if __name__
== "__main__" :
320 parser
= argparse
. ArgumentParser ()
321 group
= parser
. add_mutually_exclusive_group ( required
= True )
322 group
. add_argument ( "-r" , "--rss" , help = "Download all files in rss" )
323 group
. add_argument ( "-u" , "--url" , help = "Download video in url" )
324 group
. add_argument ( "-m" , "--mirror" , help = "Mirror all files" , action
= "store_true" )
325 parser
. add_argument ( "-n" , "--no_act" , help = "Just print what would be done, don't do any downloading." , action
= "store_true" )
326 parser
. add_argument ( "--no_remux" , help = "Don't remux into mkv" , action
= "store_true" )
328 args
= parser
. parse_args ()
330 d
= feedparser
. parse ( args
. rss
)
332 print (( "Downloading: %s " %e. title
))
335 video
= scrape_player_page ({ 'title' : e
. title
, 'url' : e
. link
})
339 #print(e.description)
341 if not os
. path
. exists ( '.seen' ):
343 for video
in parse_videolist ():
344 video
[ 'title' ] = video
[ 'title' ]. replace ( '/' , '_' )
345 print ( video
[ 'title' ]+ '.mkv' )
346 print ( "{} of {}" . format ( video
[ 'num' ], video
[ 'total' ]))
348 if os
. path
. exists ( os
. path
. join ( '.seen' , video
[ 'title' ])):
351 print ( "Downloading..." )
354 open ( os
. path
. join ( '.seen' , video
[ 'title' ]), 'w' ). close () #touch
355 ret
= scrape_player_page ( video
)
357 if not os
. path
. exists ( '.failed' ):
359 open ( os
. path
. join ( '.failed' , video
[ 'title' ]), 'w' ). close () #touch
364 xml
= mkv_metadata ( video
)
369 video
= scrape_player_page ({ 'url' : args
. url
})
370 if not args
. no_remux
:
372 print (( "Downloaded {}" . format ( args
. url
)))