# (gitweb scrape residue removed here: a stray "]>", the repository
#  navigation line "git.frykholm.com Git - svtplaydump.git/blob - svtplaydump.py",
#  and the blob hash 8b4a3c25231fa4a299e6493e0596cea37dba4fcd)
#!/usr/bin/env python3.4
# -*- coding: utf-8 -*-
#
# (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#
# Changelog:
# 0.4 added mirror mode.
# 0.3 added apple streaming playlist parsing and decryption
# 0.2 added python 2.4 urlparse compatibility
import argparse
import os
import re
import struct
import sys
from datetime import datetime, timezone
from pathlib import Path
from subprocess import *

import feedparser
import requests
from bs4 import BeautifulSoup, Doctype
from Crypto.Cipher import AES
39 def __init__ ( self
, * args
, ** kwargs
):
40 self
. update ( dict (* args
, ** kwargs
)) # use the free update to set keys
42 def __setattr__ ( self
, name
, value
):
43 return self
.__ setitem
__ ( name
, value
)
45 def __getattr__ ( self
, name
):
46 return self
.__ getitem
__ ( name
)
48 def is_downloaded ( self
):
49 raise ( "NotImplemented" )
51 def scrape_player_page ( video
):
53 Try to scrape the site for video and download.
55 if not video
[ 'url' ]. startswith ( 'http' ):
56 video
[ 'url' ] = "http://www.svtplay.se" + video
[ 'url' ]
57 soup
= BeautifulSoup ( requests
. get ( video
[ 'url' ]). text
)
58 video_player
= soup
. body ( 'a' ,{ 'data-json-href' : True })[ 0 ]
59 if 'oppetarkiv.se' in video
[ 'url' ]:
60 flashvars
= requests
. get ( "http://www.oppetarkiv.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
62 if video_player
. attrs
[ 'data-json-href' ]. startswith ( "/wd" ):
63 flashvars
= requests
. get ( "http://www.svt.se/ %s " % video_player
. attrs
[ 'data-json-href' ]). json ()
65 flashvars
= requests
. get ( "http://www.svtplay.se/ %s " % video_player
. attrs
[ 'data-json-href' ]+ "?output=json" ). json ()
66 video
[ 'duration' ] = video_player
. attrs
. get ( 'data-length' , 0 )
67 if not 'title' in video
:
68 video
[ 'title' ] = soup
. find ( 'meta' ,{ 'property' : 'og:title' }). attrs
[ 'content' ]. replace ( '|' , '_' ). replace ( '/' , '_' )
69 if not 'genre' in video
:
70 if soup
. find ( text
= 'Kategori:' ):
71 video
[ 'genre' ] = soup
. find ( text
= 'Kategori:' ). parent
. parent
. a
. text
73 video
[ 'genre' ] = 'Ingen Genre'
74 if 'dynamicStreams' in flashvars
:
75 video
[ 'url' ] = flashvars
[ 'dynamicStreams' ][ 0 ]. split ( 'url:' )[ 1 ]. split ( '.mp4,' )[ 0 ] + '.mp4'
76 filename
= Path ( video
[ 'title' ]). with_suffix ( ".mp4" )
77 print ( Popen ([ "rtmpdump" , "-o" + filename
, "-r" , url
], stdout
= PIPE
). communicate ()[ 0 ])
78 if 'pathflv' in flashvars
:
79 rtmp
= flashvars
[ 'pathflv' ][ 0 ]
80 filename
= Path ( video
[ 'title' ]). with_suffix ( ".flv" )
81 print ( Popen ([ "mplayer" , "-dumpstream" , "-dumpfile" , filename
, rtmp
], stdout
= PIPE
). communicate ()[ 0 ])
82 if not 'timestamp' in video
:
83 if soup
. find_all ( datetime
= True ):
84 xmldate_str
= soup
. find_all ( datetime
= True )[ 0 ]. attrs
[ 'datetime' ]
86 video
[ 'timestamp' ] = datetime (* feedparser
._ parse
_ date
_ w
3 dtf
( xmldate_str
)[: 6 ]) #naive in utc
87 video
[ 'timestamp' ] = video
[ 'timestamp' ]. replace ( tzinfo
= timezone
. utc
). astimezone ( tz
= None ) #convert to local time
88 if 'video' in flashvars
:
89 for reference
in flashvars
[ 'video' ][ 'videoReferences' ]:
90 if 'm3u8' in reference
[ 'url' ]:
91 video
[ 'url' ]= reference
[ 'url' ]
92 video
[ 'filename' ] = Path ( video
[ 'title' ]). with_suffix ( '.ts' )
93 if 'statistics' in flashvars
:
94 video
[ 'category' ] = flashvars
[ 'statistics' ][ 'category' ]
95 if not download_from_playlist ( video
):
97 if not 'url' in video
:
98 print ( "Could not find any streams" )
102 def download_from_playlist ( video
):
103 params
= requests
. utils
. urlparse ( video
[ 'url' ]). query
105 if 'cc1=' in params
: #'cc1=name=Svenska~default=yes~forced=no~uri=http://media.svt.se/download/mcc/wp3/undertexter-wsrt/1134047/1134047-025A/C(sv)/index.m3u8~lang=sv'
106 video
[ 'subs' ] = [ dict ([ k
. split ( '=' ) for k
in params
. split ( 'cc1=' )[ 1 ]. split ( '~' )])] #make a dict from the paramstring
108 req
= requests
. get ( video
[ 'url' ]). text
110 print ( "Error reading, skipping file" )
111 print ( sys
. exc_info ()[ 1 ])
115 segments
= [ item
for item
in requests
. get ( video
[ 'subs' ][ 0 ][ 'uri' ]). text
. split ( ' \n ' ) if 'vtt' in item
]
117 print ( "Error reading, skipping subtitle" )
118 print ( sys
. exc_info ()[ 1 ])
119 segments
= [] #ugly FIXME
120 video
[ 'subs' ][ 0 ][ 'download' ] = []
121 for segment
in segments
:
122 if not segment
. startswith ( 'http' ):
123 segment
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'subs' ][ 0 ][ 'uri' ]), segment
)
125 video
[ 'subs' ][ 0 ][ 'download' ]. append ( requests
. get ( segment
). text
)
127 print ( "Error reading, skipping subtitle" )
128 print ( sys
. exc_info ()[ 1 ])
130 playlist
= parse_playlist ( req
)
133 videourl
= sorted ( playlist
, key
= lambda k
: int ( k
[ 'BANDWIDTH' ]))[- 1 ][ 'url' ]
134 if not videourl
. startswith ( 'http' ): #if relative path
135 videourl
= "{}/{}" . format ( os
. path
. dirname ( video
[ 'url' ]), videourl
)
136 segments
, metadata
= parse_segment_playlist ( videourl
)
137 if "EXT-X-KEY" in metadata
:
139 key
= requests
. get ( metadata
[ "EXT-X-KEY" ][ 'URI' ]. strip ( '"' )). text
141 print ( "Error reading, skipping file" )
142 print ( sys
. exc_info ()[ 1 ])
147 with video
[ 'filename' ]. open ( "wb" ) as ofile
:
152 ufile
= requests
. get ( url
, stream
= True ). raw
154 print ( "Error reading, skipping file" )
155 print ( sys
. exc_info ()[ 1 ])
157 print ( " \r {0:.2f} MB" . format ( size
/ 1024 / 1024 ), end
= "" )
160 iv
= struct
. pack ( "IIII" , segment
, 0 , 0 , 0 )
162 decryptor
= AES
. new ( key
, AES
. MODE_CBC
, iv
) #ValueError: AES key must be either 16, 24, or 32 bytes long
163 except ( ValueError ) as e
:
164 print ( "Error using decryption key. Skipping" )
169 buf
= ufile
. read ( 4096 )
171 print ( "Error reading, skipping file" ) #FIXME mark file as failed
172 print ( sys
. exc_info ()[ 1 ])
177 buf
= decryptor
. decrypt ( buf
)
182 if 'thumb-url' in video
:
184 video
[ 'thumb' ] = requests
. get ( video
[ 'thumb-url' ], stream
= True ). raw
186 print ( "Error reading thumbnail" ) #FIXME mark file as failed
187 print ( sys
. exc_info ()[ 1 ])
191 def parse_playlist ( playlist
):
192 if not playlist
. startswith ( "#EXTM3U" ):
195 playlist
= playlist
. splitlines ()
196 while not 'EXT-X-STREAM-INF' in playlist
[ 0 ]:
197 playlist
= playlist
[ 1 :]
199 for ( metadata_string
, url
) in zip ( playlist
[ 0 :: 2 ], playlist
[ 1 :: 2 ]):
201 if not 'EXT-X-STREAM-INF' in metadata_string
. split ( ':' )[ 0 ]:
203 for item
in metadata_string
. split ( ':' )[ 1 ]. split ( ',' ):
205 md
. update ([ item
. split ( '=' ),])
210 def parse_segment_playlist ( playlisturl
):
211 playlist
= requests
. get ( playlisturl
). text
212 assert playlist
. startswith ( "#EXTM3U" )
213 PATTERN
= re
. compile ( r
'''((?:[^,"']|"[^"]*"|'[^']*')+)''' )
217 for row
in playlist
. splitlines ():
219 if not row
. startswith ( 'http' ): #if relative path
220 row
= "{}/{}" . format ( os
. path
. dirname ( playlisturl
), row
)
226 if "EXT-X-KEY" in row
:
227 row
= row
. split ( ':' , 1 )[ 1 ] #skip first part
228 parts
= PATTERN
. split ( row
)[ 1 :- 1 ] #do magic re split and keep quotes
229 metadata
[ "EXT-X-KEY" ] = dict ([ part
. split ( '=' , 1 ) for part
in parts
if '=' in part
]) #throw away the commas and make dict of the pairs
230 return ( segments
, metadata
)
232 def parse_videolist ():
234 soup
= BeautifulSoup ( requests
. get ( "http://www.svtplay.se/ajax/videospager" ). text
) #this call does not work for getting the pages, we use it for the page totals only
235 page_tot
= int ( soup
. find ( 'a' ,{ 'data-currentpage' : True }). attrs
[ 'data-lastpage' ])
238 while ( page_num
<= page_tot
):
239 base_url
= "http://www.svtplay.se/ajax/videos?sida={}" . format ( page_num
)
240 soup
= BeautifulSoup ( requests
. get ( base_url
). text
)
241 for article
in soup
. findAll ( 'article' ):
242 meta
= dict ( article
. attrs
)
244 video
[ 'title' ] = meta
[ 'data-title' ]
245 video
[ 'description' ] = meta
[ 'data-description' ]
246 video
[ 'url' ] = dict ( article
. find ( 'a' ). attrs
)[ 'href' ]
247 video
[ 'thumb-url' ] = dict ( article
. find ( 'img' ,{}). attrs
)[ 'src' ]
248 video
[ 'num' ] = video_num
249 video
[ 'total' ] = page_tot
* videos_per_page
254 def remux ( video
, xml
= None ):
256 if not os
. path
. exists ( video
[ 'genre' ]):
257 os
. mkdir ( video
[ 'genre' ])
258 video
[ 'path' ] = Path ( video
[ 'genre' ] / video
[ 'filename' ]). with_suffix ( '.mkv' )
260 video
[ 'path' ] = video
[ 'filename' ]. with_suffix ( '.mkv' )
261 command
= [ "mkvmerge" , "-o" , str ( video
[ 'path' ]), '--title' , video
[ 'title' ]]
264 with video
[ 'filename' ]. with_suffix ( '.xml' ). open ( 'w' ) as f
:
266 command
. extend ([ '--global-tags' , str ( video
[ 'filename' ]. with_suffix ( '.xml' ))])
268 with
open ( 'thumbnail.jpg' , 'wb' ) as f
: #FIXME use title instead for many downloaders
269 f
. write ( video
[ 'thumb' ]. read ())
270 command
. extend ([ '--attachment-description' , "Thumbnail" ,
271 '--attachment-mime-type' , 'image/jpeg' ,
272 '--attach-file' , 'thumbnail.jpg' ])
273 # if 'subs' in video:
274 # for sub in video['subs']:
275 # if 'download' in sub:
276 # with open("{}.vtt".format(sub['lang']),'wb') as f:
277 # f.write(bytes("".join(sub['download']),'utf-8')) #FIXME
278 # command.extend(['--language 0:{} {}.vtt'.format(sub['lang'],sub['lang'])])
280 command
. append ( str ( video
[ 'filename' ]))
281 print ( Popen ( command
, stdout
= PIPE
). communicate ()[ 0 ])
282 for fname
in ( video
[ 'filename' ], video
[ 'filename' ]. with_suffix ( '.xml' ), Path ( 'thumbnail.jpg' )):
287 if 'timestamp' in video
:
289 os
. utime ( str ( video
[ 'path' ]), times
=( video
[ 'timestamp' ]. timestamp (), video
[ 'timestamp' ]. timestamp ()))
290 except FileNotFoundError
as e
:
294 def mkv_metadata ( video
):
295 root
= BeautifulSoup ( features
= 'xml' )
296 root
. append ( Doctype ( 'Tags SYSTEM "matroskatags.dtd"' ))
297 tags
= root
. new_tag ( "Tags" )
298 tag
= root
. new_tag ( "Tag" )
301 keep
= ( 'title' , 'description' , 'url' , 'genre' )
302 targets
= root
. new_tag ( "Targets" )
303 ttv
= root
. new_tag ( "TargetTypeValue" )
310 simple
= root
. new_tag ( 'Simple' )
311 name
= root
. new_tag ( 'Name' )
312 name
. string
= key
. upper ()
314 sstring
= root
. new_tag ( 'String' )
315 sstring
. string
= video
[ key
]
316 simple
. append ( sstring
)
320 if __name__
== "__main__" :
321 parser
= argparse
. ArgumentParser ()
322 group
= parser
. add_mutually_exclusive_group ( required
= True )
323 group
. add_argument ( "-r" , "--rss" , help = "Download all files in rss" )
324 group
. add_argument ( "-u" , "--url" , help = "Download video in url" )
325 group
. add_argument ( "-m" , "--mirror" , help = "Mirror all files" , action
= "store_true" )
326 parser
. add_argument ( "-n" , "--no_act" , help = "Just print what would be done, don't do any downloading." , action
= "store_true" )
327 parser
. add_argument ( "--no_remux" , help = "Don't remux into mkv" , action
= "store_true" )
329 args
= parser
. parse_args ()
331 d
= feedparser
. parse ( args
. rss
)
333 print (( "Downloading: %s " %e. title
))
336 video
= scrape_player_page ({ 'title' : e
. title
, 'url' : e
. link
})
340 #print(e.description)
342 if not os
. path
. exists ( '.seen' ):
344 for video
in parse_videolist ():
345 video
[ 'title' ] = video
[ 'title' ]. replace ( '/' , '_' )
346 print ( video
[ 'title' ]+ '.mkv' )
347 print ( "{} of {}" . format ( video
[ 'num' ], video
[ 'total' ]))
349 if os
. path
. exists ( os
. path
. join ( '.seen' , video
[ 'title' ])):
352 print ( "Downloading..." )
355 open ( os
. path
. join ( '.seen' , video
[ 'title' ]), 'w' ). close () #touch
356 ret
= scrape_player_page ( video
)
358 if not os
. path
. exists ( '.failed' ):
360 open ( os
. path
. join ( '.failed' , video
[ 'title' ]), 'w' ). close () #touch
365 xml
= mkv_metadata ( video
)
370 video
= scrape_player_page ({ 'url' : args
. url
})
371 if not args
. no_remux
:
373 print (( "Downloaded {}" . format ( args
. url
)))