2 # -*- coding: utf-8 -*-
4 # (C) Copyright 2010 Mikael Frykholm <mikael@frykholm.com>
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
20 # 0.4 added mirror mode.
21 # 0.3 added apple streaming playlist parsing and decryption
22 # 0.2 added python 2.4 urlparse compatibility
25 from bs4
import BeautifulSoup
, Doctype
26 from subprocess
import *
28 from Crypto
.Cipher
import AES
35 from datetime
import datetime
, timezone
37 def __init__(self
, *args
, **kwargs
):
38 self
.update(dict(*args
, **kwargs
)) # use the free update to set keys
40 def __setattr__(self
, name
, value
):
41 return self
.__setitem
__(name
,value
)
43 def __getattr__(self
, name
):
44 return self
.__getitem
__(name
)
def is_downloaded(self):
    """Placeholder that has no implementation yet.

    Raises:
        NotImplementedError: always. The previous ``raise("NotImplemented")``
            tried to raise a plain string, which Python 3 rejects with a
            TypeError instead of signalling "not implemented".
    """
    raise NotImplementedError("NotImplemented")
def scrape_player_page(video):
    """
    Try to scrape the site for video and download.

    ``video`` is a dict-like record. It must contain ``'url'`` (absolute, or
    a path relative to http://www.svtplay.se). ``'title'``, ``'genre'`` and
    ``'timestamp'`` are optional and are filled in from the player page when
    missing or empty. The record is updated in place.

    Returns the updated record, or ``False`` when the playlist download
    failed or no stream URL is available.
    """
    if not video['url'].startswith('http'):
        video['url'] = "http://www.svtplay.se" + video['url']
    soup = BeautifulSoup(requests.get(video['url']).text)
    video_player = soup.body('a', {'data-json-href': True})[0]
    json_href = video_player.attrs['data-json-href']

    # Pick the host that serves the player JSON for this kind of page.
    if 'oppetarkiv.se' in video['url']:
        json_url = "http://www.oppetarkiv.se/%s" % json_href + "?output=json"
    elif json_href.startswith("/wd"):
        json_url = "http://www.svt.se/%s" % json_href
    else:
        json_url = "http://www.svtplay.se/%s" % json_href + "?output=json"
    flashvars = requests.get(json_url).json()

    video['duration'] = video_player.attrs.get('data-length', 0)
    # .get(): callers may pass a record without any 'title' key at all.
    if not video.get('title'):
        og_title = soup.find('meta', {'property': 'og:title'}).attrs['content']
        video['title'] = og_title.replace('|', '_').replace('/', '_')
    if 'genre' not in video:
        category_label = soup.find(text='Kategori:')
        if category_label:
            video['genre'] = category_label.parent.parent.a.text
        else:
            video['genre'] = 'Ingen Genre'

    if 'dynamicStreams' in flashvars:
        video['url'] = flashvars['dynamicStreams'][0].split('url:')[1].split('.mp4,')[0] + '.mp4'
        filename = video['title'] + ".mp4"
        # The stream URL lives in video['url']; the bare name `url` was never defined here.
        print(Popen(["rtmpdump", "-o" + filename, "-r", video['url']], stdout=PIPE).communicate()[0])
    if 'pathflv' in flashvars:
        rtmp = flashvars['pathflv'][0]
        filename = video['title'] + ".flv"
        print(Popen(["mplayer", "-dumpstream", "-dumpfile", filename, rtmp], stdout=PIPE).communicate()[0])

    if 'timestamp' not in video:
        dated_tags = soup.find_all(datetime=True)
        if dated_tags:
            xmldate_str = dated_tags[0].attrs['datetime']
            video['timestamp'] = datetime(*feedparser._parse_date_w3dtf(xmldate_str)[:6])  # naive in utc
            video['timestamp'] = video['timestamp'].replace(tzinfo=timezone.utc).astimezone(tz=None)  # convert to local time

    if 'video' in flashvars:
        for reference in flashvars['video']['videoReferences']:
            if 'm3u8' in reference['url']:
                video['url'] = reference['url']
                video['filename'] = video['title'] + '.ts'
        if 'statistics' in flashvars:
            video['category'] = flashvars['statistics']['category']
        if not download_from_playlist(video):
            return False
    if 'url' not in video:
        print("Could not find any streams")
        return False
    return video
def download_from_playlist(video):
    """
    Download the highest-bandwidth stream of the playlist at ``video['url']``.

    The segments are written one after another to ``video['filename']`` and
    are AES-CBC decrypted when the segment playlist carries an EXT-X-KEY
    entry. When the record has a ``'thumb-url'``, the raw thumbnail stream
    is stored in ``video['thumb']``.

    Returns ``True`` on success and ``False`` when the playlist cannot be
    parsed or a segment cannot be read.
    """
    playlist = parse_playlist(requests.get(video['url']).text)
    if not playlist:
        return False
    videourl = sorted(playlist, key=lambda k: int(k['BANDWIDTH']))[-1]['url']
    if not videourl.startswith('http'):  # if relative path
        videourl = "{}/{}".format(os.path.dirname(video['url']), videourl)
    segments, metadata = parse_segment_playlist(videourl)

    key = None
    if "EXT-X-KEY" in metadata:
        # The key file is raw binary. `.text` would decode it to str and
        # corrupt it, and AES.new() needs bytes, so read `.content`.
        key = requests.get(metadata["EXT-X-KEY"]['URI'].strip('"')).content

    size = 0
    with open("%s" % video['filename'], "wb") as ofile:
        # NOTE(review): segment numbering is assumed to start at 0 — confirm
        # against the playlist's media sequence.
        for segment, url in enumerate(segments):
            try:
                response = requests.get(url, stream=True)
            except Exception as err:  # best effort: give up on this video, keep the program running
                print("Error reading, skipping file")  # FIXME mark file as failed
                print(err)
                return False
            print("\r{0:.2f} MB".format(size/1024/1024), end="", flush=True)
            decryptor = None
            if key is not None:
                iv = struct.pack("IIII", segment, 0, 0, 0)
                decryptor = AES.new(key, AES.MODE_CBC, iv)
            try:
                while True:
                    try:
                        buf = response.raw.read(4096)
                    except Exception as err:
                        print("Error reading, skipping file")  # FIXME mark file as failed
                        print(err)
                        return False
                    if not buf:
                        break
                    if decryptor is not None:
                        buf = decryptor.decrypt(buf)
                    ofile.write(buf)
                    size += len(buf)
            finally:
                # Release the connection on every exit path, including the error return above.
                response.close()

    if 'thumb-url' in video:
        # Left open on purpose: the stream is stored so it can be read later.
        video['thumb'] = requests.get(video['thumb-url'], stream=True).raw
    return True
def parse_playlist(playlist):
    """
    Parse the text of an M3U master playlist into a list of stream records.

    Each ``#EXT-X-STREAM-INF`` line is paired with the next line that is
    neither blank nor a tag. The result holds one dict per pair, containing
    the attributes of the tag (e.g. ``'BANDWIDTH'``, values kept as strings
    with any surrounding quotes) plus ``'url'``.

    Returns ``False`` when the text does not start with ``#EXTM3U`` and an
    empty list when the playlist has no stream entries.
    """
    import re  # local import: only this function needs the attribute tokenizer

    if not playlist.startswith("#EXTM3U"):
        return False
    # KEY=value pairs. A value is either a quoted string, which may itself
    # contain commas (CODECS="a,b"), or everything up to the next comma.
    attribute = re.compile(r'([^=,\s]+)=("[^"]*"|[^,]*)')
    items = []
    pending = None  # attributes of a stream tag still waiting for its URL line
    for line in playlist.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('#EXT-X-STREAM-INF'):
            attribute_list = line.split(':', 1)[1] if ':' in line else ''
            pending = dict(attribute.findall(attribute_list))
            continue
        if line.startswith('#'):
            continue  # some other tag or a comment
        if pending is not None:
            pending['url'] = line
            items.append(pending)
            pending = None
    return items
def parse_segment_playlist(playlisturl):
    """
    Fetch a segment (media) playlist and list its segment URLs.

    Returns a ``(segments, metadata)`` tuple. ``segments`` is the list of
    segment URLs in playlist order, with relative entries resolved against
    the directory of ``playlisturl``. ``metadata`` contains an
    ``"EXT-X-KEY"`` dict of that tag's attributes (quotes preserved) when
    the playlist has one, and is empty otherwise.

    Raises:
        AssertionError: if the response does not start with ``#EXTM3U``.
    """
    playlist = requests.get(playlisturl).text
    if not playlist.startswith("#EXTM3U"):
        # Raised explicitly so the check survives `python -O`, which strips
        # assert statements. The exception type is unchanged for callers.
        raise AssertionError("not an M3U playlist: {}".format(playlisturl))
    pattern = re.compile(r'''((?:[^,"']|"[^"]*"|'[^']*')+)''')
    base = os.path.dirname(playlisturl)
    segments = []
    metadata = {}
    next_is_url = False
    for row in playlist.splitlines():
        row = row.strip()
        if not row:
            continue
        # Only a non-tag line can be the segment URI. Tags sitting between
        # EXTINF and the URI fall through to the tag handling below.
        if next_is_url and not row.startswith('#'):
            if not row.startswith('http'):  # if relative path
                row = "{}/{}".format(base, row)
            segments.append(row)
            next_is_url = False
            continue
        if 'EXTINF' in row:
            next_is_url = True
        if "EXT-X-KEY" in row:
            attributes = row.split(':', 1)[1]  # skip first part
            parts = pattern.split(attributes)[1:-1]  # split on commas outside quotes, keeping the quotes
            # The separators contain no '=', so the filter drops them and keeps the KEY=value pairs.
            metadata["EXT-X-KEY"] = dict(part.split('=', 1) for part in parts if '=' in part)
    return (segments, metadata)
# Walk the paginated svtplay.se video listing and build one record per <article> element.
# NOTE(review): page_num, video_num, videos_per_page and the `video` record are used below
# but are not assigned on any visible line, and no return/yield is visible either —
# presumably these live on lines not shown here; confirm before changing this function.
187 def parse_videolist():
# This first request is only used to read the number of the last page (see the inline note).
189 soup
= BeautifulSoup(requests
.get("http://www.svtplay.se/ajax/videospager").text
)#this call does not work for getting the pages, we use it for the page totals only
190 page_tot
= int(soup
.find('a',{'data-currentpage':True}
).attrs
['data-lastpage'])
# Fetch listing pages for as long as page_num has not passed the last page.
193 while(page_num
<= page_tot
):
194 base_url
= "http://www.svtplay.se/ajax/videos?sida={}".format(page_num
)
195 soup
= BeautifulSoup(requests
.get(base_url
).text
)
# Each <article> describes one video; its attributes carry the title and description.
196 for article
in soup
.findAll('article'):
197 meta
= dict(article
.attrs
)
199 video
['title'] = meta
['data-title']
200 video
['description'] = meta
['data-description']
# The link and thumbnail come from the first <a> and <img> inside the article.
201 video
['url'] = dict(article
.find('a').attrs
)['href']
202 video
['thumb-url'] = dict(article
.find('img',{}).attrs
)['src']
203 video
['num'] = video_num
# The total is computed as page count times videos_per_page rather than counted from the listing.
204 video
['total'] = page_tot
* videos_per_page
def remux(video, xml=None):
    """
    Remux the downloaded stream ``video['filename']`` into a Matroska file.

    The output goes to ``<genre>/<basename>.mkv`` when the record has a
    ``'genre'`` (the directory is created if needed), otherwise to
    ``<basename>.mkv``; the chosen path is stored in ``video['path']``.
    ``xml``, when given, is written to ``<basename>.xml`` and passed to
    mkvmerge as global tags. ``video['thumb']``, when present, is attached
    as ``thumbnail.jpg``. If the record has a ``'timestamp'``, the output
    file's access and modification times are set to it.

    The temporary xml and thumbnail files are always removed. The source
    file is removed only when mkvmerge did not report an error.
    """
    filename = video['filename']
    # Strip only a trailing '.ts'. split('.ts')[0] also cut names that
    # merely contain '.ts' somewhere in the middle.
    basename = filename[:-len('.ts')] if filename.endswith('.ts') else filename
    if 'genre' in video:
        os.makedirs(video['genre'], exist_ok=True)
        video['path'] = os.path.join(video['genre'], basename + '.mkv')
    else:
        video['path'] = basename + '.mkv'
    command = ["mkvmerge", "-o", video['path'], '--title', video['title']]

    xml_name = basename + '.xml'
    if xml:
        # Written as UTF-8 explicitly so the tags do not depend on the locale encoding.
        with open(xml_name, 'w', encoding='utf-8') as f:
            f.write(xml)
        command.extend(['--global-tags', xml_name])
    if 'thumb' in video:
        with open('thumbnail.jpg', 'wb') as f:  # FIXME use title instead for many downloaders
            f.write(video['thumb'].read())
        command.extend(['--attachment-description', "Thumbnail",
                        '--attachment-mime-type', 'image/jpeg',
                        '--attach-file', 'thumbnail.jpg'])
    command.append(filename)
    process = Popen(command, stdout=PIPE)
    print(process.communicate()[0])

    leftovers = [xml_name, 'thumbnail.jpg']
    # mkvmerge exits with 0 on success and 1 on warnings. Anything else means
    # the output may be unusable, so keep the only copy of the download.
    if process.returncode in (0, 1):
        leftovers.append(filename)
    for fname in leftovers:
        try:
            os.unlink(fname)
        except OSError:
            pass  # the file was never created or is already gone

    if 'timestamp' in video:
        stamp = video['timestamp'].timestamp()
        try:
            os.utime(video['path'], times=(stamp, stamp))
        except FileNotFoundError as e:
            print(e)
# Build a Matroska tags XML document from a video record.
# NOTE(review): several statements are not visible here — the loop that binds `key`, the
# value given to the TargetTypeValue element, the calls that attach tags/tag/targets/name to
# their parents, and the return statement. Confirm against the full file before editing.
243 def mkv_metadata(video
):
# Empty XML document with the matroskatags DOCTYPE.
244 root
= BeautifulSoup(features
='xml')
245 root
.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
246 tags
= root
.new_tag("Tags")
247 tag
= root
.new_tag("Tag")
# Names of the record fields considered for the tag file (presumably a whitelist — confirm).
250 keep
= ('title','description', 'url','genre')
251 targets
= root
.new_tag("Targets")
252 ttv
= root
.new_tag("TargetTypeValue")
# One <Simple> element per field: <Name> is the upper-cased key, <String> is the field value.
259 simple
= root
.new_tag('Simple')
260 name
= root
.new_tag('Name')
261 name
.string
=key
.upper()
263 sstring
= root
.new_tag('String')
264 sstring
.string
=video
[key
]
265 simple
.append(sstring
)
if __name__ == "__main__":
    # Command line entry point: exactly one of --rss, --url or --mirror selects the mode.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-r", "--rss", help="Download all files in rss")
    group.add_argument("-u", "--url", help="Download video in url")
    group.add_argument("-m", "--mirror", help="Mirror all files", action="store_true")
    parser.add_argument("-n", "--no_act", help="Just print what would be done, don't do any downloading.", action="store_true")
    parser.add_argument("--no_remux", help="Don't remux into mkv", action="store_true")

    args = parser.parse_args()
    if args.rss:
        d = feedparser.parse(args.rss)
        for e in d.entries:
            print(("Downloading: %s"%e.title))
            if args.no_act:
                continue
            video = scrape_player_page({'title': e.title, 'url': e.link})
            # A failed scrape returns a falsy value; there is nothing to remux then.
            if not video or args.no_remux:
                continue
            remux(video)
    elif args.mirror:
        # '.seen' holds one marker file per attempted title so reruns skip them.
        if not os.path.exists('.seen'):
            os.mkdir('.seen')
        for video in parse_videolist():
            video['title'] = video['title'].replace('/', '_')
            print(video['title']+'.mkv')
            print("{} of {}".format(video['num'], video['total']))

            if os.path.exists(os.path.join('.seen', video['title'])):
                print("Skipping")
                continue
            print("Downloading...")
            if args.no_act:
                continue
            open(os.path.join('.seen', video['title']), 'w').close()  # touch
            ret = scrape_player_page(video)
            if not ret:
                # Leave a marker in '.failed' for titles that could not be downloaded.
                if not os.path.exists('.failed'):
                    os.mkdir('.failed')
                open(os.path.join('.failed', video['title']), 'w').close()  # touch
                continue
            video = ret
            if args.no_remux:
                continue
            xml = mkv_metadata(video)
            remux(video, xml)
    else:
        if not args.no_act:
            # 'title' is passed empty so the scraper takes it from the page.
            video = scrape_player_page({'title': None, 'url': args.url})
            if video:
                if not args.no_remux:
                    # Remux the record that was just downloaded. The loop
                    # variable `e` only exists in the rss branch.
                    remux(video)
                print(("Downloaded {}".format(args.url)))