2013-09-26 13:53:57 +02:00
# encoding: utf-8
2014-01-07 05:23:20 +01:00
from __future__ import unicode_literals
2013-09-26 13:53:57 +02:00
2013-07-05 21:31:50 +02:00
import re
import json
2013-07-10 17:49:11 +02:00
import xml . etree . ElementTree
2013-07-05 21:31:50 +02:00
from . common import InfoExtractor
2013-07-10 17:49:11 +02:00
from . . utils import (
compat_urllib_parse ,
2013-07-11 16:31:29 +02:00
find_xpath_attr ,
2014-01-21 02:09:49 +01:00
fix_xml_ampersands ,
2013-07-12 14:53:28 +02:00
compat_urlparse ,
2013-11-06 16:40:24 +01:00
compat_str ,
2013-11-07 21:06:48 +01:00
compat_urllib_request ,
2014-01-21 02:09:49 +01:00
compat_parse_qs ,
2013-09-26 13:53:57 +02:00
ExtractorError ,
2014-01-07 05:34:14 +01:00
unsmuggle_url ,
2013-07-10 17:49:11 +02:00
)
2013-07-05 21:31:50 +02:00
2013-11-07 21:06:48 +01:00
2013-07-05 21:31:50 +02:00
class BrightcoveIE(InfoExtractor):
    """Extractor for videos and playlists served through Brightcove players.

    Single videos are resolved through the "htmlFederated" viewer page,
    playlists through the JSON "experience runtime" service.
    """

    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
    # The query string of the (normalized) url is substituted for %s
    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'

    _TESTS = [
        {
            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
            'file': '2371591881001.mp4',
            'md5': '5423e113865d26e40624dce2e4b45d95',
            'note': 'Test Brightcove downloads and detection in GenericIE',
            'info_dict': {
                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
                'uploader': '8TV',
                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
            }
        },
        {
            # From http://medianetwork.oracle.com/video/player/1785452137001
            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
            'file': '1785452137001.flv',
            'info_dict': {
                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
                'uploader': 'Oracle',
            },
        },
        {
            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
            'info_dict': {
                'id': '2750934548001',
                'ext': 'mp4',
                'title': 'This Bracelet Acts as a Personal Thermostat',
                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
                'uploader': 'Mashable',
            },
        },
        {
            # test that the default referer works
            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
            'info_dict': {
                'id': '2878862109001',
                'ext': 'mp4',
                'title': 'Lost in Motion II',
                'description': 'md5:363109c02998fee92ec02211bd8000df',
                'uploader': 'National Ballet of Canada',
            },
        },
        {
            # https://github.com/rg3/youtube-dl/issues/2253
            'url': 'http://v.thestar.com/services/player/bcpid2071349530001?bckey=AQ~~,AAAAuO4KaJE~,gatFNwSKdGDmDpIYqNJ-fTHn_c4z_LH_&bctid=3101154703001',
            'file': '3101154703001.mp4',
            'md5': '0ba9446db037002366bab3b3eb30c88c',
            'info_dict': {
                'title': 'Still no power',
                'uploader': 'thestar.com',
                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
            }
        }
    ]

    @classmethod
    def _build_brighcove_url(cls, object_str):
        """
        Build a Brightcove url from a xml string containing
        <object class="BrightcoveExperience">{params}</object>

        Each parameter is looked up first in the "flashVars" param (parsed
        as a query string) and then in the individual <param> elements.
        Raises ExtractorError if no playerID can be found.
        """
        # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
        object_str = re.sub(
            r'(<param name="[^"]+" value="[^"]+")>',
            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
        object_str = fix_xml_ampersands(object_str)

        object_doc = xml.etree.ElementTree.fromstring(object_str)

        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
        if fv_el is not None:
            # parse_qs yields a list per key; only the first value is kept
            flashvars = dict(
                (k, v[0])
                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
        else:
            flashvars = {}

        def find_param(name):
            """Return the value of parameter `name`, or None if absent."""
            if name in flashvars:
                return flashvars[name]
            node = find_xpath_attr(object_doc, './param', 'name', name)
            if node is not None:
                return node.attrib['value']
            return None

        params = {}

        playerID = find_param('playerID')
        if playerID is None:
            raise ExtractorError('Cannot find player ID')
        params['playerID'] = playerID

        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
            params['playerKey'] = playerKey
        # The three fields hold the id of the video
        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
        if videoPlayer is not None:
            params['@videoPlayer'] = videoPlayer
        linkBase = find_param('linkBaseURL')
        if linkBase is not None:
            params['linkBaseURL'] = linkBase
        data = compat_urllib_parse.urlencode(params)
        return cls._FEDERATED_URL_TEMPLATE % data

    @classmethod
    def _extract_brightcove_url(cls, webpage):
        """Try to extract the brightcove url from the webpage, returns None
        if it can't be found
        """
        # A direct player url in the og:video meta tag takes precedence
        url_m = re.search(
            r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"',
            webpage)
        if url_m:
            return url_m.group(1)

        # Otherwise look for an embedded <object> element, recognised either
        # by its BrightcoveExperience class or by a brightcove.com "movie" param
        m_brightcove = re.search(
            r'''(?sx)<object
            (?:
                [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?</object>''',
            webpage)
        if m_brightcove is None:
            return None
        return cls._build_brighcove_url(m_brightcove.group())

    def _real_extract(self, url):
        """Extract a single video, or a playlist when only a playerKey is given.

        Raises ExtractorError when the url query carries neither a video id
        nor a playerKey.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        # Change the 'videoId' and others field to '@videoPlayer'
        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
        # Change bckey (used by bcove.me urls) to playerKey
        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
        mobj = re.match(self._VALID_URL, url)
        query_str = mobj.group('query')
        query = compat_urlparse.parse_qs(query_str)

        videoPlayer = query.get('@videoPlayer')
        if videoPlayer:
            # We set the original url as the default 'Referer' header
            referer = smuggled_data.get('Referer', url)
            return self._get_video_info(
                videoPlayer[0], query_str, query, referer=referer)

        # parse_qs drops blank values, so a missing or empty playerKey both
        # end up here; report it properly instead of failing with a KeyError
        player_key = query.get('playerKey')
        if not player_key:
            raise ExtractorError(
                'Cannot find a video id or a playerKey in the url')
        return self._get_playlist_info(player_key[0])

    def _get_video_info(self, video_id, query_str, query, referer=None):
        """Download the federated player page and extract the video it embeds.

        `query` is the parsed form of `query_str`. A 'linkBaseURL' entry in
        it overrides `referer` as the value of the Referer header.
        """
        request_url = self._FEDERATED_URL_TEMPLATE % query_str
        req = compat_urllib_request.Request(request_url)
        linkBase = query.get('linkBaseURL')
        if linkBase is not None:
            referer = linkBase[0]
        if referer is not None:
            req.add_header('Referer', referer)
        webpage = self._download_webpage(req, video_id)

        self.report_extraction(video_id)
        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        info = json.loads(info)['data']
        video_info = info['programmedContent']['videoPlayer']['mediaDTO']
        # Stash the ad server url in the media dict so that
        # _extract_video_info can read it
        video_info['_youtubedl_adServerURL'] = info.get('adServerURL')

        return self._extract_video_info(video_info)

    def _get_playlist_info(self, player_key):
        """Download the programming of a player and return it as a playlist.

        Raises ExtractorError if the response contains no 'videoList'.
        """
        info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key
        playlist_info = self._download_webpage(
            info_url, player_key, 'Downloading playlist information')

        json_data = json.loads(playlist_info)
        if 'videoList' not in json_data:
            raise ExtractorError('Empty playlist')
        playlist_info = json_data['videoList']
        collection = playlist_info['mediaCollectionDTO']
        videos = [
            self._extract_video_info(video_info)
            for video_info in collection['videoDTOs']]

        return self.playlist_result(
            videos, playlist_id=playlist_info['id'],
            playlist_title=collection['displayName'])

    def _extract_video_info(self, video_info):
        """Turn a Brightcove media dict into an info dict.

        When the 'include_ads' downloader parameter is set and an ad server
        url was recorded, the result is a playlist of the ad followed by the
        video, or the ad alone if no video url or format is available.
        Raises ExtractorError if neither an url nor formats can be found.
        """
        info = {
            'id': compat_str(video_info['id']),
            'title': video_info['displayName'].strip(),
            'description': video_info.get('shortDescription'),
            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
            'uploader': video_info.get('publisherName'),
        }

        renditions = video_info.get('renditions')
        if renditions:
            # Formats are listed in ascending order of rendition size
            renditions = sorted(renditions, key=lambda r: r['size'])
            info['formats'] = [{
                'url': rend['defaultURL'],
                'height': rend.get('frameHeight'),
                'width': rend.get('frameWidth'),
            } for rend in renditions]
        elif video_info.get('FLVFullLengthURL') is not None:
            info.update({
                'url': video_info['FLVFullLengthURL'],
            })

        if self._downloader.params.get('include_ads', False):
            adServerURL = video_info.get('_youtubedl_adServerURL')
            if adServerURL:
                ad_info = {
                    '_type': 'url',
                    'url': adServerURL,
                }
                # A video described only by 'formats' is just as playable as
                # one with a plain 'url'; keep it next to the ad instead of
                # silently dropping it.
                if 'url' in info or info.get('formats'):
                    return {
                        '_type': 'playlist',
                        'title': info['title'],
                        'entries': [ad_info, info],
                    }
                return ad_info

        if 'url' not in info and not info.get('formats'):
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info