[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

This commit is contained in:
Sergey M․ 2017-04-18 22:21:38 +07:00
parent 06d0ad9a4e
commit bae1404893
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -976,6 +976,22 @@ class InfoExtractor(object):
return info return info
if isinstance(json_ld, dict): if isinstance(json_ld, dict):
json_ld = [json_ld] json_ld = [json_ld]
def extract_video_object(e):
    """Merge the metadata of a JSON-LD ``VideoObject`` node into ``info``.

    ``e`` is the decoded JSON-LD mapping for the node. Its '@type' entry
    must be 'VideoObject'; this is asserted as an internal precondition,
    so callers are expected to check the type before calling.

    The function returns nothing: it updates ``info``, a name taken from
    the enclosing scope, in place. Every key listed below is written on
    each call, including keys whose computed value is None.
    """
    assert e['@type'] == 'VideoObject'
    # Raw JSON-LD values are routed through conversion helpers defined
    # elsewhere in the module before being stored.
    video_fields = {
        'url': e.get('contentUrl'),
        'title': unescapeHTML(e.get('name')),
        'description': unescapeHTML(e.get('description')),
        # Two spellings of the thumbnail key are accepted; the
        # camel-case 'thumbnailUrl' wins when both are truthy.
        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
        'duration': parse_duration(e.get('duration')),
        'timestamp': unified_timestamp(e.get('uploadDate')),
        'filesize': float_or_none(e.get('contentSize')),
        'tbr': int_or_none(e.get('bitrate')),
        'width': int_or_none(e.get('width')),
        'height': int_or_none(e.get('height')),
    }
    info.update(video_fields)
for e in json_ld: for e in json_ld:
if e.get('@context') == 'http://schema.org': if e.get('@context') == 'http://schema.org':
item_type = e.get('@type') item_type = e.get('@type')
@ -1000,18 +1016,11 @@ class InfoExtractor(object):
'description': unescapeHTML(e.get('articleBody')), 'description': unescapeHTML(e.get('articleBody')),
}) })
elif item_type == 'VideoObject': elif item_type == 'VideoObject':
info.update({ extract_video_object(e)
'url': e.get('contentUrl'), elif item_type == 'WebPage':
'title': unescapeHTML(e.get('name')), video = e.get('video')
'description': unescapeHTML(e.get('description')), if isinstance(video, dict) and video.get('@type') == 'VideoObject':
'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), extract_video_object(video)
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
'filesize': float_or_none(e.get('contentSize')),
'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')),
'height': int_or_none(e.get('height')),
})
break break
return dict((k, v) for k, v in info.items() if v is not None) return dict((k, v) for k, v in info.items() if v is not None)