ytdl/youtube_dl/extractor/cspan.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    unescapeHTML,
    find_xpath_attr,
    smuggle_url,
    determine_ext,
    ExtractorError,
)
from .senateisvp import SenateISVPIE
from .ustream import UstreamIE


class CSpanIE(InfoExtractor):
    """Extractor for ``c-span.org/video/?<id>`` pages.

    A page is resolved in one of three ways, tried in this order:

    * an embedded Ustream player is delegated to ``UstreamIE``;
    * a clip/program id found in the page markup is looked up through the
      C-SPAN player endpoints and returned as a single video or a playlist
      with one entry per file part;
    * an embedded Senate ISVP iframe is delegated to the ``SenateISVP``
      extractor.
    """
    _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
    IE_DESC = 'C-SPAN'
    _TESTS = [{
        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
        'info_dict': {
            'id': '315139',
            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
        },
        'playlist_mincount': 2,
        'skip': 'Regularly fails on travis, for unknown reasons',
    }, {
        'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
        # md5 is unstable
        'info_dict': {
            'id': 'c4486943',
            'ext': 'mp4',
            'title': 'CSPAN - International Health Care Models',
            'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
        }
    }, {
        'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
        'info_dict': {
            'id': '342759',
            'title': 'General Motors Ignition Switch Recall',
        },
        'playlist_mincount': 6,
    }, {
        # Video from senate.gov
        'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'mp4',
            'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
        },
        'params': {
            'skip_download': True,  # m3u8 downloads
        }
    }, {
        # Ustream embedded video
        'url': 'https://www.c-span.org/video/?114917-1/armed-services',
        'info_dict': {
            'id': '58428542',
            'ext': 'flv',
            'title': 'USHR07 Armed Services Committee',
            'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee',
            'timestamp': 1423060374,
            'upload_date': '20150204',
            'uploader': 'HouseCommittee',
            'uploader_id': '12987475',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict (or playlist / url_result) for a C-SPAN page.

        Returns a ``url_result`` for Ustream and Senate ISVP embeds, a single
        video dict when exactly one entry is produced, and a playlist dict
        otherwise.

        Raises ExtractorError when no clip/program id can be located in the
        page, or when the player endpoint reports a status other than
        'Success'.
        """
        video_id = self._match_id(url)
        video_type = None
        webpage = self._download_webpage(url, video_id)

        # Pages that embed a Ustream player are handed over to UstreamIE.
        ustream_url = UstreamIE._extract_url(webpage)
        if ustream_url:
            return self.url_result(ustream_url, UstreamIE.ie_key())

        # The clipid pattern is tried before clipprog and the first pattern
        # that matches wins, so a clip id takes precedence when both fields
        # are present (clipprog always appears earlier in the markup, so
        # document order cannot be used to pick between them).
        patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
        results = list(filter(None, (re.search(p, webpage) for p in patterns)))
        if results:
            matches = results[0]
            video_type, video_id = matches.groups()
            video_type = 'clip' if video_type == 'id' else 'program'
        else:
            # Alternative markup: data-clipid="..." / data-progid="..."
            m = re.search(r'data-(?P<type>clip|prog)id=["\'](?P<id>\d+)', webpage)
            if m:
                video_id = m.group('id')
                video_type = 'program' if m.group('type') == 'prog' else 'clip'
            else:
                senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
                if senate_isvp_url:
                    title = self._og_search_title(webpage)
                    # The page title is smuggled along so the delegated
                    # extractor can use it.
                    surl = smuggle_url(senate_isvp_url, {'force_title': title})
                    return self.url_result(surl, 'SenateISVP', video_id, title)
        if video_type is None or video_id is None:
            raise ExtractorError('unable to find video id and type')

        def get_text_attr(d, attr):
            # Values are wrapped as {'#text': ...}; a missing or non-dict
            # node yields None instead of raising AttributeError.
            node = d.get(attr)
            return node.get('#text') if isinstance(node, dict) else None

        data = self._download_json(
            'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
            video_id)['video']
        if data['@status'] != 'Success':
            raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)

        doc = self._download_xml(
            'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
            video_id)

        description = self._html_search_meta('description', webpage)

        # find_xpath_attr returns None when the node is absent; compare with
        # None explicitly because childless Elements are falsy.
        title_el = find_xpath_attr(doc, './/string', 'name', 'title')
        title = title_el.text if title_el is not None else None
        if not title:
            # Same title source as the Senate ISVP branch above.
            title = self._og_search_title(webpage)
        thumbnail_el = find_xpath_attr(doc, './/string', 'name', 'poster')
        thumbnail = thumbnail_el.text if thumbnail_el is not None else None

        files = data['files']
        capfile = get_text_attr(data, 'capfile')

        # Clip ids are reported with a 'c' prefix.
        result_id = 'c' + video_id if video_type == 'clip' else video_id

        entries = []
        for partnum, f in enumerate(files):
            formats = []
            # 'qualities' may be missing altogether; the 'path' fallback
            # below covers that case.
            for quality in f.get('qualities') or []:
                file_url = unescapeHTML(get_text_attr(quality, 'file'))
                if not file_url:
                    # A format without a URL cannot be downloaded or sorted.
                    continue
                bitrate = get_text_attr(quality, 'bitrate')
                height = get_text_attr(quality, 'height')
                formats.append({
                    'format_id': '%s-%sp' % (bitrate, height),
                    'url': file_url,
                    'height': int_or_none(height),
                    'tbr': int_or_none(bitrate),
                })
            if not formats:
                path = unescapeHTML(get_text_attr(f, 'path'))
                if not path:
                    continue
                formats = self._extract_m3u8_formats(
                    path, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]
            self._sort_formats(formats)
            entries.append({
                'id': '%s_%d' % (video_id, partnum + 1),
                'title': (
                    title if len(files) == 1 else
                    '%s part %d' % (title, partnum + 1)),
                'formats': formats,
                'description': description,
                'thumbnail': thumbnail,
                'duration': int_or_none(get_text_attr(f, 'length')),
                'subtitles': {
                    'en': [{
                        'url': capfile,
                        'ext': determine_ext(capfile, 'dfxp')
                    }],
                } if capfile else None,
            })

        if len(entries) == 1:
            # A lone entry is returned as a plain video carrying the page id.
            entry = dict(entries[0])
            entry['id'] = result_id
            return entry

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': title,
            'id': result_id,
        }
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00			`from __future__ import unicode_literals`

Add CSpanIE (closes #312) 2013-06-26 17:55:54 +02:00			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
[cspan] Support multiple segments (Fixes #2674) 2014-04-03 05:56:28 +02:00			`int_or_none,`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00			`unescapeHTML,`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 18:24:20 +01:00			`find_xpath_attr,`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 21:18:38 +02:00			`smuggle_url,`
[cspan] Extract subtitles 2015-04-24 17:46:51 +02:00			`determine_ext,`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`ExtractorError,`
Add CSpanIE (closes #312) 2013-06-26 17:55:54 +02:00			`)`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 21:18:38 +02:00			`from .senateisvp import SenateISVPIE`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 15:11:43 +01:00			`from .ustream import UstreamIE`
Add CSpanIE (closes #312) 2013-06-26 17:55:54 +02:00
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00
Add CSpanIE (closes #312) 2013-06-26 17:55:54 +02:00			`class CSpanIE(InfoExtractor):`
Add support for https for all extractors as preventive and future-proof measure 2016-03-21 16:36:32 +01:00			`_VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00			`IE_DESC = 'C-SPAN'`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`_TESTS = [{`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 18:24:20 +01:00			`'url': 'http://www.c-span.org/video/?313572-1/HolderonV',`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`'md5': '94b29a4f131ff03d23471dd6f60b6a1d',`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00			`'info_dict': {`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 18:24:20 +01:00			`'id': '315139',`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00			`'title': 'Attorney General Eric Holder on Voting Rights Act Decision',`
Move tests to the IE definitions 2013-06-27 20:46:46 +02:00			`},`
[cspan] Fix _TESTS 2017-01-20 15:25:20 +01:00			`'playlist_mincount': 2,`
[cspan] Disable test It works fine from all my machines, no matter where, but from travis, we get lots of 403s. Maybe another project is scraping CSPAN from travis and they're blocking the travis machines? 2014-01-22 15:10:00 +01:00			`'skip': 'Regularly fails on travis, for unknown reasons',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`}, {`
			`'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',`
[cspan] Fix _TESTS 2017-01-20 15:25:20 +01:00			`# md5 is unstable`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`'info_dict': {`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`'id': 'c4486943',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`'ext': 'mp4',`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`'title': 'CSPAN - International Health Care Models',`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',`
			`}`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 2014-08-28 00:58:24 +02:00			`}, {`
			`'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',`
			`'info_dict': {`
			`'id': '342759',`
			`'title': 'General Motors Ignition Switch Recall',`
			`},`
[cspan] Fix _TESTS 2017-01-20 15:25:20 +01:00			`'playlist_mincount': 6,`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 21:18:38 +02:00			`}, {`
			`# Video from senate.gov`
			`'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',`
			`'info_dict': {`
			`'id': 'judiciary031715',`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 23:39:39 +02:00			`'ext': 'mp4',`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 21:18:38 +02:00			`'title': 'Immigration Reforms Needed to Protect Skilled American Workers',`
Fix unit tests for m3u8 and RTSP extractors that require ffmpeg or mplayer 2016-07-07 23:39:39 +02:00			`},`
			`'params': {`
			`'skip_download': True, # m3u8 downloads`
[CSpan] Add detection for Senate ISVP. Closes #5302 2015-04-20 21:18:38 +02:00			`}`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 15:11:43 +01:00			`}, {`
			`# Ustream embedded video`
			`'url': 'https://www.c-span.org/video/?114917-1/armed-services',`
			`'info_dict': {`
			`'id': '58428542',`
			`'ext': 'flv',`
			`'title': 'USHR07 Armed Services Committee',`
			`'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee',`
			`'timestamp': 1423060374,`
			`'upload_date': '20150204',`
			`'uploader': 'HouseCommittee',`
			`'uploader_id': '12987475',`
			`},`
[cspan] Add support for newer videos (Fixes #2577) 2014-03-21 02:10:24 +01:00			`}]`
Add CSpanIE (closes #312) 2013-06-26 17:55:54 +02:00
			`def _real_extract(self, url):`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`video_id = self._match_id(url)`
[cspan] Initialize 'video_type' to avoid 'UnboundLocalError' exceptions (#8032) 2015-12-28 13:06:30 +01:00			`video_type = None`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`webpage = self._download_webpage(url, video_id)`
[cspan] Support Ustream embedded videos Closes #11547 2017-01-20 15:11:43 +01:00
			`ustream_url = UstreamIE._extract_url(webpage)`
			`if ustream_url:`
			`return self.url_result(ustream_url, UstreamIE.ie_key())`

[cspan] Fix extraction (fixes #8032) 2015-12-28 13:48:10 +01:00			`# We first look for clipid, because clipprog always appears before`
			`patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]`
			`results = list(filter(None, (re.search(p, webpage) for p in patterns)))`
			`if results:`
			`matches = results[0]`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`video_type, video_id = matches.groups()`
[cspan] Fix extraction (fixes #8032) 2015-12-28 13:48:10 +01:00			`video_type = 'clip' if video_type == 'id' else 'program'`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`else:`
[cspan] Fix clip/prog id extraction (#8317) 2016-01-26 15:42:20 +01:00			`m = re.search(r'data-(?P<type>clip\|prog)id=["\'](?P<id>\d+)', webpage)`
			`if m:`
			`video_id = m.group('id')`
			`video_type = 'program' if m.group('type') == 'prog' else 'clip'`
			`else:`
			`senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)`
			`if senate_isvp_url:`
			`title = self._og_search_title(webpage)`
			`surl = smuggle_url(senate_isvp_url, {'force_title': title})`
			`return self.url_result(surl, 'SenateISVP', video_id, title)`
[cspan] Initialize 'video_type' to avoid 'UnboundLocalError' exceptions (#8032) 2015-12-28 13:06:30 +01:00			`if video_type is None or video_id is None:`
			`raise ExtractorError('unable to find video id and type')`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00
[cspan] change into a function 2015-11-28 20:22:31 +01:00			`def get_text_attr(d, attr):`
			`return d.get(attr, {}).get('#text')`

[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`data = self._download_json(`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),`
			`video_id)['video']`
			`if data['@status'] != 'Success':`
[cspan] change into a function 2015-11-28 20:22:31 +01:00			`raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)`
[cspan] Use HTTP download (Fixes #2098) 2014-01-05 04:30:00 +01:00
[cspan] Support multiple segments (Fixes #2674) 2014-04-03 05:56:28 +02:00			`doc = self._download_xml(`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 18:24:20 +01:00			`video_id)`

[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`description = self._html_search_meta('description', webpage)`

[cspan] Support multiple segments (Fixes #2674) 2014-04-03 05:56:28 +02:00			`title = find_xpath_attr(doc, './/string', 'name', 'title').text`
			`thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text`

[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`files = data['files']`
[cspan] change into a function 2015-11-28 20:22:31 +01:00			`capfile = get_text_attr(data, 'capfile')`
[cspan] Support multiple segments (Fixes #2674) 2014-04-03 05:56:28 +02:00
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`entries = []`
			`for partnum, f in enumerate(files):`
			`formats = []`
			`for quality in f['qualities']:`
			`formats.append({`
[cspan] change into a function 2015-11-28 20:22:31 +01:00			`'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),`
			`'url': unescapeHTML(get_text_attr(quality, 'file')),`
			`'height': int_or_none(get_text_attr(quality, 'height')),`
			`'tbr': int_or_none(get_text_attr(quality, 'bitrate')),`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`})`
[cspan] Extract from path when no qualities (Closes #8317) 2016-01-26 16:29:42 +01:00			`if not formats:`
[cspan] Unescape path (Closes #8365) 2016-01-29 19:26:33 +01:00			`path = unescapeHTML(get_text_attr(f, 'path'))`
[cspan] Extract from path when no qualities (Closes #8317) 2016-01-26 16:29:42 +01:00			`if not path:`
			`continue`
			`formats = self._extract_m3u8_formats(`
			`path, video_id, 'mp4', entry_protocol='m3u8_native',`
			`m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`self._sort_formats(formats)`
			`entries.append({`
			`'id': '%s_%d' % (video_id, partnum + 1),`
			`'title': (`
			`title if len(files) == 1 else`
			`'%s part %d' % (title, partnum + 1)),`
			`'formats': formats,`
			`'description': description,`
			`'thumbnail': thumbnail,`
[cspan] change into a function 2015-11-28 20:22:31 +01:00			`'duration': int_or_none(get_text_attr(f, 'length')),`
[cspan] handle error massages and extract qualities 2015-10-17 22:30:38 +02:00			`'subtitles': {`
			`'en': [{`
			`'url': capfile,`
			`'ext': determine_ext(capfile, 'dfxp')`
			`}],`
			`} if capfile else None,`
			`})`
[cspan] Fix extraction (fixes #2291) The webpage urls have changed. The title and thumbnail are now extracted from an xml. 2014-02-02 18:24:20 +01:00
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 21:30:54 +02:00			`if len(entries) == 1:`
			`entry = dict(entries[0])`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`entry['id'] = 'c' + video_id if video_type == 'clip' else video_id`
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 21:30:54 +02:00			`return entry`
			`else:`
			`return {`
			`'_type': 'playlist',`
			`'entries': entries,`
			`'title': title,`
[cspan] correct the clip info extraction 2015-10-03 20:28:48 +02:00			`'id': 'c' + video_id if video_type == 'clip' else video_id,`
[CSpan] Fix test cases CSpan_1 and CSpan_2 2015-04-20 21:30:54 +02:00			`}`