ytdl/youtube_dl/extractor/mixcloud.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    unified_strdate,
    compat_urllib_parse,
    ExtractorError,
)


class MixcloudIE(InfoExtractor):
    """Extractor for single cloudcasts at mixcloud.com/<uploader>/<name>.

    Metadata is read from the JSON document at api.mixcloud.com, while the
    audio URL is derived from the preview URL embedded in the web page and
    then verified by probing numbered ``stream<N>`` variants of it.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
    IE_NAME = 'mixcloud'

    _TEST = {
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach-cryptkeeper',
            'ext': 'mp3',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'upload_date': '20111115',
        },
    }

    def check_urls(self, url_list):
        """Return the first URL of ``url_list`` that can be requested.

        Candidates are tried in iteration order; a candidate whose request
        raises ExtractorError is skipped.  Returns None when no candidate
        succeeds (including when ``url_list`` is empty).
        """
        for url in url_list:
            try:
                # We only want to know whether the request succeeds;
                # don't download the whole file
                self._request_webpage(url, None, False)
            except ExtractorError:
                continue
            return url

        return None

    def _get_url(self, template_url, server_count=30):
        """Return the first working URL generated from ``template_url``.

        ``template_url`` is a %-format string with a single ``%d``
        placeholder, which is filled with 0 .. ``server_count`` - 1 in turn.
        Returns None when none of the generated URLs can be requested.
        """
        return self.check_urls(template_url % i for i in range(server_count))

    def _real_extract(self, url):
        """Build the info dictionary for the cloudcast at ``url``.

        Raises ExtractorError when no working audio URL (mp3 or m4a) can be
        found.  Apart from the title, metadata missing from the API response
        is reported as None instead of aborting the extraction.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group(1)
        cloudcast_name = mobj.group(2)
        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))

        webpage = self._download_webpage(url, track_id)

        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
        info = self._download_json(
            api_url, track_id, 'Downloading cloudcast info')

        preview_url = self._search_regex(
            r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url')
        song_url = preview_url.replace('/previews/', '/c/originals/')
        # The result is used as a %-format template, so any literal percent
        # sign in the URL (e.g. from percent-encoding) must be doubled first;
        # otherwise ``template_url % i`` would fail on it.
        template_url = re.sub(
            r'(stream\d*)', 'stream%d', song_url.replace('%', '%%'))
        final_song_url = self._get_url(template_url)
        if final_song_url is None:
            # Fall back to the m4a variant of the same track.
            self.to_screen('Trying with m4a extension')
            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
            final_song_url = self._get_url(template_url)
        if final_song_url is None:
            raise ExtractorError('Unable to extract track url')

        # Everything below except the title is optional metadata: a missing
        # field must not discard an otherwise successful extraction.
        pictures = info.get('pictures') or {}
        user = info.get('user') or {}
        created_time = info.get('created_time')

        return {
            'id': track_id,
            'title': info['name'],
            'url': final_song_url,
            'description': info.get('description'),
            'thumbnail': pictures.get('extra_large'),
            'uploader': user.get('name'),
            'uploader_id': user.get('username'),
            'upload_date': unified_strdate(created_time) if created_time else None,
            'view_count': info.get('play_count'),
        }
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`from __future__ import unicode_literals`

Move MixCloud into its own file 2013-06-23 21:59:15 +02:00			`import re`

			`from .common import InfoExtractor`
			`from ..utils import (`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`unified_strdate,`
[mixcloud] Unquote the track id (#2462) 2014-02-27 18:58:09 +01:00			`compat_urllib_parse,`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-08 22:24:55 +01:00			`ExtractorError,`
Move MixCloud into its own file 2013-06-23 21:59:15 +02:00			`)`


			`class MixcloudIE(InfoExtractor):`
[mixcloud] Fix _VALID_RE (fixes #2462) Accept any character except `/` for uploader and the name, caused problems with non ASCII characters 2014-02-26 00:04:03 +01:00			`_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'`
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`IE_NAME = 'mixcloud'`
Move MixCloud into its own file 2013-06-23 21:59:15 +02:00
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`_TEST = {`
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',`
			`'info_dict': {`
[mixcloud] Unquote the track id (#2462) 2014-02-27 18:58:09 +01:00			`'id': 'dholbach-cryptkeeper',`
			`'ext': 'mp3',`
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`'title': 'Cryptkeeper',`
			`'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',`
			`'uploader': 'Daniel Holbach',`
			`'uploader_id': 'dholbach',`
			`'upload_date': '20111115',`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`},`
			`}`
Move MixCloud into its own file 2013-06-23 21:59:15 +02:00
			`def check_urls(self, url_list):`
			`"""Returns 1st active url from list"""`
			`for url in url_list:`
			`try:`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-08 22:24:55 +01:00			`# We only want to know if the request succeed`
			`# don't download the whole file`
			`self._request_webpage(url, None, False)`
Move MixCloud into its own file 2013-06-23 21:59:15 +02:00			`return url`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-08 22:24:55 +01:00			`except ExtractorError:`
Move MixCloud into its own file 2013-06-23 21:59:15 +02:00			`url = None`

			`return None`

[mixcloud] Try to get the m4a url if the mp3 url fails to download (fixes #1939) 2013-12-10 13:42:41 +01:00			`def _get_url(self, template_url):`
			`return self.check_urls(template_url % i for i in range(30))`

Move MixCloud into its own file 2013-06-23 21:59:15 +02:00			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`uploader = mobj.group(1)`
			`cloudcast_name = mobj.group(2)`
[mixcloud] Unquote the track id (#2462) 2014-02-27 18:58:09 +01:00			`track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))`
[mixcloud] Fix URL extraction 2014-01-17 04:05:15 +01:00
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`webpage = self._download_webpage(url, track_id)`

[mixcloud] Fix URL extraction 2014-01-17 04:05:15 +01:00			`api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)`
			`info = self._download_json(`
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`api_url, track_id, 'Downloading cloudcast info')`
[mixcloud] Fix URL extraction 2014-01-17 04:05:15 +01:00
			`preview_url = self._search_regex(`
[mixcloud] Use unicode_literals 2014-01-17 04:06:18 +01:00			`r'\s(?:data-preview-url\|m-preview)="(.+?)"', webpage, 'preview url')`
[mixcloud] Fix track url transformation (fixes #2068) ‘/previews/‘ must be replaced with ‘/c/originals/‘ now. 2014-01-01 21:07:55 +01:00			`song_url = preview_url.replace('/previews/', '/c/originals/')`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)`
[mixcloud] Try to get the m4a url if the mp3 url fails to download (fixes #1939) 2013-12-10 13:42:41 +01:00			`final_song_url = self._get_url(template_url)`
			`if final_song_url is None:`
			`self.to_screen('Trying with m4a extension')`
			`template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')`
			`final_song_url = self._get_url(template_url)`
			`if final_song_url is None:`
			`raise ExtractorError(u'Unable to extract track url')`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00
			`return {`
			`'id': track_id,`
			`'title': info['name'],`
			`'url': final_song_url,`
[mixcloud] The description field may be missing (fixes #1819) 2013-11-24 11:28:44 +01:00			`'description': info.get('description'),`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 14:26:42 +02:00			`'thumbnail': info['pictures'].get('extra_large'),`
			`'uploader': info['user']['name'],`
			`'uploader_id': info['user']['username'],`
			`'upload_date': unified_strdate(info['created_time']),`
			`'view_count': info['play_count'],`
			`}`