svtplay-dl/lib/svtplay_dl/service/__init__.py

# ex:ts=4:sw=4:sts=4:et
# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-
from __future__ import absolute_import
import re
from svtplay_dl.utils.urllib import urlparse
from svtplay_dl.utils import download_thumbnail, is_py2, HTTP

import logging

log = logging.getLogger('svtplay_dl')

class Service(object):
    """
    Common base for the site-specific service classes.

    A subclass advertises the hosts it supports through
    supported_domains (literal netlocs, also tried with a "www."
    prefix) and/or supported_domains_re (regular expressions that
    are matched against the netloc).
    """
    supported_domains = []
    supported_domains_re = []

    def __init__(self, options, _url):
        """Store the options and page URL and create the HTTP helper."""
        self._url = _url
        self.options = options
        self.http = HTTP(options)
        self.cookies = {}
        self.subtitle = None
        # Page body, fetched lazily by get_urldata().
        self._urldata = None
        self._error = False

    @property
    def url(self):
        """The URL this service instance was created for."""
        return self._url

    def get_urldata(self):
        """Return the text of self.url, fetching it only on first use."""
        if self._urldata is None:
            response = self.http.request("get", self.url)
            self._urldata = response.text
        return self._urldata

    @classmethod
    def handles(cls, url):
        """Tell whether this service class supports the host of url."""
        netloc = urlparse(url).netloc

        # supported_domains_re covers 'dynamic' domains, e.g. ones
        # that embed country information in the host name.
        patterns = [re.compile(expr) for expr in cls.supported_domains_re]
        if any(pattern.match(netloc) for pattern in patterns):
            return True

        # Literal match against the listed domains ...
        if netloc in cls.supported_domains:
            return True

        # ... and against each of them with a www. subdomain.
        www_variants = ['www.' + domain for domain in cls.supported_domains]
        return netloc in www_variants

    def get_subtitle(self, options):
        """Hook for subclasses; the base implementation does nothing."""
        return None

    def exclude(self, options):
        """
        Return True when any word in options.exclude occurs in
        options.output, False otherwise (also when nothing is excluded).
        """
        for word in options.exclude or ():
            # On Python 2 the words are decoded from UTF-8 before the
            # containment test.
            needle = word.decode("utf-8") if is_py2 else word
            if needle in options.output:
                return True
        return False

    # options is not used here, but it belongs to the interface that
    # subclasses implement, so it stays (hence the pylint pragma).
    def find_all_episodes(self, options):  # pylint: disable=unused-argument
        """Fallback: warn that the feature is missing and return only self.url."""
        log.warning("--all-episodes not implemented for this service")
        return [self.url]

def opengraph_get(html, prop):
    """
    Extract specified OpenGraph property from html.

    prop is the property name without the "og:" prefix and is matched
    literally. Returns the content attribute of the first matching
    <meta> tag, or None when no tag matches. Only double-quoted
    attributes with property and content directly adjacent (in either
    order) are recognised.

        >>> opengraph_get('<html><head><meta property="og:image" content="http://example.com/img.jpg"><meta ...', "image")
        'http://example.com/img.jpg'
        >>> opengraph_get('<html><head><meta content="http://example.com/img2.jpg" property="og:image"><meta ...', "image")
        'http://example.com/img2.jpg'
        >>> opengraph_get('<html><head><meta name="og:image" property="og:image" content="http://example.com/img3.jpg"><meta ...', "image")
        'http://example.com/img3.jpg'
    """
    # Escape the property name so regex metacharacters in it are not
    # interpreted as pattern syntax.
    name = re.escape(prop)
    patterns = (
        # property first, content second ...
        '<meta [^>]*property="og:' + name + '" content="([^"]*)"',
        # ... then the reverse attribute order.
        '<meta [^>]*content="([^"]*)" property="og:' + name + '"',
    )
    for pattern in patterns:
        match = re.search(pattern, html)
        if match is not None:
            return match.group(1)
    return None


class OpenGraphThumbMixin(object):
    """
    Mixin for service classes: fetch the thumbnail advertised by the
    page's OpenGraph "image" property.
    """
    def get_thumbnail(self, options):
        """Download the og:image thumbnail, if the page declares one."""
        image_url = opengraph_get(self.get_urldata(), "image")
        if image_url is not None:
            download_thumbnail(options, image_url)


class Generic(Service):
    """
    Fallback service: look for videos from known sites that are
    embedded in an otherwise unsupported page.
    """

    def _service_for(self, sites, probe_url, target_url=None):
        """
        Find the first class in sites whose handles(probe_url) is true.

        Returns (target_url, instance) where the instance is created
        with self.options and target_url, or None when no class in
        sites accepts probe_url. target_url defaults to probe_url.
        """
        if target_url is None:
            target_url = probe_url
        for site in sites:
            if site.handles(probe_url):
                return target_url, site(self.options, target_url)
        return None

    def get(self, sites):
        """
        Fetch self.url and search the page for a known embedded player.

        sites is a sequence of service classes; each one is probed
        through its handles() classmethod. The detectors are tried in
        a fixed order and the first hit wins.

        Returns a (url, service) tuple. For the first group of
        detectors url is the address of the embedded video; for the
        second group it is self.url, i.e. the service is pointed at the
        embedding page itself. When nothing is found the result is
        (self.url, None).
        """
        data = self.http.request("get", self.url).text

        # --- Embeds whose own URL is passed on to the service ---------

        match = re.search(r"src=(\"|\')(http://www.svt.se/wd[^\'\"]+)(\"|\')", data)
        if match:
            raw = match.group(2)
            # The service is probed with the URL as found in the page,
            # but constructed with the HTML entities for "&" decoded.
            decoded = raw.replace("&amp;", "&").replace("&#038;", "&")
            found = self._service_for(sites, raw, decoded)
            if found is not None:
                return found

        match = re.search(r"src=\"(http://player.vimeo.com/video/[0-9]+)\" ", data)
        if match:
            # Hand the Vimeo URL itself to the service. The previous code
            # passed a leftover/unbound local variable here.
            found = self._service_for(sites, match.group(1))
            if found is not None:
                return found

        # The video id is mandatory: an optional group would turn an
        # id-less match into "...?video_id=None".
        match = re.search(r"tv4play.se/iframe/video/(\d+)", data)
        if match:
            url = "http://www.tv4play.se/?video_id=%s" % match.group(1)
            found = self._service_for(sites, url)
            if found is not None:
                return found

        match = re.search(r"embed.bambuser.com/broadcast/(\d+)", data)
        if match:
            url = "http://bambuser.com/v/%s" % match.group(1)
            found = self._service_for(sites, url)
            if found is not None:
                return found

        match = re.search(r'src="(http://tv.aftonbladet[^"]*)"', data)
        if match:
            found = self._service_for(sites, match.group(1))
            if found is not None:
                return found

        match = re.search(r'a href="(http://tv.aftonbladet[^"]*)" class="abVi', data)
        if match:
            found = self._service_for(sites, match.group(1))
            if found is not None:
                return found

        match = re.search(r"iframe src='(http://www.svtplay[^']*)'", data)
        if match:
            found = self._service_for(sites, match.group(1))
            if found is not None:
                return found

        # --- Embeds where the service is pointed at the page itself ----
        # The matched/synthetic URL is only used to pick the service
        # class; the instance is created with self.url.

        match = re.search('src="(http://mm-resource-service.herokuapp.com[^"]*)"', data)
        if match:
            found = self._service_for(sites, match.group(1), self.url)
            if found is not None:
                return found

        match = re.search('(lemonwhale|lwcdn.com)', data)
        if match:
            found = self._service_for(sites, "http://lemonwhale.com", self.url)
            if found is not None:
                return found

        match = re.search('s.src="(https://csp-ssl.picsearch.com[^"]+|http://csp.picsearch.com/rest[^"]+)', data)
        if match:
            found = self._service_for(sites, match.group(1), self.url)
            if found is not None:
                return found

        match = re.search('(picsearch_ajax_auth|screen9-ajax-auth)', data)
        if match:
            found = self._service_for(sites, "http://csp.picsearch.com", self.url)
            if found is not None:
                return found

        match = re.search('iframe src="(//csp.screen9.com[^"]+)"', data)
        if match:
            # Protocol-relative embed: add a scheme before probing.
            found = self._service_for(sites, "http:%s" % match.group(1), self.url)
            if found is not None:
                return found

        # Nothing recognised.
        return self.url, None

def service_handler(sites, options, url):
    """
    Instantiate the first service class in sites that handles url.

    Returns that instance, created with (options, url), or None when
    no class in sites accepts the URL.
    """
    for site in sites:
        if site.handles(url):
            return site(options, url)
    return None
Add editor modelines 2013-03-02 21:26:28 +01:00			`# ex:ts=4:sw=4:sts=4:et`
			`# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-`
Use absolute_import from __future__ everywhere 2013-03-01 23:39:42 +01:00			`from __future__ import absolute_import`
Fix so embedded videos works again 2013-03-23 16:11:36 +01:00			`import re`
service: missing urlparse for py3 2014-01-05 17:28:00 +01:00			`from svtplay_dl.utils.urllib import urlparse`
adding request support. still need some more work 2015-08-30 00:06:20 +02:00			`from svtplay_dl.utils import download_thumbnail, is_py2, HTTP`
service: sort service import list 2014-08-17 10:57:08 +02:00
Add --thumbnail Names thumbnails as $basename.tbn (Hi xbmc!) 2014-01-19 14:26:48 +01:00			`import logging`

			`log = logging.getLogger('svtplay_dl')`
Fix so embedded videos works again 2013-03-23 16:11:36 +01:00
			`class Service(object):`
Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00			`supported_domains = []`
Support supported_domains regexp for services Lets services with more complex domains (like domains with language/country codes) use a regular expressions that will match the supported domains for the handles() method. 2014-01-01 15:50:47 +01:00			`supported_domains_re = []`
Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00
Move options to when we init the service class 2015-12-26 11:46:14 +01:00			`def __init__(self, options, _url):`
			`self.options = options`
Add get_urldata() method to service self.get_urldata() is eqivalent to get_http_data(self.url), but also caches the data, so no additional requests are made if it is called multiple times (e.g when grabbing title or downloading thumbnail). Generic().get(url) still causes it to be fetched an extra time. 2014-02-18 16:48:53 +01:00			`self._url = _url`
			`self._urldata = None`
Rewrite http request handling. 2014-12-08 23:07:02 +01:00			`self._error = False`
Move options to when we init the service class 2015-12-26 11:46:14 +01:00			`self.subtitle = None`
			`self.cookies = {}`
Option to not verify ssl certificate 2015-12-26 12:14:14 +01:00			`self.http = HTTP(options)`
Add get_urldata() method to service self.get_urldata() is eqivalent to get_http_data(self.url), but also caches the data, so no additional requests are made if it is called multiple times (e.g when grabbing title or downloading thumbnail). Generic().get(url) still causes it to be fetched an extra time. 2014-02-18 16:48:53 +01:00
			`@property`
			`def url(self):`
			`return self._url`

			`def get_urldata(self):`
			`if self._urldata is None:`
urldata: should be text instead of binary 2015-08-31 23:18:18 +02:00			`self._urldata = self.http.request("get", self.url).text`
adding request support. still need some more work 2015-08-30 00:06:20 +02:00			`return self._urldata`
Move url to object attribute 2014-01-06 23:14:06 +01:00
Make Service.handles a classmethod This way it can be called without instantiating the class. 2014-01-06 22:47:54 +01:00			`@classmethod`
			`def handles(cls, url):`
Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00			`urlp = urlparse(url)`

Support supported_domains regexp for services Lets services with more complex domains (like domains with language/country codes) use a regular expressions that will match the supported domains for the handles() method. 2014-01-01 15:50:47 +01:00			`# Apply supported_domains_re regexp to the netloc. This`
			`# is meant for 'dynamic' domains, e.g. containing country`
			`# information etc.`
Make Service.handles a classmethod This way it can be called without instantiating the class. 2014-01-06 22:47:54 +01:00			`for domain_re in [re.compile(x) for x in cls.supported_domains_re]:`
Support supported_domains regexp for services Lets services with more complex domains (like domains with language/country codes) use a regular expressions that will match the supported domains for the handles() method. 2014-01-01 15:50:47 +01:00			`if domain_re.match(urlp.netloc):`
			`return True`

Make Service.handles a classmethod This way it can be called without instantiating the class. 2014-01-06 22:47:54 +01:00			`if urlp.netloc in cls.supported_domains:`
Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00			`return True`

			`# For every listed domain, try with www. subdomain as well.`
Make Service.handles a classmethod This way it can be called without instantiating the class. 2014-01-06 22:47:54 +01:00			`if urlp.netloc in ['www.'+x for x in cls.supported_domains]:`
Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00			`return True`

			`return False`
Use absolute_import from __future__ everywhere 2013-03-01 23:39:42 +01:00
Split subtitle getting to separate method 2014-01-11 23:02:47 +01:00			`def get_subtitle(self, options):`
			`pass`

Support for exclude filenames with WORD in them. this fixes #190 2014-12-22 17:41:40 +01:00			`def exclude(self, options):`
service.exclude: check options.exclude is not empty 2014-12-22 18:35:58 +01:00			`if options.exclude:`
			`for i in options.exclude:`
service.exclude: don’t crash when using åäö fixes #245 2015-07-13 19:26:51 +02:00			`if is_py2:`
			`i = i.decode("utf-8")`
service.exclude: check options.exclude is not empty 2014-12-22 18:35:58 +01:00			`if i in options.output:`
			`return True`
Support for exclude filenames with WORD in them. this fixes #190 2014-12-22 17:41:40 +01:00			`return False`

service: silence unused-argument warning from pylint The options parameter is unused, but is part of the interface, so we don't want to remove it. 2014-03-19 22:57:49 +01:00			`# the options parameter is unused, but is part of the`
			`# interface, so we don't want to remove it. Thus, the`
			`# pylint ignore.`
			`def find_all_episodes(self, options): # pylint: disable-msg=unused-argument`
Add --all-episodes option (for svt only currently) 2014-02-18 18:56:28 +01:00			`log.warning("--all-episodes not implemented for this service")`
			`return [self.url]`
Add --thumbnail Names thumbnails as $basename.tbn (Hi xbmc!) 2014-01-19 14:26:48 +01:00
Split out opengraph getter to separate function ...and add doctests 2014-02-18 16:17:02 +01:00			`def opengraph_get(html, prop):`
			`"""`
			`Extract specified OpenGraph property from html.`

			`>>> opengraph_get('<html><head><meta property="og:image" content="http://example.com/img.jpg"><meta ...', "image")`
			`'http://example.com/img.jpg'`
			`>>> opengraph_get('<html><head><meta content="http://example.com/img2.jpg" property="og:image"><meta ...', "image")`
			`'http://example.com/img2.jpg'`
			`>>> opengraph_get('<html><head><meta name="og:image" property="og:image" content="http://example.com/img3.jpg"><meta ...', "image")`
			`'http://example.com/img3.jpg'`
			`"""`
			`match = re.search('<meta [^>]*property="og:' + prop + '" content="([^"]*)"', html)`
			`if match is None:`
			`match = re.search('<meta [^>]*content="([^"]*)" property="og:' + prop + '"', html)`
			`if match is None:`
			`return None`
			`return match.group(1)`


Add --thumbnail Names thumbnails as $basename.tbn (Hi xbmc!) 2014-01-19 14:26:48 +01:00			`class OpenGraphThumbMixin(object):`
			`"""`
			`Mix this into the service class to grab thumbnail from OpenGraph properties.`
			`"""`
			`def get_thumbnail(self, options):`
more requests fixes for get_urldata. dont check the first byte 2015-08-31 22:04:59 +02:00			`url = opengraph_get(self.get_urldata(), "image")`
Split out opengraph getter to separate function ...and add doctests 2014-02-18 16:17:02 +01:00			`if url is None:`
			`return`
			`download_thumbnail(options, url)`
Add --thumbnail Names thumbnails as $basename.tbn (Hi xbmc!) 2014-01-19 14:26:48 +01:00

generic: request fixes 2015-08-30 12:04:16 +02:00			`class Generic(Service):`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00			`''' Videos embed in sites '''`
generic: handle it as a regular service 2015-09-06 16:03:57 +02:00			`def get(self, sites):`
			`data = self.http.request("get", self.url).text`
generic: better way to detect embedded svtplay 2014-12-22 10:39:51 +01:00			`match = re.search(r"src=(\"|\')(http://www.svt.se/wd[^\'\"]+)(\"|\')", data)`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00			`stream = None`
			`if match:`
generic: better way to detect embedded svtplay 2014-12-22 10:39:51 +01:00			`url = match.group(2)`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00			`for i in sites:`
Rename Service's handle method to handles handle can be ambiguous (i.e, not a verb). Signed-off-by: Olof Johansson <olof@ethup.se> 2014-01-01 15:03:15 +01:00			`if i.handles(url):`
pylint fixes 2015-01-05 21:52:34 +01:00			`url = url.replace("&amp;", "&").replace("&#038;", "&")`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00
Adding missing r prefixes to regex string 2013-05-05 12:57:42 +02:00			`match = re.search(r"src=\"(http://player.vimeo.com/video/[0-9]+)\" ", data)`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00			`if match:`
			`for i in sites:`
Rename Service's handle method to handles handle can be ambiguous (i.e, not a verb). Signed-off-by: Olof Johansson <olof@ethup.se> 2014-01-01 15:03:15 +01:00			`if i.handles(match.group(1)):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return match.group(1), i(self.options, url)`
generic: detect embeded tv4play streams 2014-02-04 20:20:36 +01:00			`match = re.search(r"tv4play.se/iframe/video/(\d+)?", data)`
Generic: embeded tv4play videos 2013-03-24 14:55:14 +01:00			`if match:`
			`url = "http://www.tv4play.se/?video_id=%s" % match.group(1)`
			`for i in sites:`
Rename Service's handle method to handles handle can be ambiguous (i.e, not a verb). Signed-off-by: Olof Johansson <olof@ethup.se> 2014-01-01 15:03:15 +01:00			`if i.handles(url):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
generic: support for embed bambuser streams 2014-02-05 20:42:34 +01:00			`match = re.search(r"embed.bambuser.com/broadcast/(\d+)", data)`
			`if match:`
			`url = "http://bambuser.com/v/%s" % match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
generic: detect aftonbladet videos in iframes. 2014-12-30 21:19:14 +01:00			`match = re.search(r'src="(http://tv.aftonbladet[^"]*)"', data)`
generic: detect embeded svtplay streams 2014-02-08 22:47:27 +01:00			`if match:`
			`url = match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
generic: detect embedded aftonbladet videos 2014-08-12 19:08:08 +02:00			`match = re.search(r'a href="(http://tv.aftonbladet[^"]*)" class="abVi', data)`
			`if match:`
			`url = match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
generic: detect embedded aftonbladet videos 2014-08-12 19:08:08 +02:00
generic: detect embeded svtplay streams 2014-02-08 22:47:27 +01:00			`match = re.search(r"iframe src='(http://www.svtplay[^']*)'", data)`
aftonbladet: support for the new site 2014-02-05 23:15:19 +01:00			`if match:`
			`url = match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
generic: service need one more argument 2016-03-16 19:36:40 +01:00			`return url, i(self.options, url)`
generic: support for embed bambuser streams 2014-02-05 20:42:34 +01:00
solidtango: support for embedded and videos on solidtango.com fixes: #343 2016-02-19 21:29:49 +01:00			`match = re.search('src="(http://mm-resource-service.herokuapp.com[^"]*)"', data)`
			`if match:`
			`url = match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
			`return self.url, i(self.options, self.url)`
generic: download embedded lemonwhale videos fixes: #355 2016-03-16 20:56:20 +01:00			`match = re.search('(lemonwhale|lwcdn.com)', data)`
			`if match:`
			`url = "http://lemonwhale.com"`
			`for i in sites:`
			`if i.handles(url):`
			`return self.url, i(self.options, self.url)`
generic: download embedded picsearch videos fixes: #356 2016-03-16 22:50:43 +01:00			`match = re.search('s.src="(https://csp-ssl.picsearch.com[^"]+|http://csp.picsearch.com/rest[^"]+)', data)`
			`if match:`
			`url = match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
			`return self.url, i(self.options, self.url)`
			`match = re.search('(picsearch_ajax_auth|screen9-ajax-auth)', data)`
			`if match:`
			`url = "http://csp.picsearch.com"`
			`for i in sites:`
			`if i.handles(url):`
			`return self.url, i(self.options, self.url)`
picsearch: more ways to detect the service 2016-03-20 18:20:19 +01:00			`match = re.search('iframe src="(//csp.screen9.com[^"]+)"', data)`
			`if match:`
			`url = "http:%s" % match.group(1)`
			`for i in sites:`
			`if i.handles(url):`
			`return self.url, i(self.options, self.url)`
solidtango: support for embedded and videos on solidtango.com fixes: #343 2016-02-19 21:29:49 +01:00
generic: return sefl.url instead of url This fixes #278 2015-10-19 17:26:29 +02:00			`return self.url, stream`
Generic class a way to find embeded videos support for embeded svtplay videos. 2013-03-10 13:28:31 +01:00
Move options to when we init the service class 2015-12-26 11:46:14 +01:00			`def service_handler(sites, options, url):`
Move handler selection to svtplay.service 2013-02-28 21:44:28 +01:00			`handler = None`

			`for i in sites:`
Rename Service's handle method to handles handle can be ambiguous (i.e, not a verb). Signed-off-by: Olof Johansson <olof@ethup.se> 2014-01-01 15:03:15 +01:00			`if i.handles(url):`
Move options to when we init the service class 2015-12-26 11:46:14 +01:00			`handler = i(options, url)`
Move handler selection to svtplay.service 2013-02-28 21:44:28 +01:00			`break`

Add default handle method in Service base class The default handle method will look for a supported_domains attribute (a list), containing the supported domains. The subclassed service class can of course override this if other means of determining support is needded. 2014-01-01 14:57:17 +01:00			`return handler`