svtplay-dl/lib/svtplay_dl/subtitle/__init__.py

import xml.etree.ElementTree as ET
import json
import re
import os
from svtplay_dl.log import log
from svtplay_dl.utils import is_py2, is_py3, get_http_data

class subtitle(object):
    """Download one subtitle track and convert it to SubRip (SRT) text.

    ``subtype`` selects the converter applied to the downloaded payload;
    the supported values are ``"tt"``, ``"json"``, ``"sami"``, ``"smi"``
    and ``"wrst"``.
    """

    def __init__(self, options, subtype, url):
        self.url = url
        self.subtitle = None
        self.options = options
        self.subtype = subtype

    def download(self):
        """Fetch ``self.url``, convert the payload and hand it to ``save``.

        Logs an error and returns without writing anything when the
        download fails or when ``self.subtype`` has no converter.
        """
        error, subdata = get_http_data(self.url, cookiejar=self.options.cookies)
        if error:
            log.error("Can't download subtitle")
            return

        converters = {
            "tt": self.tt,
            "json": self.json,
            "sami": self.sami,
            "smi": self.smi,
            "wrst": self.wrst,
        }
        converter = converters.get(self.subtype)
        if converter is None:
            # Previously this fell through and save() tried to write None.
            log.error("Unsupported subtitle format: %s", self.subtype)
            return

        save(self.options, converter(subdata))

    @staticmethod
    def _tt_end_time(begin, duration):
        """Return ``begin + duration`` as an ``HH:MM:SS.mmm`` clock string.

        Both arguments are ``H:M:S[.fraction]`` strings. Only the first
        three colon-separated fields are used.
        """
        def to_millis(clock):
            parts = clock.split(":")
            seconds = int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
            return int(round(seconds * 1000))

        # Integer milliseconds so that seconds carry into minutes/hours
        # and no "60.000" can appear through float rounding.
        total = to_millis(begin) + to_millis(duration)
        hours, rest = divmod(total, 3600000)
        minutes, rest = divmod(rest, 60000)
        seconds, millis = divmod(rest, 1000)
        return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)

    def tt(self, subdata):
        """Convert a TTAF1 (``http://www.w3.org/2006/10/ttaf1``) document.

        Each ``<p>`` in ``body/div`` becomes one numbered cue. The end time
        is taken from the ``end`` attribute when present, otherwise it is
        computed from ``begin`` plus ``dur`` (or ``duration``).
        """
        ns = "{http://www.w3.org/2006/10/ttaf1}"
        i = 1
        data = ""
        tree = ET.ElementTree(ET.fromstring(subdata))
        xml = tree.find(ns + "body").find(ns + "div")
        for node in xml.findall(ns + "p"):
            tag = norm(node.tag)
            if tag == "p" or tag == "span":
                begin = node.attrib["begin"]
                if "end" in node.attrib:
                    end = node.attrib["end"]
                else:
                    # The duration is only needed (and only required to
                    # exist) when there is no explicit end time.
                    if "dur" in node.attrib:
                        duration = node.attrib["dur"]
                    else:
                        duration = node.attrib["duration"]
                    end = self._tt_end_time(begin, duration)
                data += '%s\n%s --> %s\n' % (i, begin.replace(".", ","), end.replace(".", ","))
                data = tt_text(node, data)
                data += "\n"
                i += 1

        if is_py2:
            data = data.encode('utf8')
        return data

    def json(self, subdata):
        """Convert a JSON list of cues to SRT.

        Every entry is read through its ``startMillis``, ``endMillis`` and
        ``text`` keys.
        """
        data = json.loads(subdata)
        number = 1
        subs = ""
        for i in data:
            subs += "%s\n%s --> %s\n" % (number, timestr(int(i["startMillis"])), timestr(int(i["endMillis"])))
            if is_py2:
                subs += "%s\n\n" % i["text"].encode("utf-8")
            else:
                subs += "%s\n\n" % i["text"]
            number += 1

        return subs

    def sami(self, subdata):
        """Convert an XML document whose ``Font`` element holds ``Subtitle`` cues.

        A ``Subtitle`` element contributes the cue header (``SpotNumber``,
        ``TimeIn``, ``TimeOut``); every other element found after the first
        cue contributes its text as one line.
        """
        tree = ET.XML(subdata)
        subt = tree.find("Font")
        subs = ""
        n = 0
        # Element.getiterator() was removed in Python 3.9; fall back to it
        # only on interpreters that predate Element.iter().
        walk = subt.iter if hasattr(subt, "iter") else subt.getiterator
        for i in walk():
            if i.tag == "Subtitle":
                n = i.attrib["SpotNumber"]
                # Cues are separated by a blank line; the first has none before it.
                separator = "" if n == "1" else "\n"
                subs += "%s%s\n%s --> %s\n" % (separator, n, timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))
            elif int(n) > 0 and i.text is not None:
                # Elements without text used to be written out as "None".
                subs += "%s\n" % i.text

        if is_py2:
            subs = subs.encode('utf8')
        return subs

    def smi(self, subdata):
        """Convert SMI ``<SYNC Start=...>`` markup to SRT.

        A cue is the text between one ``SYNC`` marker and the next; the two
        ``Start`` values are passed to ``timestr`` for the begin and end
        times.
        """
        if is_py3:
            subdata = subdata.decode("latin1")
        cue_re = re.compile(r'<SYNC Start=(\d+)>\s+<P Class=\w+>(.*)\s+<SYNC Start=(\d+)>\s+<P Class=\w+>', re.M|re.I|re.U)
        number = 1
        subs = ""
        TAG_RE = re.compile(r'<[^>]+>')
        bad_char = re.compile(r'\x96')
        for i in cue_re.finditer(subdata):
            subs += "%s\n%s --> %s\n" % (number, timestr(i.group(1)), timestr(i.group(3)))
            text = "%s\n\n" % TAG_RE.sub('', i.group(2).replace("<br>", "\n"))
            # Drop a leading newline so the text follows the timing line directly.
            if text[0] == "\x0a":
                text = text[1:]
            subs += text
            number += 1
        carriage_return = re.compile(r'\r')
        # Strip carriage returns, map \x96 to "-" and unescape &quot;.
        text = bad_char.sub('-', carriage_return.sub('', subs)).replace('&quot;', '"')
        return text

    def wrst(self, subdata):
        """Convert CRLF-delimited numbered cues with ``.`` fractions to SRT.

        When the first cue starts at hour 10 or later, ten hours are
        subtracted from every timestamp. Markup tags are stripped from the
        cue text. Numeric lines that are not followed by a parsable timing
        line are skipped.
        """
        cue_re = re.compile(r"(\d+)\r\n([\d:\.]+ --> [\d:\.]+)?([^\r\n]+)?\r\n([^\r\n]+)\r\n(([^\r\n]*)\r\n)?")
        timing_re = re.compile(r'(\d+):(\d+):([\d\.]+) --> (\d+):(\d+):([\d\.]+)')
        srt = ""
        subtract = False
        for i in cue_re.finditer(subdata):
            number = int(i.group(1))
            timing = i.group(2)
            if timing is None:
                # A number-only line that is not a cue header.
                continue
            match = timing_re.search(timing)
            if match is None:
                continue
            hour1 = int(match.group(1))
            hour2 = int(match.group(4))
            if number == 1:
                if hour1 > 9:
                    subtract = True
            if subtract:
                hour1 -= 10
                hour2 -= 10
            # %02d keeps the two-digit hour field that int() would otherwise drop.
            time = "%02d:%s:%s --> %02d:%s:%s" % (hour1, match.group(2), match.group(3).replace(".", ","), hour2, match.group(5), match.group(6).replace(".", ","))
            sub = "%s\n%s\n%s\n" % (i.group(1), time, i.group(4))
            # group(6) is None when the final cue has no trailing CRLF.
            if i.group(6):
                sub += "%s\n" % i.group(6)
            sub += "\n"
            sub = re.sub('<[^>]*>', '', sub)
            srt += sub

        return srt

def save(options, data):
    """Write converted subtitle text next to the output file as ``.srt``.

    ``options.output`` is rewritten in place: a trailing two- or
    three-character lowercase/digit extension is replaced by ``.srt``,
    otherwise ``.srt`` is appended. An existing file is left alone unless
    ``options.force`` is set. Nothing is written when ``data`` is ``None``.
    """
    filename = re.search(r"(.*)\.[a-z0-9]{2,3}$", options.output)
    if filename:
        options.output = "%s.srt" % filename.group(1)
    else:
        options.output = "%s.srt" % options.output

    log.info("Subtitle: %s", options.output)
    if os.path.isfile(options.output) and not options.force:
        log.info("File already exists. use --force to overwrite")
        return
    if data is None:
        # Checked before open() so a failed conversion does not truncate
        # an existing file.
        log.error("No subtitle data to save")
        return
    # The context manager closes the file even if write() raises.
    with open(options.output, "w") as fd:
        fd.write(data)

def timestr(msec):
    """
    Convert a millisecond value to a string of the following
    format:

        HH:MM:SS,mmm

    with millisecond precision, which is the timestamp layout SRT
    uses. Note the , separator in the seconds.

    ``msec`` may be a number or a numeric string.
    """
    # Integer arithmetic so the seconds field can never round up to 60.
    total = int(round(float(msec)))

    hours, rest = divmod(total, 3600000)
    minutes, rest = divmod(rest, 60000)
    seconds, millis = divmod(rest, 1000)

    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)

def timecolon(data):
    """Turn ``HH:MM:SS:NN`` into ``HH:MM:SS,NN``.

    The first ``digits:digits:digits:digits`` run found in ``data`` is
    returned with its last colon replaced by a comma. Input without such
    a run (for example a value that already uses a comma) is returned
    unchanged instead of raising.
    """
    match = re.search(r"(\d+:\d+:\d+):(\d+)", data)
    if match is None:
        return data
    return "%s,%s" % (match.group(1), match.group(2))

def norm(name):
    """Strip a leading ``{namespace}`` prefix from an element tag name.

    Names that do not start with ``{`` are returned as they are.
    """
    if name[0] != "{":
        return name
    # Drop the opening brace, then separate the namespace from the local tag.
    _namespace, local = name[1:].split("}")
    return local

def tt_text(node, data):
    """Append the text lines of ``node`` to ``data`` and return the result.

    The lines are, in document order, the node's own text followed by the
    text and tail of each direct child. Each piece is stripped of
    surrounding whitespace and written on its own line. Pieces that are
    empty after stripping are skipped, because a blank line would end the
    SRT cue early.
    """
    pieces = [node.text]
    for child in node:
        pieces.append(child.text)
        pieces.append(child.tail)

    for piece in pieces:
        if not piece:
            continue
        text = piece.strip(' \t\n\r')
        if text:
            data += "%s\n" % text
    return data
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`import xml.etree.ElementTree as ET`
			`import json`
			`import re`
subtitle: show a warning if subtitle exists. 2014-08-23 13:33:38 +02:00			`import os`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`from svtplay_dl.log import log`
Rewrite http request handling. 2014-12-08 23:07:02 +01:00			`from svtplay_dl.utils import is_py2, is_py3, get_http_data`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: inherit from object 2014-07-28 15:53:23 +02:00			`class subtitle(object):`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def __init__(self, options, subtype, url):`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`self.url = url`
			`self.subtitle = None`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`self.options = options`
			`self.subtype = subtype`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def download(self):`
Rewrite http request handling. 2014-12-08 23:07:02 +01:00			`error, subdata = get_http_data(self.url, cookiejar=self.options.cookies)`
			`if error:`
			`log.error("Can't download subtitle")`
			`return`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00
			`data = None`
			`if self.subtype == "tt":`
			`data = self.tt(subdata)`
			`if self.subtype == "json":`
			`data = self.json(subdata)`
			`if self.subtype == "sami":`
			`data = self.sami(subdata)`
			`if self.subtype == "smi":`
			`data = self.smi(subdata)`
			`if self.subtype == "wrst":`
			`data = self.wrst(subdata)`

			`save(self.options, data)`

			`def tt(self, subdata):`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`i = 1`
			`data = ""`
subtitle: catch http errors. 2014-08-31 00:56:51 +02:00			`tree = ET.ElementTree(ET.fromstring(subdata))`
subtitle_tt: rewrote the function. this fixes #111 2014-07-09 18:39:18 +02:00			`xml = tree.find("{http://www.w3.org/2006/10/ttaf1}body").find("{http://www.w3.org/2006/10/ttaf1}div")`
			`plist = list(xml.findall("{http://www.w3.org/2006/10/ttaf1}p"))`
			`for node in plist:`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`tag = norm(node.tag)`
subtitle_tt: rewrote the function. this fixes #111 2014-07-09 18:39:18 +02:00			`if tag == "p" or tag == "span":`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`begin = node.attrib["begin"]`
			`if not ("dur" in node.attrib):`
			`duration = node.attrib["duration"]`
			`else:`
			`duration = node.attrib["dur"]`
			`if not ("end" in node.attrib):`
			`begin2 = begin.split(":")`
			`duration2 = duration.split(":")`
			`sec = float(begin2[2]) + float(duration2[2])`
			`end = "%02d:%02d:%06.3f" % (int(begin[0]), int(begin[1]), sec)`
			`else:`
			`end = node.attrib["end"]`
pylint fixes 2014-12-26 02:04:29 +01:00			`data += '%s\n%s --> %s\n' % (i, begin.replace(".", ","), end.replace(".", ","))`
subtitle_tt: rewrote the function. this fixes #111 2014-07-09 18:39:18 +02:00			`data = tt_text(node, data)`
			`data += "\n"`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`i += 1`
subtitle: we only need one save function. 2014-04-27 15:33:05 +02:00
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`if is_py2:`
			`data = data.encode('utf8')`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`return data`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def json(self, subdata):`
subtitle: catch http errors. 2014-08-31 00:56:51 +02:00			`data = json.loads(subdata)`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`number = 1`
			`subs = ""`
			`for i in data:`
			`subs += "%s\n%s --> %s\n" % (number, timestr(int(i["startMillis"])), timestr(int(i["endMillis"])))`
subtitle_json: only convert to utf8 on py2 2014-11-25 19:02:50 +01:00			`if is_py2:`
			`subs += "%s\n\n" % i["text"].encode("utf-8")`
			`else:`
			`subs += "%s\n\n" % i["text"]`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`number += 1`

subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`return subs`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def sami(self, subdata):`
subtitle: catch http errors. 2014-08-31 00:56:51 +02:00			`tree = ET.XML(subdata)`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`subt = tree.find("Font")`
			`subs = ""`
			`n = 0`
			`for i in subt.getiterator():`
			`if i.tag == "Subtitle":`
			`n = i.attrib["SpotNumber"]`
sami: they started to use : instead of , 2014-12-15 22:19:58 +01:00
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`if i.attrib["SpotNumber"] == "1":`
sami: they started to use : instead of , 2014-12-15 22:19:58 +01:00			`subs += "%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`else:`
sami: they started to use : instead of , 2014-12-15 22:19:58 +01:00			`subs += "\n%s\n%s --> %s\n" % (i.attrib["SpotNumber"], timecolon(i.attrib["TimeIn"]), timecolon(i.attrib["TimeOut"]))`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`else:`
			`if int(n) > 0:`
			`subs += "%s\n" % i.text`

			`if is_py2:`
			`subs = subs.encode('utf8')`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`return subs`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def smi(self, subdata):`
subtitle_smi: Empty subtitles. this fixes #180 2014-11-23 13:02:14 +01:00			`if is_py3:`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`subdata = subdata.decode("latin1")`
smi: broken regex is broken. 2014-12-15 22:06:24 +01:00			`recomp = re.compile(r'<SYNC Start=(\d+)>\s+<P Class=\w+>(.*)\s+<SYNC Start=(\d+)>\s+<P Class=\w+>', re.M\|re.I\|re.U)`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`number = 1`
			`subs = ""`
subtitle_smi: Empty subtitles. this fixes #180 2014-11-23 13:02:14 +01:00			`TAG_RE = re.compile(r'<[^>]+>')`
			`bad_char = re.compile(r'\x96')`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`for i in recomp.finditer(subdata):`
smi: broken regex is broken. 2014-12-15 22:06:24 +01:00			`subs += "%s\n%s --> %s\n" % (number, timestr(i.group(1)), timestr(i.group(3)))`
			`text = "%s\n\n" % TAG_RE.sub('', i.group(2).replace("<br>", "\n"))`
subtitle_smi: Empty subtitles. this fixes #180 2014-11-23 13:02:14 +01:00			`if text[0] == "\x0a":`
			`text = text[1:]`
			`subs += text`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`number += 1`
subtitle_smi: Empty subtitles. this fixes #180 2014-11-23 13:02:14 +01:00			`recomp = re.compile(r'\r')`
			`text = bad_char.sub('-', recomp.sub('', subs)).replace('"', '"')`
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`return text`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`def wrst(self, subdata):`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`recomp = re.compile(r"(\d+)\r\n([\d:\.]+ --> [\d:\.]+)?([^\r\n]+)?\r\n([^\r\n]+)\r\n(([^\r\n]*)\r\n)?")`
			`srt = ""`
subtitle_wsrt: subtract 10 hours when it begins at >9 hours. This fixes #101 2014-06-07 18:50:51 +02:00			`subtract = False`
subtitle: catch http errors. 2014-08-31 00:56:51 +02:00			`for i in recomp.finditer(subdata):`
subtitle_wsrt: subtract 10 hours when it begins at >9 hours. This fixes #101 2014-06-07 18:50:51 +02:00			`number = int(i.group(1))`
			`match = re.search(r'(\d+):(\d+):([\d\.]+) --> (\d+):(\d+):([\d\.]+)', i.group(2))`
			`hour1 = int(match.group(1))`
			`hour2 = int(match.group(4))`
			`if number == 1:`
			`if hour1 > 9:`
subtitle_wrt: set the right variable to True. 2014-06-07 18:54:51 +02:00			`subtract = True`
subtitle_wsrt: subtract 10 hours when it begins at >9 hours. This fixes #101 2014-06-07 18:50:51 +02:00			`if subtract:`
			`hour1 -= 10`
			`hour2 -= 10`
subtitle_wsrt: it should be --> and not -> 2014-06-07 21:56:28 +02:00			`time = "%s:%s:%s --> %s:%s:%s" % (hour1, match.group(2), match.group(3).replace(".", ","), hour2, match.group(5), match.group(6).replace(".", ","))`
subtitle_wsrt: subtract 10 hours when it begins at >9 hours. This fixes #101 2014-06-07 18:50:51 +02:00			`sub = "%s\n%s\n%s\n" % (i.group(1), time, i.group(4))`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`if len(i.group(6)) > 0:`
			`sub += "%s\n" % i.group(6)`
			`sub += "\n"`
			`sub = re.sub('<[^>]*>', '', sub)`
			`srt += sub`
subtitle: we only need one save function. 2014-04-27 15:33:05 +02:00
subtitle: refactor so we can reuse the try-except-thing 2014-08-31 01:20:36 +02:00			`return srt`
subtitle: we only need one save function. 2014-04-27 15:33:05 +02:00
			`def save(options, data):`
			`filename = re.search(r"(.*)\.[a-z0-9]{2,3}$", options.output)`
			`if filename:`
			`options.output = "%s.srt" % filename.group(1)`
			`else:`
			`options.output = "%s.srt" % options.output`

			`log.info("Subtitle: %s", options.output)`
subtitle: show a warning if subtitle exists. 2014-08-23 13:33:38 +02:00			`if os.path.isfile(options.output) and not options.force:`
			`log.info("File already exists. use --force to overwrite")`
			`return`
subtitle: we only need one save function. 2014-04-27 15:33:05 +02:00			`fd = open(options.output, "w")`
			`fd.write(data)`
			`fd.close()`
Move subtitles into its own file 2014-04-21 19:52:09 +02:00
			`def timestr(msec):`
			`"""`
			`Convert a millisecond value to a string of the following`
			`format:`

			`HH:MM:SS,SS`

			`with 10 millisecond precision. Note the , seperator in`
			`the seconds.`
			`"""`
			`sec = float(msec) / 1000`

			`hours = int(sec / 3600)`
			`sec -= hours * 3600`

			`minutes = int(sec / 60)`
			`sec -= minutes * 60`

			`output = "%02d:%02d:%05.2f" % (hours, minutes, sec)`
			`return output.replace(".", ",")`

sami: they started to use : instead of , 2014-12-15 22:19:58 +01:00			`def timecolon(data):`
missing r prefix in regex matches. 2014-12-22 10:20:37 +01:00			`match = re.search(r"(\d+:\d+:\d+):(\d+)", data)`
sami: they started to use : instead of , 2014-12-15 22:19:58 +01:00			`return "%s,%s" % (match.group(1), match.group(2))`

Move subtitles into its own file 2014-04-21 19:52:09 +02:00			`def norm(name):`
			`if name[0] == "{":`
			`_, tag = name[1:].split("}")`
			`return tag`
			`else:`
			`return name`
subtitle_tt: rewrote the function. this fixes #111 2014-07-09 18:39:18 +02:00
			`def tt_text(node, data):`
			`if node.text:`
			`data += "%s\n" % node.text.strip(' \t\n\r')`
			`for i in node:`
			`if i.text:`
			`data += "%s\n" % i.text.strip(' \t\n\r')`
			`if i.tail:`
			`text = i.tail.strip(' \t\n\r')`
			`if text:`
			`data += "%s\n" % text`
			`return data`