[utils] Support TTML without default namespace

Strictly speaking, TTML without a default namespace is invalid, but Yahoo uses it, so it is supported here.
This commit is contained in:
Yen Chi Hsuan 2015-05-19 00:45:01 +08:00
parent 2aa64b89b3
commit 1b0427e6c4
2 changed files with 21 additions and 3 deletions

View File

@ -621,6 +621,21 @@ Line
''' '''
self.assertEqual(dfxp2srt(dfxp_data), srt_data) self.assertEqual(dfxp2srt(dfxp_data), srt_data)
dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
<tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
<body>
<div xml:lang="en">
<p begin="0" end="1">The first line</p>
</div>
</body>
</tt>'''
srt_data = '''1
00:00:00,000 --> 00:00:01,000
The first line
'''
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -1848,9 +1848,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text) out = str_or_empty(node.text)
for child in node: for child in node:
if child.tag == _x('ttml:br'): if child.tag in (_x('ttml:br'), 'br'):
out += '\n' + str_or_empty(child.tail) out += '\n' + str_or_empty(child.tail)
elif child.tag == _x('ttml:span'): elif child.tag in (_x('ttml:span'), 'span'):
out += str_or_empty(parse_node(child)) out += str_or_empty(parse_node(child))
else: else:
out += str_or_empty(xml.etree.ElementTree.tostring(child)) out += str_or_empty(xml.etree.ElementTree.tostring(child))
@ -1859,7 +1859,10 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = [] out = []
paras = dfxp.findall(_x('.//ttml:p')) paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)): for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib['begin']) begin_time = parse_dfxp_time_expr(para.attrib['begin'])