mirror of
https://github.com/spaam/svtplay-dl.git
synced 2024-11-27 21:54:17 +01:00
55 lines
1.4 KiB
Python
55 lines
1.4 KiB
Python
from __future__ import absolute_import
|
|
import html.parser as HTMLParser
|
|
import re
|
|
import unicodedata
|
|
|
|
|
|
def ensure_unicode(s):
    """
    Return ``s`` as a unicode (text) string.

    A byte string is assumed to be UTF-8 and is decoded, with any
    undecodable bytes substituted by the replacement character. Every
    other value is handed back unchanged.
    """
    # Anything that is not raw bytes needs no decoding.
    if not isinstance(s, bytes):
        return s
    return s.decode('utf-8', 'replace')
|
|
|
|
|
|
def decode_html_entities(s):
    """
    Replace HTML entities with the characters they represent.

    Byte strings are first decoded as UTF-8. Only spans that start with
    ``&`` and run up to the next ``;`` are handed to the unescaper; the
    rest of the text is copied through as-is.

    >>> print(decode_html_entities("&lt;3 &amp;"))
    <3 &
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
    # Python 3.9; html.unescape() is the supported replacement and is
    # what the old method delegated to.
    from html import unescape

    def unesc(m):
        # Decode a single "&...;" span found by the pattern below.
        return unescape(m.group())

    return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
|
|
|
|
|
|
def filenamify(title):
    """
    Convert a string to something suitable as a file name. E.g.

     Matlagning del 1 av 10 - Räksmörgås | SVT Play
       ->  matlagning.del.1.av.10-raksmorgas.svt.play

    Byte strings are decoded as UTF-8 first. The result contains only
    lowercase ascii letters, digits, dots and hyphens.
    """
    # ensure it is unicode
    title = ensure_unicode(title)

    # NFD decomposes chars into base char and diacritical mark, which
    # means that we will get base char when we strip out non-ascii.
    title = unicodedata.normalize('NFD', title)

    # Convert to lowercase and drop everything that is not an ascii
    # letter/digit, a space, a dot or a hyphen.
    title = re.sub(r'[^a-z0-9 .-]', '', title.lower())

    # Strip only after filtering: removing a leading/trailing symbol can
    # expose whitespace, which would otherwise end up as a leading or
    # trailing dot in the file name.
    title = title.strip()

    # Replace whitespace with dot
    title = re.sub(r'\s+', '.', title)

    # A free-standing hyphen ("a - b" -> "a.-.b") is tightened to "a-b".
    title = re.sub(r'\.-\.', '-', title)

    return title
|