from __future__ import absolute_import
import html.parser as HTMLParser
import re
import unicodedata
def ensure_unicode(s):
    """
    Return *s* as a unicode (text) string.

    A ``bytes`` value is assumed to hold UTF-8 and is decoded, with
    undecodable bytes substituted by the replacement character rather
    than raising. Any other value is handed back unchanged.
    """
    return s.decode('utf-8', 'replace') if isinstance(s, bytes) else s
def decode_html_entities(s):
    """
    Replaces html entities with the character they represent.

    Accepts either a unicode string or UTF-8 encoded bytes and always
    returns a unicode string.

        >>> print(decode_html_entities("&lt;3 &amp;"))
        <3 &
    """
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
    # 3.9; html.unescape() is the function it delegated to. Imported
    # locally because the module only imports the html.parser submodule
    # under an alias, which does not bind the name `html`.
    import html

    def unesc(m):
        return html.unescape(m.group())

    # Only spans of the form "&...;" are handed to the unescaper; text
    # outside such spans is left exactly as it was.
    return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
def filenamify(title):
    """
    Convert a string to something suitable as a file name. E.g.

     Matlagning del 1 av 10 - Räksmörgås | SVT Play
       ->  matlagning.del.1.av.10-raksmorgas.svt.play

    Accepts a unicode string or UTF-8 encoded bytes and returns a
    lowercase unicode string containing only ``a-z``, ``0-9``, ``.``
    and ``-``.
    """
    # ensure it is unicode
    title = ensure_unicode(title)

    # NFD decomposes chars into base char and diacritical mark, which
    # means that we will get base char when we strip out non-ascii.
    title = unicodedata.normalize('NFD', title)

    # Convert to lowercase and drop everything that is not an ascii
    # letter/digit, space, dot or hyphen.
    title = re.sub(r'[^a-z0-9 .-]', '', title.lower())

    # Strip only *after* filtering: removing a leading/trailing symbol
    # (e.g. "| foo |") can expose whitespace at the edges, which would
    # otherwise turn into a leading/trailing dot below.
    title = title.strip()

    # Replace whitespace with dot
    title = re.sub(r'\s+', '.', title)

    # A hyphen that was surrounded by whitespace is now ".-."; collapse
    # it to a plain hyphen.
    title = re.sub(r'\.-\.', '-', title)

    return title