Improve title to filename conversion.

Performs Unicode NFD decomposition on the string before stripping non-ASCII chars. This means that a char like 'ü' is decomposed into 'u' and a combining '¨', so when non-ASCII chars are dropped we still keep the 'u'. Fixes #60
2024-11-27 21:54:17 +01:00 · 2014-01-05 17:02:48 +01:00 · 2014-01-05 17:02:48 +01:00 · d6ab73679e
commit d6ab73679e
parent 712901cc8e
2 changed files with 27 additions and 10 deletions
--- a/lib/svtplay_dl/init.py
+++ b/lib/svtplay_dl/init.py
@ -8,7 +8,7 @@ import logging
 from optparse import OptionParser

 from svtplay_dl.log import log
-from svtplay_dl.utils import get_http_data, is_py3, is_py2, decode_html_entities
+from svtplay_dl.utils import get_http_data, decode_html_entities, filenamify
 from svtplay_dl.service import service_handler, Generic


@ -64,16 +64,11 @@ def get_media(url, options):
        match = re.search(r"(?i)<title[^>]*>\s*(.*?)\s*</title>", data, re.S)
        if match:
            title_tag = decode_html_entities(match.group(1))
-            if is_py3:
-                title = re.sub(r'[^\w\s-]', '', title_tag).strip().lower()
-                tmp = re.sub(r'[-\s]+', '-', title)
+            if not options.output:
+                options.output = filenamify(title_tag)
            else:
-                title = unicode(re.sub(r'[^\w\s-]', '', title_tag).strip().lower())
-                tmp = unicode(re.sub(r'[-\s]+', '-', title))
-            if options.output and os.path.isdir(options.output):
-                options.output += "/%s" % tmp
-            else:
-                options.output = tmp
+                # output is a directory
+                os.path.join(options.output, filenamify(title_tag))

    stream.get(options, url)

--- a/lib/svtplay_dl/utils/init.py
+++ b/lib/svtplay_dl/utils/init.py
@ -8,6 +8,7 @@ import re
 import xml.etree.ElementTree as ET
 import json
 import time
+import unicodedata
 try:
    import HTMLParser
 except ImportError:
@ -306,3 +307,24 @@ def decode_html_entities(s):
    def unesc(m):
        return parser.unescape(m.group())
    return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
+
+def filenamify(title):
+    """
+    Convert a string to something suitable as a file name.
+    """
+    # ensure it is unicode
+    title = ensure_unicode(title)
+
+    # NFD decomposes chars into base char and diacritical mark, which means that we will get base char when we strip out non-ascii.
+    title = unicodedata.normalize('NFD', title)
+
+    # Drop any non ascii letters/digits
+    title = re.sub(r'[^a-zA-Z0-9 -]', '', title)
+    # Drop any leading/trailing whitespace that may have appeared
+    title = title.strip()
+    # Lowercase
+    title = title.lower()
+    # Replace whitespace with dash
+    title = re.sub(r'[-\s]+', '-', title)
+
+    return title