1
0
mirror of https://github.com/spaam/svtplay-dl.git synced 2024-11-27 21:54:17 +01:00

Improve title to filename conversion.

Perform Unicode NFD decomposition on the string before stripping non-ASCII chars.

This means that chars like 'ü' are decomposed into 'u' and '¨', which means that
when non-ASCII chars are dropped we will still keep the 'u'.

Fixes #60
This commit is contained in:
Anders Waldenborg 2014-01-05 17:02:48 +01:00
parent 712901cc8e
commit d6ab73679e
2 changed files with 27 additions and 10 deletions

View File

@ -8,7 +8,7 @@ import logging
from optparse import OptionParser
from svtplay_dl.log import log
from svtplay_dl.utils import get_http_data, is_py3, is_py2, decode_html_entities
from svtplay_dl.utils import get_http_data, decode_html_entities, filenamify
from svtplay_dl.service import service_handler, Generic
@ -64,16 +64,11 @@ def get_media(url, options):
match = re.search(r"(?i)<title[^>]*>\s*(.*?)\s*</title>", data, re.S)
if match:
title_tag = decode_html_entities(match.group(1))
if is_py3:
title = re.sub(r'[^\w\s-]', '', title_tag).strip().lower()
tmp = re.sub(r'[-\s]+', '-', title)
if not options.output:
options.output = filenamify(title_tag)
else:
title = unicode(re.sub(r'[^\w\s-]', '', title_tag).strip().lower())
tmp = unicode(re.sub(r'[-\s]+', '-', title))
if options.output and os.path.isdir(options.output):
options.output += "/%s" % tmp
else:
options.output = tmp
# output is a directory
os.path.join(options.output, filenamify(title_tag))
stream.get(options, url)

View File

@ -8,6 +8,7 @@ import re
import xml.etree.ElementTree as ET
import json
import time
import unicodedata
try:
import HTMLParser
except ImportError:
@ -306,3 +307,24 @@ def decode_html_entities(s):
def unesc(m):
return parser.unescape(m.group())
return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
def filenamify(title):
    """
    Convert a string to something suitable as a file name.

    The title is first passed through ensure_unicode(), then decomposed
    with Unicode NFD so that accented letters keep their base letter once
    the non-ASCII marks are removed. Everything except ASCII letters,
    digits, whitespace and dashes is dropped, the result is lowercased, and
    each run of whitespace and/or dashes is collapsed into a single dash.

    Example: u"Höst\\tpremiär 2014!" becomes u"host-premiar-2014".

    Returns a string consisting only of the characters a-z, 0-9 and '-'.
    """
    # ensure it is unicode
    title = ensure_unicode(title)

    # NFD decomposes chars into base char and diacritical mark, which
    # means that we will get base char when we strip out non-ascii.
    title = unicodedata.normalize('NFD', title)

    # Drop anything that is not an ASCII letter/digit, whitespace or dash.
    # All whitespace (not only the plain space) must survive this step:
    # tabs and newlines inside a title separate words, and removing them
    # here would glue the neighbouring words together before the final
    # substitution gets a chance to turn them into dashes.
    title = re.sub(r'[^a-zA-Z0-9\s-]', '', title)

    # Drop any leading/trailing whitespace that may have appeared
    title = title.strip()

    # Lowercase
    title = title.lower()

    # Replace each run of whitespace and/or dashes with a single dash
    title = re.sub(r'[-\s]+', '-', title)

    return title