mirror of
https://github.com/spaam/svtplay-dl.git
synced 2024-11-27 21:54:17 +01:00
Improve title to filename conversion.
Performs Unicode NFD decomposition on the string before stripping non-ASCII chars. This means that chars like 'ü' are decomposed into 'u' and '¨', which means that when non-ASCII chars are dropped we still keep the 'u'. Fixes #60
This commit is contained in:
parent
712901cc8e
commit
d6ab73679e
@ -8,7 +8,7 @@ import logging
|
||||
from optparse import OptionParser
|
||||
|
||||
from svtplay_dl.log import log
|
||||
from svtplay_dl.utils import get_http_data, is_py3, is_py2, decode_html_entities
|
||||
from svtplay_dl.utils import get_http_data, decode_html_entities, filenamify
|
||||
from svtplay_dl.service import service_handler, Generic
|
||||
|
||||
|
||||
@ -64,16 +64,11 @@ def get_media(url, options):
|
||||
match = re.search(r"(?i)<title[^>]*>\s*(.*?)\s*</title>", data, re.S)
|
||||
if match:
|
||||
title_tag = decode_html_entities(match.group(1))
|
||||
if is_py3:
|
||||
title = re.sub(r'[^\w\s-]', '', title_tag).strip().lower()
|
||||
tmp = re.sub(r'[-\s]+', '-', title)
|
||||
if not options.output:
|
||||
options.output = filenamify(title_tag)
|
||||
else:
|
||||
title = unicode(re.sub(r'[^\w\s-]', '', title_tag).strip().lower())
|
||||
tmp = unicode(re.sub(r'[-\s]+', '-', title))
|
||||
if options.output and os.path.isdir(options.output):
|
||||
options.output += "/%s" % tmp
|
||||
else:
|
||||
options.output = tmp
|
||||
# output is a directory
|
||||
os.path.join(options.output, filenamify(title_tag))
|
||||
|
||||
stream.get(options, url)
|
||||
|
||||
|
@ -8,6 +8,7 @@ import re
|
||||
import xml.etree.ElementTree as ET
|
||||
import json
|
||||
import time
|
||||
import unicodedata
|
||||
try:
|
||||
import HTMLParser
|
||||
except ImportError:
|
||||
@ -306,3 +307,24 @@ def decode_html_entities(s):
|
||||
def unesc(m):
|
||||
return parser.unescape(m.group())
|
||||
return re.sub(r'(&[^;]+;)', unesc, ensure_unicode(s))
|
||||
|
||||
def filenamify(title):
    """
    Convert a string to something suitable as a file name.

    The title is decomposed with unicode NFD so that accented letters
    keep their ascii base letter ('ü' becomes 'u'). Everything except
    ascii letters, digits, dashes and whitespace is then dropped, the
    result is lowercased, and every run of whitespace and/or dashes is
    collapsed into a single dash.

    title: the text to convert; it is passed through ensure_unicode()
    first.

    Returns the converted string, which is empty when the title holds
    no usable characters.
    """
    # ensure it is unicode
    title = ensure_unicode(title)

    # NFD decomposes chars into base char and diacritical mark, which
    # means that we will get base char when we strip out non-ascii.
    title = unicodedata.normalize('NFD', title)

    # Drop anything that is not an ascii letter/digit, a dash or
    # whitespace. Tabs and newlines must survive this step: they
    # separate words and are turned into dashes below, so dropping
    # them here would glue the neighbouring words together.
    title = re.sub(r'[^a-zA-Z0-9\s-]', '', title)
    # Drop any leading/trailing whitespace that may have appeared
    title = title.strip()
    # Lowercase
    title = title.lower()
    # Replace each run of whitespace and/or dashes with a single dash
    title = re.sub(r'[-\s]+', '-', title)

    return title
|
||||
|
Loading…
Reference in New Issue
Block a user