[outtmpl] Format type U
for unicode normalization
This commit is contained in:
parent
f440b14f87
commit
524e2e4fda
4 changed files with 27 additions and 16 deletions
|
@ -964,6 +964,7 @@ The field names themselves (the part inside the parenthesis) can also have some
|
||||||
1. **Alternatives**: Alternate fields can be specified separated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s`
|
1. **Alternatives**: Alternate fields can be specified separated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s`
|
||||||
1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s`
|
1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s`
|
||||||
1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q` can be used for converting to **B**ytes, **j**son, a comma-separated **l**ist and a string **q**uoted for the terminal respectively
|
1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q` can be used for converting to **B**ytes, **j**son, a comma-separated **l**ist and a string **q**uoted for the terminal respectively
|
||||||
|
1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize). The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. Eg: `%(title)+.100U` is NFKC
|
||||||
|
|
||||||
To summarize, the general syntax for a field is:
|
To summarize, the general syntax for a field is:
|
||||||
```
|
```
|
||||||
|
|
|
@ -649,7 +649,7 @@ class TestYoutubeDL(unittest.TestCase):
|
||||||
'title2': '%PATH%',
|
'title2': '%PATH%',
|
||||||
'title3': 'foo/bar\\test',
|
'title3': 'foo/bar\\test',
|
||||||
'title4': 'foo "bar" test',
|
'title4': 'foo "bar" test',
|
||||||
'title5': 'áéí',
|
'title5': 'áéí 𝐀',
|
||||||
'timestamp': 1618488000,
|
'timestamp': 1618488000,
|
||||||
'duration': 100000,
|
'duration': 100000,
|
||||||
'playlist_index': 1,
|
'playlist_index': 1,
|
||||||
|
@ -769,6 +769,10 @@ class TestYoutubeDL(unittest.TestCase):
|
||||||
test('%(formats.:.id) 15l', ' id1, id2, id3')
|
test('%(formats.:.id) 15l', ' id1, id2, id3')
|
||||||
test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS))))
|
test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS))))
|
||||||
test('%(title5).3B', 'á')
|
test('%(title5).3B', 'á')
|
||||||
|
test('%(title5)U', 'áéí 𝐀')
|
||||||
|
test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀')
|
||||||
|
test('%(title5)+U', 'áéí A')
|
||||||
|
test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A')
|
||||||
if compat_os_name == 'nt':
|
if compat_os_name == 'nt':
|
||||||
test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'"))
|
test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'"))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -25,6 +25,7 @@ import time
|
||||||
import tokenize
|
import tokenize
|
||||||
import traceback
|
import traceback
|
||||||
import random
|
import random
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
from string import ascii_letters
|
from string import ascii_letters
|
||||||
|
|
||||||
|
@ -908,7 +909,7 @@ class YoutubeDL(object):
|
||||||
def validate_outtmpl(cls, outtmpl):
|
def validate_outtmpl(cls, outtmpl):
|
||||||
''' @return None or Exception object '''
|
''' @return None or Exception object '''
|
||||||
outtmpl = re.sub(
|
outtmpl = re.sub(
|
||||||
STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqB]'),
|
STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
|
||||||
lambda mobj: f'{mobj.group(0)[:-1]}s',
|
lambda mobj: f'{mobj.group(0)[:-1]}s',
|
||||||
cls._outtmpl_expandpath(outtmpl))
|
cls._outtmpl_expandpath(outtmpl))
|
||||||
try:
|
try:
|
||||||
|
@ -940,7 +941,7 @@ class YoutubeDL(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
TMPL_DICT = {}
|
TMPL_DICT = {}
|
||||||
EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqB]'))
|
EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
|
||||||
MATH_FUNCTIONS = {
|
MATH_FUNCTIONS = {
|
||||||
'+': float.__add__,
|
'+': float.__add__,
|
||||||
'-': float.__sub__,
|
'-': float.__sub__,
|
||||||
|
@ -1031,21 +1032,26 @@ class YoutubeDL(object):
|
||||||
value = default if value is None else value
|
value = default if value is None else value
|
||||||
|
|
||||||
str_fmt = f'{fmt[:-1]}s'
|
str_fmt = f'{fmt[:-1]}s'
|
||||||
if fmt[-1] == 'l':
|
if fmt[-1] == 'l': # list
|
||||||
value, fmt = ', '.join(variadic(value)), str_fmt
|
value, fmt = ', '.join(variadic(value)), str_fmt
|
||||||
elif fmt[-1] == 'j':
|
elif fmt[-1] == 'j': # json
|
||||||
value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
|
value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt
|
||||||
elif fmt[-1] == 'q':
|
elif fmt[-1] == 'q': # quoted
|
||||||
value, fmt = compat_shlex_quote(str(value)), str_fmt
|
value, fmt = compat_shlex_quote(str(value)), str_fmt
|
||||||
elif fmt[-1] == 'B':
|
elif fmt[-1] == 'B': # bytes
|
||||||
value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
|
value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8')
|
||||||
value, fmt = value.decode('utf-8', 'ignore'), 's'
|
value, fmt = value.decode('utf-8', 'ignore'), 's'
|
||||||
|
elif fmt[-1] == 'U': # unicode normalized
|
||||||
|
opts = outer_mobj.group('conversion') or ''
|
||||||
|
value, fmt = unicodedata.normalize(
|
||||||
|
# "+" = compatibility equivalence, "#" = NFD
|
||||||
|
'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'),
|
||||||
|
value), str_fmt
|
||||||
elif fmt[-1] == 'c':
|
elif fmt[-1] == 'c':
|
||||||
value = str(value)
|
if value:
|
||||||
if value is None:
|
value = str(value)[0]
|
||||||
value, fmt = default, 's'
|
|
||||||
else:
|
else:
|
||||||
value = value[0]
|
fmt = str_fmt
|
||||||
elif fmt[-1] not in 'rs': # numeric
|
elif fmt[-1] not in 'rs': # numeric
|
||||||
value = float_or_none(value)
|
value = float_or_none(value)
|
||||||
if value is None:
|
if value is None:
|
||||||
|
|
|
@ -4474,12 +4474,12 @@ OUTTMPL_TYPES = {
|
||||||
STR_FORMAT_RE_TMPL = r'''(?x)
|
STR_FORMAT_RE_TMPL = r'''(?x)
|
||||||
(?<!%)(?P<prefix>(?:%%)*)
|
(?<!%)(?P<prefix>(?:%%)*)
|
||||||
%
|
%
|
||||||
(?P<has_key>\((?P<key>{0})\))? # mapping key
|
(?P<has_key>\((?P<key>{0})\))?
|
||||||
(?P<format>
|
(?P<format>
|
||||||
(?:[#0\-+ ]+)? # conversion flags (optional)
|
(?P<conversion>[#0\-+ ]+)?
|
||||||
(?:\d+)? # minimum field width (optional)
|
(?P<min_width>\d+)?
|
||||||
(?:\.\d+)? # precision (optional)
|
(?P<precision>\.\d+)?
|
||||||
[hlL]? # length modifier (optional)
|
(?P<len_mod>[hlL])? # unused in python
|
||||||
{1} # conversion type
|
{1} # conversion type
|
||||||
)
|
)
|
||||||
'''
|
'''
|
||||||
|
|
Loading…
Reference in a new issue