Add option --parse-metadata
* The fields extracted by this can be used in `--output` * Deprecated `--metadata-from-title` :ci skip dl
This commit is contained in:
parent
9882064024
commit
5bfa486205
8 changed files with 162 additions and 110 deletions
25
README.md
25
README.md
|
@ -610,16 +610,19 @@ Then simply type this
|
|||
--no-embed-thumbnail Do not embed thumbnail (default)
|
||||
--add-metadata Write metadata to the video file
|
||||
--no-add-metadata Do not write metadata (default)
|
||||
--metadata-from-title FORMAT Parse additional metadata like song title /
|
||||
artist from the video title. The format
|
||||
syntax is the same as --output. Regular
|
||||
expression with named capture groups may
|
||||
also be used. The parsed parameters replace
|
||||
existing values. Example: --metadata-from-
|
||||
title "%(artist)s - %(title)s" matches a
|
||||
--parse-metadata FIELD:FORMAT Parse additional metadata like title/artist
|
||||
from other fields. Give field name to
|
||||
extract data from, and format of the field
|
||||
seperated by a ":". The format syntax is
|
||||
the same as --output. Regular expression
|
||||
with named capture groups may also be used.
|
||||
The parsed parameters replace existing
|
||||
values. This option can be used multiple
|
||||
times. Example: --parse-metadata
|
||||
"title:%(artist)s - %(title)s" matches a
|
||||
title like "Coldplay - Paradise". Example
|
||||
(regex): --metadata-from-title
|
||||
"(?P<artist>.+?) - (?P<title>.+)"
|
||||
(regex): --parse-metadata
|
||||
"description:Artist - (?P<artist>.+?)"
|
||||
--xattrs Write metadata to the video file's xattrs
|
||||
(using dublin core and xdg standards)
|
||||
--fixup POLICY Automatically correct known faults of the
|
||||
|
@ -1098,7 +1101,7 @@ $ youtube-dlc -S '+res:480,codec,br'
|
|||
|
||||
Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
|
||||
|
||||
**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
|
||||
**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
|
||||
|
||||
# MORE
|
||||
For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
|
||||
For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)
|
||||
|
|
|
@ -8,10 +8,16 @@ import sys
|
|||
import unittest
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from youtube_dlc.postprocessor import MetadataFromTitlePP
|
||||
from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
|
||||
|
||||
|
||||
class TestMetadataFromField(unittest.TestCase):
|
||||
def test_format_to_regex(self):
|
||||
pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
|
||||
self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
|
||||
|
||||
|
||||
class TestMetadataFromTitle(unittest.TestCase):
|
||||
def test_format_to_regex(self):
|
||||
pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
|
||||
self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
|
||||
self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
|
||||
|
|
|
@ -375,8 +375,7 @@ class YoutubeDL(object):
|
|||
|
||||
params = None
|
||||
_ies = []
|
||||
_pps = []
|
||||
_pps_end = []
|
||||
_pps = {'beforedl': [], 'aftermove': [], 'normal': []}
|
||||
__prepare_filename_warned = False
|
||||
_download_retcode = None
|
||||
_num_downloads = None
|
||||
|
@ -390,8 +389,7 @@ class YoutubeDL(object):
|
|||
params = {}
|
||||
self._ies = []
|
||||
self._ies_instances = {}
|
||||
self._pps = []
|
||||
self._pps_end = []
|
||||
self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
|
||||
self.__prepare_filename_warned = False
|
||||
self._post_hooks = []
|
||||
self._progress_hooks = []
|
||||
|
@ -494,11 +492,13 @@ class YoutubeDL(object):
|
|||
pp_class = get_postprocessor(pp_def_raw['key'])
|
||||
pp_def = dict(pp_def_raw)
|
||||
del pp_def['key']
|
||||
after_move = pp_def.get('_after_move', False)
|
||||
if '_after_move' in pp_def:
|
||||
del pp_def['_after_move']
|
||||
if 'when' in pp_def:
|
||||
when = pp_def['when']
|
||||
del pp_def['when']
|
||||
else:
|
||||
when = 'normal'
|
||||
pp = pp_class(self, **compat_kwargs(pp_def))
|
||||
self.add_post_processor(pp, after_move=after_move)
|
||||
self.add_post_processor(pp, when=when)
|
||||
|
||||
for ph in self.params.get('post_hooks', []):
|
||||
self.add_post_hook(ph)
|
||||
|
@ -550,12 +550,9 @@ class YoutubeDL(object):
|
|||
for ie in gen_extractor_classes():
|
||||
self.add_info_extractor(ie)
|
||||
|
||||
def add_post_processor(self, pp, after_move=False):
|
||||
def add_post_processor(self, pp, when='normal'):
|
||||
"""Add a PostProcessor object to the end of the chain."""
|
||||
if after_move:
|
||||
self._pps_end.append(pp)
|
||||
else:
|
||||
self._pps.append(pp)
|
||||
self._pps[when].append(pp)
|
||||
pp.set_downloader(self)
|
||||
|
||||
def add_post_hook(self, ph):
|
||||
|
@ -1948,6 +1945,8 @@ class YoutubeDL(object):
|
|||
|
||||
self._num_downloads += 1
|
||||
|
||||
info_dict = self.pre_process(info_dict)
|
||||
|
||||
filename = self.prepare_filename(info_dict, warn=True)
|
||||
info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
|
||||
temp_filename = self.prepare_filepath(filename, 'temp')
|
||||
|
@ -2400,41 +2399,45 @@ class YoutubeDL(object):
|
|||
(k, v) for k, v in info_dict.items()
|
||||
if k not in ['requested_formats', 'requested_subtitles'])
|
||||
|
||||
def run_pp(self, pp, infodict, files_to_move={}):
|
||||
files_to_delete = []
|
||||
try:
|
||||
files_to_delete, infodict = pp.run(infodict)
|
||||
except PostProcessingError as e:
|
||||
self.report_error(e.msg)
|
||||
if not files_to_delete:
|
||||
return files_to_move, infodict
|
||||
|
||||
if self.params.get('keepvideo', False):
|
||||
for f in files_to_delete:
|
||||
files_to_move.setdefault(f, '')
|
||||
else:
|
||||
for old_filename in set(files_to_delete):
|
||||
self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
|
||||
try:
|
||||
os.remove(encodeFilename(old_filename))
|
||||
except (IOError, OSError):
|
||||
self.report_warning('Unable to remove downloaded original file')
|
||||
if old_filename in files_to_move:
|
||||
del files_to_move[old_filename]
|
||||
return files_to_move, infodict
|
||||
|
||||
def pre_process(self, ie_info):
|
||||
info = dict(ie_info)
|
||||
for pp in self._pps['beforedl']:
|
||||
info = self.run_pp(pp, info)[1]
|
||||
return info
|
||||
|
||||
def post_process(self, filename, ie_info, files_to_move={}):
|
||||
"""Run all the postprocessors on the given file."""
|
||||
info = dict(ie_info)
|
||||
info['filepath'] = filename
|
||||
|
||||
def run_pp(pp):
|
||||
files_to_delete = []
|
||||
infodict = info
|
||||
try:
|
||||
files_to_delete, infodict = pp.run(infodict)
|
||||
except PostProcessingError as e:
|
||||
self.report_error(e.msg)
|
||||
if not files_to_delete:
|
||||
return infodict
|
||||
|
||||
if self.params.get('keepvideo', False):
|
||||
for f in files_to_delete:
|
||||
files_to_move.setdefault(f, '')
|
||||
else:
|
||||
for old_filename in set(files_to_delete):
|
||||
self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
|
||||
try:
|
||||
os.remove(encodeFilename(old_filename))
|
||||
except (IOError, OSError):
|
||||
self.report_warning('Unable to remove downloaded original file')
|
||||
if old_filename in files_to_move:
|
||||
del files_to_move[old_filename]
|
||||
return infodict
|
||||
|
||||
for pp in ie_info.get('__postprocessors', []) + self._pps:
|
||||
info = run_pp(pp)
|
||||
info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
|
||||
files_to_move = {}
|
||||
for pp in self._pps_end:
|
||||
info = run_pp(pp)
|
||||
for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
|
||||
files_to_move, info = self.run_pp(pp, info, files_to_move)
|
||||
info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
|
||||
for pp in self._pps['aftermove']:
|
||||
files_to_move, info = self.run_pp(pp, info, {})
|
||||
|
||||
def _make_archive_id(self, info_dict):
|
||||
video_id = info_dict.get('id')
|
||||
|
|
|
@ -45,6 +45,7 @@ from .downloader import (
|
|||
from .extractor import gen_extractors, list_extractors
|
||||
from .extractor.common import InfoExtractor
|
||||
from .extractor.adobepass import MSO_INFO
|
||||
from .postprocessor.metadatafromfield import MetadataFromFieldPP
|
||||
from .YoutubeDL import YoutubeDL
|
||||
|
||||
|
||||
|
@ -249,16 +250,25 @@ def _real_main(argv=None):
|
|||
if re.match(InfoExtractor.FormatSort.regex, f) is None:
|
||||
parser.error('invalid format sort string "%s" specified' % f)
|
||||
|
||||
if opts.metafromfield is None:
|
||||
opts.metafromfield = []
|
||||
if opts.metafromtitle is not None:
|
||||
opts.metafromfield.append('title:%s' % opts.metafromtitle)
|
||||
for f in opts.metafromfield:
|
||||
if re.match(MetadataFromFieldPP.regex, f) is None:
|
||||
parser.error('invalid format string "%s" specified for --parse-metadata' % f)
|
||||
|
||||
any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
|
||||
any_printing = opts.print_json
|
||||
download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
|
||||
|
||||
# PostProcessors
|
||||
postprocessors = []
|
||||
if opts.metafromtitle:
|
||||
if opts.metafromfield:
|
||||
postprocessors.append({
|
||||
'key': 'MetadataFromTitle',
|
||||
'titleformat': opts.metafromtitle
|
||||
'key': 'MetadataFromField',
|
||||
'formats': opts.metafromfield,
|
||||
'when': 'beforedl'
|
||||
})
|
||||
if opts.extractaudio:
|
||||
postprocessors.append({
|
||||
|
@ -324,7 +334,7 @@ def _real_main(argv=None):
|
|||
postprocessors.append({
|
||||
'key': 'ExecAfterDownload',
|
||||
'exec_cmd': opts.exec_cmd,
|
||||
'_after_move': True
|
||||
'when': 'aftermove'
|
||||
})
|
||||
|
||||
_args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'
|
||||
|
|
|
@ -1078,14 +1078,20 @@ def parseOpts(overrideArguments=None):
|
|||
postproc.add_option(
|
||||
'--metadata-from-title',
|
||||
metavar='FORMAT', dest='metafromtitle',
|
||||
help=optparse.SUPPRESS_HELP)
|
||||
postproc.add_option(
|
||||
'--parse-metadata',
|
||||
metavar='FIELD:FORMAT', dest='metafromfield', action='append',
|
||||
help=(
|
||||
'Parse additional metadata like song title / artist from the video title. '
|
||||
'The format syntax is the same as --output. Regular expression with '
|
||||
'named capture groups may also be used. '
|
||||
'Parse additional metadata like title/artist from other fields. '
|
||||
'Give field name to extract data from, and format of the field seperated by a ":". '
|
||||
'The format syntax is the same as --output. '
|
||||
'Regular expression with named capture groups may also be used. '
|
||||
'The parsed parameters replace existing values. '
|
||||
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
|
||||
'This option can be used multiple times. '
|
||||
'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
|
||||
'"Coldplay - Paradise". '
|
||||
'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
|
||||
'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
|
||||
postproc.add_option(
|
||||
'--xattrs',
|
||||
action='store_true', dest='xattrs', default=False,
|
||||
|
|
|
@ -16,7 +16,8 @@ from .ffmpeg import (
|
|||
)
|
||||
from .xattrpp import XAttrMetadataPP
|
||||
from .execafterdownload import ExecAfterDownloadPP
|
||||
from .metadatafromtitle import MetadataFromTitlePP
|
||||
from .metadatafromfield import MetadataFromFieldPP
|
||||
from .metadatafromfield import MetadataFromTitlePP
|
||||
from .movefilesafterdownload import MoveFilesAfterDownloadPP
|
||||
from .sponskrub import SponSkrubPP
|
||||
|
||||
|
@ -39,6 +40,7 @@ __all__ = [
|
|||
'FFmpegSubtitlesConvertorPP',
|
||||
'FFmpegVideoConvertorPP',
|
||||
'FFmpegVideoRemuxerPP',
|
||||
'MetadataFromFieldPP',
|
||||
'MetadataFromTitlePP',
|
||||
'MoveFilesAfterDownloadPP',
|
||||
'SponSkrubPP',
|
||||
|
|
66
youtube_dlc/postprocessor/metadatafromfield.py
Normal file
66
youtube_dlc/postprocessor/metadatafromfield.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import PostProcessor
|
||||
from ..compat import compat_str
|
||||
|
||||
|
||||
class MetadataFromFieldPP(PostProcessor):
|
||||
regex = r'(?P<field>\w+):(?P<format>.+)$'
|
||||
|
||||
def __init__(self, downloader, formats):
|
||||
PostProcessor.__init__(self, downloader)
|
||||
assert isinstance(formats, (list, tuple))
|
||||
self._data = []
|
||||
for f in formats:
|
||||
assert isinstance(f, compat_str)
|
||||
match = re.match(self.regex, f)
|
||||
assert match is not None
|
||||
self._data.append({
|
||||
'field': match.group('field'),
|
||||
'format': match.group('format'),
|
||||
'regex': self.format_to_regex(match.group('format'))})
|
||||
|
||||
def format_to_regex(self, fmt):
|
||||
r"""
|
||||
Converts a string like
|
||||
'%(title)s - %(artist)s'
|
||||
to a regex like
|
||||
'(?P<title>.+)\ \-\ (?P<artist>.+)'
|
||||
"""
|
||||
if not re.search(r'%\(\w+\)s', fmt):
|
||||
return fmt
|
||||
lastpos = 0
|
||||
regex = ''
|
||||
# replace %(..)s with regex group and escape other string parts
|
||||
for match in re.finditer(r'%\((\w+)\)s', fmt):
|
||||
regex += re.escape(fmt[lastpos:match.start()])
|
||||
regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
|
||||
lastpos = match.end()
|
||||
if lastpos < len(fmt):
|
||||
regex += re.escape(fmt[lastpos:])
|
||||
return regex
|
||||
|
||||
def run(self, info):
|
||||
for dictn in self._data:
|
||||
field, regex = dictn['field'], dictn['regex']
|
||||
if field not in info:
|
||||
self.report_warning('Video doesnot have a %s' % field)
|
||||
continue
|
||||
self.write_debug('Searching for r"%s" in %s' % (regex, field))
|
||||
match = re.search(regex, info[field])
|
||||
if match is None:
|
||||
self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
|
||||
continue
|
||||
for attribute, value in match.groupdict().items():
|
||||
info[attribute] = value
|
||||
self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
|
||||
return [], info
|
||||
|
||||
|
||||
class MetadataFromTitlePP(MetadataFromFieldPP): # for backward compatibility
|
||||
def __init__(self, downloader, titleformat):
|
||||
super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat])
|
||||
self._titleformat = titleformat
|
||||
self._titleregex = self._data[0]['regex']
|
|
@ -1,44 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import PostProcessor
|
||||
|
||||
|
||||
class MetadataFromTitlePP(PostProcessor):
|
||||
def __init__(self, downloader, titleformat):
|
||||
super(MetadataFromTitlePP, self).__init__(downloader)
|
||||
self._titleformat = titleformat
|
||||
self._titleregex = (self.format_to_regex(titleformat)
|
||||
if re.search(r'%\(\w+\)s', titleformat)
|
||||
else titleformat)
|
||||
|
||||
def format_to_regex(self, fmt):
|
||||
r"""
|
||||
Converts a string like
|
||||
'%(title)s - %(artist)s'
|
||||
to a regex like
|
||||
'(?P<title>.+)\ \-\ (?P<artist>.+)'
|
||||
"""
|
||||
lastpos = 0
|
||||
regex = ''
|
||||
# replace %(..)s with regex group and escape other string parts
|
||||
for match in re.finditer(r'%\((\w+)\)s', fmt):
|
||||
regex += re.escape(fmt[lastpos:match.start()])
|
||||
regex += r'(?P<' + match.group(1) + '>.+)'
|
||||
lastpos = match.end()
|
||||
if lastpos < len(fmt):
|
||||
regex += re.escape(fmt[lastpos:])
|
||||
return regex
|
||||
|
||||
def run(self, info):
|
||||
title = info['title']
|
||||
match = re.match(self._titleregex, title)
|
||||
if match is None:
|
||||
self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
|
||||
return [], info
|
||||
for attribute, value in match.groupdict().items():
|
||||
info[attribute] = value
|
||||
self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA'))
|
||||
|
||||
return [], info
|
Loading…
Reference in a new issue