Add option `--replace-in-metadata`

3 years ago · e9f4ccd19e
parent a38bd1defa
commit e9f4ccd19e
8 changed files with 187 additions and 107 deletions
--- a/README.md
+++ b/README.md
@ -777,6 +777,10 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t
    --parse-metadata FROM:TO         Parse additional metadata like title/artist
                                     from other fields; see "MODIFYING METADATA"
                                     for details
    --replace-in-metadata FIELDS REGEX REPLACE
                                     Replace text in a metadata field using the
                                     given regex. This option can be used
                                     multiple times
    --xattrs                         Write metadata to the video file's xattrs
                                     (using dublin core and xdg standards)
    --fixup POLICY                   Automatically correct known faults of the
@ -1333,7 +1337,11 @@ $ yt-dlp -S '+res:480,codec,br'
 # MODIFYING METADATA
-The metadata obtained the the extractors can be modified by using `--parse-metadata FROM:TO`. The general syntax is to give the name of a field or a template (with similar syntax to [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
+The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata`.
 `--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use.
 The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
 Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`.
@ -1380,6 +1388,9 @@ $ yt-dlp --parse-metadata '%(series)s S%(season_number)02dE%(episode_number)02d:
 # Set "comment" field in video metadata using description instead of webpage_url
 $ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata
 # Replace all spaces and "_" in title and uploader with a `-`
 $ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-'
 ```
 # EXTRACTOR ARGUMENTS
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@ -14,29 +14,28 @@ from yt_dlp.postprocessor import (
    ExecAfterDownloadPP,
    FFmpegThumbnailsConvertorPP,
    MetadataFromFieldPP,
-    MetadataFromTitlePP,
+    MetadataParserPP,
 )
 class TestMetadataFromField(unittest.TestCase):
    # Checks the per-format entries that MetadataFromFieldPP stores in `_data`.

    def test_format_to_regex(self):
        # The TO half becomes a regex with one named group per %(field)s
        parser = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
        entry = parser._data[0]
        self.assertEqual(entry['regex'], r'(?P<title>.+)\ \-\ (?P<artist>.+)')

    def test_field_to_outtmpl(self):
        # A bare field name in the FROM half is wrapped into a template
        parser = MetadataFromFieldPP(None, ['title:%(title)s : %(artist)s'])
        entry = parser._data[0]
        self.assertEqual(entry['tmpl'], '%(title)s')

    def test_in_out_seperation(self):
        # An escaped colon ("\:") in the FROM half must not split FROM and TO
        parser = MetadataFromFieldPP(None, ['%(title)s \\: %(artist)s:%(title)s : %(artist)s'])
        entry = parser._data[0]
        self.assertEqual(entry['in'], '%(title)s : %(artist)s')
        self.assertEqual(entry['out'], '%(title)s : %(artist)s')
 class TestMetadataFromTitle(unittest.TestCase):
    def test_format_to_regex(self):
-        pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
+        self.assertEqual(
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+            MetadataParserPP.format_to_regex('%(title)s - %(artist)s'),
            r'(?P<title>.+)\ \-\ (?P<artist>.+)')
        self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)')
    def test_field_to_template(self):
        self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s')
        self.assertEqual(MetadataParserPP.field_to_template('1'), '1')
        self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar')
        self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal')
    def test_metadatafromfield(self):
        self.assertEqual(
            MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'),
            (MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s'))
 class TestConvertThumbnail(unittest.TestCase):
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -1281,7 +1281,7 @@ class YoutubeDL(object):
            ie_result = self.process_video_result(ie_result, download=download)
            additional_urls = (ie_result or {}).get('additional_urls')
            if additional_urls:
-                # TODO: Improve MetadataFromFieldPP to allow setting a list
+                # TODO: Improve MetadataParserPP to allow setting a list
                if isinstance(additional_urls, compat_str):
                    additional_urls = [additional_urls]
                self.to_screen(
--- a/yt_dlp/init.py
+++ b/yt_dlp/init.py
@ -7,6 +7,7 @@ __license__ = 'Public Domain'
 import codecs
 import io
 import itertools
 import os
 import random
 import re
@ -18,6 +19,7 @@ from .options import (
 )
 from .compat import (
    compat_getpass,
    compat_shlex_quote,
    workaround_optparse_bug9161,
 )
 from .cookies import SUPPORTED_BROWSERS
@ -46,14 +48,15 @@ from .downloader import (
 from .extractor import gen_extractors, list_extractors
 from .extractor.common import InfoExtractor
 from .extractor.adobepass import MSO_INFO
-from .postprocessor.ffmpeg import (
+from .postprocessor import (
    FFmpegExtractAudioPP,
    FFmpegSubtitlesConvertorPP,
    FFmpegThumbnailsConvertorPP,
    FFmpegVideoConvertorPP,
    FFmpegVideoRemuxerPP,
    MetadataFromFieldPP,
    MetadataParserPP,
 )
 from .postprocessor.metadatafromfield import MetadataFromFieldPP
 from .YoutubeDL import YoutubeDL
@ -344,13 +347,29 @@ def _real_main(argv=None):
        if re.match(InfoExtractor.FormatSort.regex, f) is None:
            parser.error('invalid format sort string "%s" specified' % f)
-    if opts.metafromfield is None:
+    def metadataparser_actions(f):
-        opts.metafromfield = []
+        if isinstance(f, str):
            cmd = '--parse-metadata %s' % compat_shlex_quote(f)
            try:
                actions = [MetadataFromFieldPP.to_action(f)]
            except Exception as err:
                parser.error(f'{cmd} is invalid; {err}')
        else:
            cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
            actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(','))
        for action in actions:
            try:
                MetadataParserPP.validate_action(*action)
            except Exception as err:
                parser.error(f'{cmd} is invalid; {err}')
            yield action
    if opts.parse_metadata is None:
        opts.parse_metadata = []
    if opts.metafromtitle is not None:
-        opts.metafromfield.append('title:%s' % opts.metafromtitle)
+        opts.parse_metadata.append('title:%s' % opts.metafromtitle)
-    for f in opts.metafromfield:
+    opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
        if re.match(MetadataFromFieldPP.regex, f) is None:
            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
    any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
    any_printing = opts.print_json
@ -402,10 +421,10 @@ def _real_main(argv=None):
    # PostProcessors
    postprocessors = []
-    if opts.metafromfield:
+    if opts.parse_metadata:
        postprocessors.append({
-            'key': 'MetadataFromField',
+            'key': 'MetadataParser',
-            'formats': opts.metafromfield,
+            'actions': opts.parse_metadata,
            # Run this immediately after extraction is complete
            'when': 'pre_process'
        })
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -1241,10 +1241,14 @@ def parseOpts(overrideArguments=None):
        help=optparse.SUPPRESS_HELP)
    postproc.add_option(
        '--parse-metadata',
-        metavar='FROM:TO', dest='metafromfield', action='append',
+        metavar='FROM:TO', dest='parse_metadata', action='append',
        help=(
            'Parse additional metadata like title/artist from other fields; '
            'see "MODIFYING METADATA" for details'))
    postproc.add_option(
        '--replace-in-metadata',
        dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3,
        help='Replace text in a metadata field using the given regex. This option can be used multiple times')
    postproc.add_option(
        '--xattrs',
        action='store_true', dest='xattrs', default=False,
--- a/yt_dlp/postprocessor/init.py
+++ b/yt_dlp/postprocessor/init.py
@ -20,8 +20,11 @@ from .ffmpeg import (
 )
 from .xattrpp import XAttrMetadataPP
 from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromfield import MetadataFromFieldPP
+from .metadataparser import (
-from .metadatafromfield import MetadataFromTitlePP
+    MetadataFromFieldPP,
    MetadataFromTitlePP,
    MetadataParserPP,
 )
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
 from .sponskrub import SponSkrubPP
@ -48,6 +51,7 @@ __all__ = [
    'FFmpegThumbnailsConvertorPP',
    'FFmpegVideoConvertorPP',
    'FFmpegVideoRemuxerPP',
    'MetadataParserPP',
    'MetadataFromFieldPP',
    'MetadataFromTitlePP',
    'MoveFilesAfterDownloadPP',
--- a/yt_dlp/postprocessor/metadatafromfield.py
+++ b/yt_dlp/postprocessor/metadatafromfield.py
@ -1,74 +0,0 @@
 from __future__ import unicode_literals
 import re
 from .common import PostProcessor
 from ..compat import compat_str
 class MetadataFromFieldPP(PostProcessor):
    """Post-processor that parses new info-dict fields out of existing ones.

    Each format is a 'FROM:TO' string. FROM is a field name or a template;
    TO is either a regex with named groups or a '%(field)s'-style format
    that is converted into such a regex.
    """

    # Splits 'FROM:TO' on the first colon that is not preceded by a backslash
    regex = r'(?P<in>.*?)(?<!\\):(?P<out>.+)$'

    def __init__(self, downloader, formats):
        """Parse every 'FROM:TO' string in `formats` into an entry of `self._data`."""
        PostProcessor.__init__(self, downloader)
        assert isinstance(formats, (list, tuple))
        self._data = []
        for spec in formats:
            assert isinstance(spec, compat_str)
            parsed = re.match(self.regex, spec)
            assert parsed is not None
            # An escaped colon in FROM stands for a literal colon
            source = parsed.group('in').replace('\\:', ':')
            target = parsed.group('out')
            entry = {
                'in': source,
                'out': target,
                'tmpl': self.field_to_template(source),
                'regex': self.format_to_regex(target),
            }
            self._data.append(entry)

    @staticmethod
    def field_to_template(tmpl):
        """Wrap a bare field name (letters and underscores only) as '%(name)s'.

        Anything else is returned unchanged.
        """
        is_bare_field = re.match(r'[a-zA-Z_]+$', tmpl) is not None
        return '%%(%s)s' % tmpl if is_bare_field else tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is returned unchanged.
        """
        placeholders = list(re.finditer(r'%\((\w+)\)s', fmt))
        if not placeholders:
            return fmt
        pieces = []
        cursor = 0
        # Escape the literal text between placeholders; each placeholder
        # becomes a named capture group
        for placeholder in placeholders:
            pieces.append(re.escape(fmt[cursor:placeholder.start()]))
            pieces.append(r'(?P<%s>.+)' % placeholder.group(1))
            cursor = placeholder.end()
        pieces.append(re.escape(fmt[cursor:]))
        return ''.join(pieces)

    def run(self, info):
        """Apply every stored format to `info` in place and return ([], info)."""
        for entry in self._data:
            template, pattern = entry['tmpl'], entry['regex']
            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
            self.write_debug('Searching for r"%s" in %s' % (pattern, template))
            found = re.search(pattern, data_to_parse)
            if found is None:
                self.report_warning('Could not interpret video %s as "%s"' % (entry['in'], entry['out']))
                continue
            for attribute, value in found.groupdict().items():
                info[attribute] = value
                shown = 'NA' if value is None else value
                self.to_screen('parsed %s from "%s": %s' % (attribute, template, shown))
        return [], info
 class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
    """MetadataFromFieldPP restricted to parsing the title with one format."""

    def __init__(self, downloader, titleformat):
        # Express the title format as a single 'FROM:TO' entry
        title_spec = '%%(title)s:%s' % titleformat
        super(MetadataFromTitlePP, self).__init__(downloader, [title_spec])
        self._titleformat = titleformat
        first_entry = self._data[0]
        self._titleregex = first_entry['regex']
--- a/yt_dlp/postprocessor/metadataparser.py
+++ b/yt_dlp/postprocessor/metadataparser.py
@ -0,0 +1,117 @@
 import re
 from enum import Enum
 from .common import PostProcessor
 class MetadataParserPP(PostProcessor):
    """Post-processor that applies a list of metadata actions to the info dict.

    Each action is a tuple whose first item is a member of `Actions`; the
    remaining items are the arguments of the builder method named by that
    member's value:
        (Actions.INTERPRET, from, to)
        (Actions.REPLACE, field, search, replace)
    """

    class Actions(Enum):
        # Each value is the name of the MetadataParserPP method that builds
        # the handler for that action
        INTERPRET = 'interpretter'
        REPLACE = 'replacer'

    def __init__(self, downloader, actions):
        """Build one handler per action; `run` later calls them in order."""
        PostProcessor.__init__(self, downloader)
        self._actions = []
        for f in actions:
            action = f[0]
            assert isinstance(action, self.Actions)
            self._actions.append(getattr(self, action.value)(*f[1:]))

    @classmethod
    def validate_action(cls, action, *data):
        ''' Each action can be:
                (Actions.INTERPRET, from, to) OR
                (Actions.REPLACE, field, search, replace)

            Raises ValueError if `action` is not an `Actions` member. Any
            error raised while building the handler (e.g. an invalid regex)
            propagates to the caller.
        '''
        if not isinstance(action, cls.Actions):
            raise ValueError(f'{action!r} is not a valid action')
        # Build the handler without an instance so that construction errors
        # surface here. `cls` stands in for `self`: at build time the builders
        # only use `self` to reach the static helpers.
        getattr(cls, action.value)(cls, *data)

    @staticmethod
    def field_to_template(tmpl):
        """Wrap a bare field name (letters and underscores only) as '%(name)s'.

        Anything else is returned unchanged.
        """
        if re.match(r'[a-zA-Z_]+$', tmpl):
            return f'%({tmpl})s'
        return tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is returned unchanged.
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        regex = ''
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += rf'(?P<{match.group(1)}>.+)'
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    def run(self, info):
        """Call every handler on `info` (modified in place); return ([], info)."""
        for f in self._actions:
            f(info)
        return [], info

    def interpretter(self, inp, out):
        """Return a handler that parses fields out of `inp` using `out`.

        `inp` is a field name or template; `out` is a regex with named groups
        or a '%(field)s'-style format. The regex is compiled here, so an
        invalid pattern raises when the handler is built, not when it runs.
        """
        def f(info):
            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
            self.write_debug(f'Searching for r{out_re.pattern!r} in {template!r}')
            match = out_re.search(data_to_parse)
            if match is None:
                # Must be an f-string, otherwise the placeholders are printed literally
                self.report_warning(f'Could not interpret {inp!r} as {out!r}')
                return
            for attribute, value in match.groupdict().items():
                info[attribute] = value
                self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA'))

        template = self.field_to_template(inp)
        out_re = re.compile(self.format_to_regex(out))
        return f

    def replacer(self, field, search, replace):
        """Return a handler that replaces every `search` match in `info[field]`.

        `search` is compiled here, so an invalid pattern raises when the
        handler is built. The handler warns and leaves `info` untouched when
        the field is missing or is not a string.
        """
        def f(info):
            val = info.get(field)
            if val is None:
                self.report_warning(f'Video does not have a {field}')
                return
            elif not isinstance(val, str):
                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
                return
            self.write_debug(f'Replacing all r{search!r} in {field} with {replace!r}')
            info[field], n = search_re.subn(replace, val)
            if n:
                self.to_screen(f'Changed {field} to: {info[field]}')
            else:
                self.to_screen(f'Did not find r{search!r} in {field}')

        search_re = re.compile(search)
        return f
 class MetadataFromFieldPP(MetadataParserPP):
    """MetadataParserPP configured from 'FROM:TO' strings."""

    @classmethod
    def to_action(cls, f):
        """Convert one 'FROM:TO' string into an INTERPRET action tuple.

        Raises ValueError when `f` has no unescaped colon followed by text.
        """
        parsed = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        if parsed is not None:
            # An escaped colon in FROM stands for a literal colon
            source = parsed.group('in').replace('\\:', ':')
            return (cls.Actions.INTERPRET, source, parsed.group('out'))
        raise ValueError(f'it should be FROM:TO, not {f!r}')

    def __init__(self, downloader, formats):
        actions = list(map(self.to_action, formats))
        MetadataParserPP.__init__(self, downloader, actions)
 class MetadataFromTitlePP(MetadataParserPP):  # for backward compatibility
    """MetadataParserPP with a single INTERPRET action on the 'title' field."""

    def __init__(self, downloader, titleformat):
        title_action = (self.Actions.INTERPRET, 'title', titleformat)
        super().__init__(downloader, [title_action])