yt-dlp/devscripts/tomlparse.py

#!/usr/bin/env python3

"""
Simple parser for spec compliant toml files

A simple toml parser for files that comply with the spec.
Should only be used to parse `pyproject.toml` for `install_deps.py`.

IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED!
"""

from __future__ import annotations

import datetime as dt
import json
import re

# Optional run of spaces and tabs (never newlines), as a non-capturing group
WS = r'(?:[\ \t]*)'
# A single-line string: either double-quoted with backslash escapes,
# or single-quoted with no escapes at all
STRING_RE = re.compile(r'"(?:\\.|[^\\"\n])*"|\'[^\'\n]*\'')
# One key component: a quoted string or a bare key made of `A-Za-z0-9_-`
SINGLE_KEY_RE = re.compile(rf'{STRING_RE.pattern}|[A-Za-z0-9_-]+')
# A full key: one or more components joined by `.`, whitespace allowed around each
KEY_RE = re.compile(rf'{WS}(?:{SINGLE_KEY_RE.pattern}){WS}(?:\.{WS}(?:{SINGLE_KEY_RE.pattern}){WS})*')
# The `=` of a key/value pair plus any whitespace following it
EQUALS_RE = re.compile(rf'={WS}')
# Compiled form of `WS`; used as the separator whitespace inside inline tables
WS_RE = re.compile(WS)

# A table header at the start of a line: `[path]`, or `[[path]]` when the
# `is_list` group is set. Note that the closing side accepts one or two `]`
# regardless of how many `[` opened the header.
_SUBTABLE = rf'(?P<subtable>^\[(?P<is_list>\[)?(?P<path>{KEY_RE.pattern})\]\]?)'
# Anything `parse_toml` acts on: a table header or the `key =` start of a
# key/value pair, anchored to the start of a line (MULTILINE)
EXPRESSION_RE = re.compile(rf'^(?:{_SUBTABLE}|{KEY_RE.pattern}=)', re.MULTILINE)

# Separator whitespace inside arrays: spaces/tabs, optionally followed by any
# number of line ends (each may carry a `#` comment) and further indentation
LIST_WS_RE = re.compile(rf'{WS}((#[^\n]*)?\n{WS})*')
# A bare (unquoted) value: everything up to the next `,`, `}`, `]`, tab,
# newline or `#`
LEFTOVER_VALUE_RE = re.compile(r'[^,}\]\t\n#]+')


def parse_key(value: str):
    """
    Yield the individual components of a (possibly dotted) key

    Double-quoted components are decoded with `json.loads`,
    single-quoted components have their quotes stripped,
    and bare components are yielded unchanged.
    """
    for found in SINGLE_KEY_RE.finditer(value):
        token = found[0]
        opener = token[0]
        if opener == '"':
            yield json.loads(token)
            continue
        # Literal (single-quoted) strings have no escapes, so only drop the quotes
        yield token[1:-1] if opener == '\'' else token


def get_target(root: dict, paths: list[str], is_list=False):
    """
    Resolve `paths` inside `root` and return the dict found at the end

    Missing keys are created on the way as empty dicts.
    If `is_list` is truthy, the last key is treated as a list of tables:
    the list is created if missing, and a fresh dict is appended to it and returned.
    When a key along the way already holds a list, its last element is descended into.

    @param root     The dict to start from (modified in place)
    @param paths    The key components leading to the wanted table
    @param is_list  Whether the final component denotes a list of tables
    @returns        The dict that `paths` resolves to
    """
    node = root

    for position, name in enumerate(paths, 1):
        wants_list = is_list and position == len(paths)
        child = node.get(name)
        if child is None:
            child = [] if wants_list else {}
            node[name] = child

        if isinstance(child, dict):
            node = child
            continue

        if wants_list:
            # Every occurrence of a list-of-tables header starts a new element
            node = {}
            child.append(node)
            continue

        # An existing list on the path: continue inside its most recent element
        node = child[-1]

    assert isinstance(node, dict)
    return node


def parse_enclosed(data: str, index: int, end: str, ws_re: re.Pattern):
    """
    Generator driving the parsing of a bracketed, comma separated sequence

    `index` has to point at the opening bracket. For every element the
    generator yields `(True, start)` and expects the index just past that
    element to be passed back using `send()`. Once the `end` character is
    reached it yields `(False, index_after_end)`.

    @param data   The text being parsed
    @param index  The index of the opening bracket
    @param end    The character closing the sequence
    @param ws_re  The pattern matching skippable whitespace between elements
    """
    def skip_whitespace(position):
        found = ws_re.match(data, position)
        return found.end() if found else position

    cursor = skip_whitespace(index + 1)

    while data[cursor] != end:
        # Hand the element start to the caller, resume at where the element ended
        cursor = yield True, cursor

        cursor = skip_whitespace(cursor)
        if data[cursor] == ',':
            cursor += 1
        cursor = skip_whitespace(cursor)

    assert data[cursor] == end
    yield False, cursor + 1


def parse_value(data: str, index: int):
    """
    Parse the value starting at `index`

    Supports arrays, inline tables, single-line strings and bare values.
    Bare values are converted to int, float, time, date, datetime or bool
    (tried in this order); a bare value that is none of those is returned
    as its stripped text.

    @param data   The text being parsed
    @param index  The index of the first character of the value
    @returns      A tuple of the index just past the value and the parsed value
    """
    if data[index] == '[':
        result = []

        indices = parse_enclosed(data, index, ']', LIST_WS_RE)
        valid, index = next(indices)
        while valid:
            index, value = parse_value(data, index)
            result.append(value)
            valid, index = indices.send(index)

        return index, result

    if data[index] == '{':
        result = {}

        indices = parse_enclosed(data, index, '}', WS_RE)
        valid, index = next(indices)
        while valid:
            # `parse_kv_pair` stores the pair in `result` and returns the index after it
            valid, index = indices.send(parse_kv_pair(data, index, result))

        return index, result

    if match := STRING_RE.match(data, index):
        # Basic strings share their escape syntax with JSON closely enough to reuse its decoder
        return match.end(), json.loads(match[0]) if match[0][0] == '"' else match[0][1:-1]

    match = LEFTOVER_VALUE_RE.match(data, index)
    assert match
    value = match[0].strip()
    for func in [
        int,
        float,
        dt.time.fromisoformat,
        dt.date.fromisoformat,
        dt.datetime.fromisoformat,
        # Use `__getitem__` instead of `get`: the latter never raises and would
        # silently turn every unrecognized value into `None`
        {'true': True, 'false': False}.__getitem__,
    ]:
        try:
            value = func(value)
            break
        except (ValueError, KeyError):
            pass

    return match.end(), value


def parse_kv_pair(data: str, index: int, target: dict):
    """
    Parse a `key = value` pair starting at `index` and store it in `target`

    Dotted keys are resolved relative to `target`, creating tables as needed.

    @param data    The text being parsed
    @param index   The index at which the key starts
    @param target  The dict to store the parsed value in
    @returns       The index just past the value, or `None` if no key was found
    """
    key_match = KEY_RE.match(data, index)
    if not key_match:
        return None

    # Everything but the last component names the table holding the value
    *parents, leaf = parse_key(key_match[0])

    equals_match = EQUALS_RE.match(data, key_match.end())
    assert equals_match

    value_end, value = parse_value(data, equals_match.end())
    get_target(target, parents)[leaf] = value
    return value_end


def parse_toml(data: str):
    """
    Parse the toml document `data` into nested dicts and lists

    Table headers switch the table that the following key/value pairs
    are stored in; pairs before the first header end up in the root table.

    @param data  The full text of the toml document
    @returns     The root table as a dict
    """
    document = {}
    current = document

    position = 0
    while found := EXPRESSION_RE.search(data, position):
        if not found.group('subtable'):
            # A key/value pair: parse it from the start of its line
            position = parse_kv_pair(data, found.start(), current)
            assert position is not None
            continue

        position = found.end()
        raw_path, list_marker = found.group('path', 'is_list')
        current = get_target(document, list(parse_key(raw_path)), bool(list_marker))

    return document


def main():
    """Read the toml file given on the command line and print it as JSON"""
    import argparse
    from pathlib import Path

    def serialize_temporal(obj):
        # Called by `json.dumps` for objects it cannot serialize by itself
        return obj.isoformat() if isinstance(obj, (dt.date, dt.time, dt.datetime)) else None

    cli = argparse.ArgumentParser()
    cli.add_argument('infile', type=Path, help='The TOML file to read as input')
    options = cli.parse_args()

    with options.infile.open('r', encoding='utf-8') as stream:
        text = stream.read()

    print(json.dumps(parse_toml(text), default=serialize_temporal))


# Only run the command line interface when executed as a script, not on import
if __name__ == '__main__':
    main()
[devscripts] `tomlparse`: Add makeshift toml parser Authored by: Grub4K 9 months ago			`#!/usr/bin/env python3`

			`"""`
			`Simple parser for spec compliant toml files`

			`A simple toml parser for files that comply with the spec.`
			Should only be used to parse `pyproject.toml` for `install_deps.py`.

			`IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED!`
			`"""`

			`from __future__ import annotations`

[cleanup] Standardize `import datetime as dt` (#8978) 8 months ago			`import datetime as dt`
[devscripts] `tomlparse`: Add makeshift toml parser Authored by: Grub4K 9 months ago			`import json`
			`import re`

			`WS = r'(?:[\ \t]*)'`
			`STRING_RE = re.compile(r'"(?:\\.\|[^\\"\n])"\|\'[^\'\n]\'')`
			`SINGLE_KEY_RE = re.compile(rf'{STRING_RE.pattern}\|[A-Za-z0-9_-]+')`
			`KEY_RE = re.compile(rf'{WS}(?:{SINGLE_KEY_RE.pattern}){WS}(?:\.{WS}(?:{SINGLE_KEY_RE.pattern}){WS})*')`
			`EQUALS_RE = re.compile(rf'={WS}')`
			`WS_RE = re.compile(WS)`

			`_SUBTABLE = rf'(?P<subtable>^\[(?P<is_list>\[)?(?P<path>{KEY_RE.pattern})\]\]?)'`
			`EXPRESSION_RE = re.compile(rf'^(?:{_SUBTABLE}\|{KEY_RE.pattern}=)', re.MULTILINE)`

			`LIST_WS_RE = re.compile(rf'{WS}((#[^\n])?\n{WS})')`
			`LEFTOVER_VALUE_RE = re.compile(r'[^,}\]\t\n#]+')`


			`def parse_key(value: str):`
			`for match in SINGLE_KEY_RE.finditer(value):`
			`if match[0][0] == '"':`
			`yield json.loads(match[0])`
			`elif match[0][0] == '\'':`
			`yield match[0][1:-1]`
			`else:`
			`yield match[0]`


			`def get_target(root: dict, paths: list[str], is_list=False):`
			`target = root`

			`for index, key in enumerate(paths, 1):`
			`use_list = is_list and index == len(paths)`
			`result = target.get(key)`
			`if result is None:`
			`result = [] if use_list else {}`
			`target[key] = result`

			`if isinstance(result, dict):`
			`target = result`
			`elif use_list:`
			`target = {}`
			`result.append(target)`
			`else:`
			`target = result[-1]`

			`assert isinstance(target, dict)`
			`return target`


			`def parse_enclosed(data: str, index: int, end: str, ws_re: re.Pattern):`
			`index += 1`

			`if match := ws_re.match(data, index):`
			`index = match.end()`

			`while data[index] != end:`
			`index = yield True, index`

			`if match := ws_re.match(data, index):`
			`index = match.end()`

			`if data[index] == ',':`
			`index += 1`

			`if match := ws_re.match(data, index):`
			`index = match.end()`

			`assert data[index] == end`
			`yield False, index + 1`


			`def parse_value(data: str, index: int):`
			`if data[index] == '[':`
			`result = []`

			`indices = parse_enclosed(data, index, ']', LIST_WS_RE)`
			`valid, index = next(indices)`
			`while valid:`
			`index, value = parse_value(data, index)`
			`result.append(value)`
			`valid, index = indices.send(index)`

			`return index, result`

			`if data[index] == '{':`
			`result = {}`

			`indices = parse_enclosed(data, index, '}', WS_RE)`
			`valid, index = next(indices)`
			`while valid:`
			`valid, index = indices.send(parse_kv_pair(data, index, result))`

			`return index, result`

			`if match := STRING_RE.match(data, index):`
			`return match.end(), json.loads(match[0]) if match[0][0] == '"' else match[0][1:-1]`

			`match = LEFTOVER_VALUE_RE.match(data, index)`
			`assert match`
			`value = match[0].strip()`
			`for func in [`
			`int,`
			`float,`
[cleanup] Standardize `import datetime as dt` (#8978) 8 months ago			`dt.time.fromisoformat,`
			`dt.date.fromisoformat,`
			`dt.datetime.fromisoformat,`
[devscripts] `tomlparse`: Add makeshift toml parser Authored by: Grub4K 9 months ago			`{'true': True, 'false': False}.get,`
			`]:`
			`try:`
			`value = func(value)`
			`break`
			`except Exception:`
			`pass`

			`return match.end(), value`


			`def parse_kv_pair(data: str, index: int, target: dict):`
			`match = KEY_RE.match(data, index)`
			`if not match:`
			`return None`

			`*keys, key = parse_key(match[0])`

			`match = EQUALS_RE.match(data, match.end())`
			`assert match`
			`index = match.end()`

			`index, value = parse_value(data, index)`
			`get_target(target, keys)[key] = value`
			`return index`


			`def parse_toml(data: str):`
			`root = {}`
			`target = root`

			`index = 0`
			`while True:`
			`match = EXPRESSION_RE.search(data, index)`
			`if not match:`
			`break`

			`if match.group('subtable'):`
			`index = match.end()`
			`path, is_list = match.group('path', 'is_list')`
			`target = get_target(root, list(parse_key(path)), bool(is_list))`
			`continue`

			`index = parse_kv_pair(data, match.start(), target)`
			`assert index is not None`

			`return root`


			`def main():`
			`import argparse`
			`from pathlib import Path`

			`parser = argparse.ArgumentParser()`
			`parser.add_argument('infile', type=Path, help='The TOML file to read as input')`
			`args = parser.parse_args()`

			`with args.infile.open('r', encoding='utf-8') as file:`
			`data = file.read()`

			`def default(obj):`
[cleanup] Standardize `import datetime as dt` (#8978) 8 months ago			`if isinstance(obj, (dt.date, dt.time, dt.datetime)):`
[devscripts] `tomlparse`: Add makeshift toml parser Authored by: Grub4K 9 months ago			`return obj.isoformat()`

			`print(json.dumps(parse_toml(data), default=default))`


			`if __name__ == '__main__':`
			`main()`