From a71b812f53a5f678e4c9467858e721dcd4953a16 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Wed, 12 Oct 2022 22:22:17 +0200 Subject: [PATCH] [utils] `js_to_json`: Improve escape handling (#5217) Authored by: Grub4K --- test/test_utils.py | 6 +++++ yt_dlp/utils.py | 61 ++++++++++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index df23f1f47..49ab3796b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1100,6 +1100,12 @@ class TestUtil(unittest.TestCase): on = js_to_json('[1,//{},\n2]') self.assertEqual(json.loads(on), [1, 2]) + on = js_to_json(R'"\^\$\#"') + self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped') + + on = js_to_json('\'"\\""\'') + self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped') + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 6cfbcdb8d..adb7c0e8c 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3275,6 +3275,8 @@ def strip_jsonp(code): def js_to_json(code, vars={}, *, strict=False): # vars is a dict of var, val pairs to substitute + STRING_QUOTES = '\'"' + STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES) COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*' INTEGER_TABLE = ( @@ -3282,6 +3284,15 @@ def js_to_json(code, vars={}, *, strict=False): (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8), ) + def process_escape(match): + JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' + escape = match.group(1) or match.group(2) + + return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES + else R'\u00' if escape == 'x' + else '' if escape == '\n' + else escape) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): @@ -3289,28 +3300,25 @@ def js_to_json(code, vars={}, *, strict=False): elif v in ('undefined', 'void 0'): return 'null' elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - else: - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + return '' + + if v[0] in STRING_QUOTES: + escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1]) + return f'"{escaped}"' + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return f'"{i}":' if v.endswith(':') else str(i) + + if v in vars: + return json.dumps(vars[v]) - if v in vars: - return json.dumps(vars[v]) - if strict: - raise ValueError(f'Unknown value: {v}') + if not strict: + return f'"{v}"' - return '"%s"' % v + raise ValueError(f'Unknown value: {v}') def create_map(mobj): return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) @@ -3320,15 +3328,14 @@ def js_to_json(code, vars={}, *, strict=False): code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) - return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| + return re.sub(rf'''(?sx) + {STRING_RE}| + {COMMENT_RE}|,(?={SKIP_RE}[\]}}])| void\s0|(?:(?