find - add encoding option to use in conjunction with contains (#82284)

* when doing a 'contains' search, determine the encoding of the files to be searched
* set default encoding to None for backwards compatibility
* changelog, error handling, tests added
* add sanity ignore.txt for non-utf-8 test
4 months ago · aa40167f40
parent b01f1f207c
commit aa40167f40
5 changed files with 74 additions and 9 deletions
--- a/changelogs/fragments/find-encoding.yml
+++ b/changelogs/fragments/find-encoding.yml
@ -0,0 +1,2 @@
+minor_changes:
+  - find - add an encoding parameter to specify the encoding of the files to be searched.
--- a/lib/ansible/modules/find.py
+++ b/lib/ansible/modules/find.py
@ -149,6 +149,11 @@ options:
            - Default is unlimited depth.
        type: int
        version_added: "2.6"
+    encoding:
+        description:
+            - When doing a C(contains) search, specify the encoding of the files to be searched. If unset, the default encoding of the Python built-in C(open) function is used.
+        type: str
+        version_added: "2.17"
 extends_documentation_fragment: action_common_attributes
 attributes:
    check_mode:
@ -337,11 +342,12 @@ def sizefilter(st, size):
    return False


-def contentfilter(fsname, pattern, read_whole_file=False):
+def contentfilter(fsname, pattern, encoding, read_whole_file=False):
    """
    Filter files which contain the given expression
    :arg fsname: Filename to scan for lines matching a pattern
    :arg pattern: Pattern to look for inside of line
+    :arg encoding: Encoding of the file to be scanned
    :arg read_whole_file: If true, the whole file is read into memory before the regex is applied against it. Otherwise, the regex is applied line-by-line.
    :rtype: bool
    :returns: True if one of the lines in fsname matches the pattern. Otherwise False
@ -352,7 +358,7 @@ def contentfilter(fsname, pattern, read_whole_file=False):
    prog = re.compile(pattern)

    try:
-        with open(fsname) as f:
+        with open(fsname, encoding=encoding) as f:
            if read_whole_file:
                return bool(prog.search(f.read()))

@ -360,6 +366,13 @@ def contentfilter(fsname, pattern, read_whole_file=False):
                if prog.match(line):
                    return True

+    except LookupError as e:
+        raise e
+    except UnicodeDecodeError as e:
+        if encoding is None:
+            encoding = 'None (default determined by the Python built-in function "open")'
+        msg = f'Failed to read the file {fsname} due to an encoding error. current encoding: {encoding}'
+        raise Exception(msg) from e
    except Exception:
        pass

@ -457,6 +470,7 @@ def main():
            depth=dict(type='int'),
            mode=dict(type='raw'),
            exact_mode=dict(type='bool', default=True),
+            encoding=dict(type='str')
        ),
        supports_check_mode=True,
    )
@ -563,7 +577,7 @@ def main():
                        if (pfilter(fsobj, params['patterns'], params['excludes'], params['use_regex']) and
                                agefilter(st, now, age, params['age_stamp']) and
                                sizefilter(st, size) and
-                                contentfilter(fsname, params['contains'], params['read_whole_file']) and
+                                contentfilter(fsname, params['contains'], params['encoding'], params['read_whole_file']) and
                                mode_filter(st, params['mode'], params['exact_mode'], module)):

                            r.update(statinfo(st))
--- a/test/integration/targets/find/files/hello_world.gbk
+++ b/test/integration/targets/find/files/hello_world.gbk
@ -0,0 +1 @@
+ΔγΊΓΚΐ½η
--- a/test/integration/targets/find/tasks/main.yml
+++ b/test/integration/targets/find/tasks/main.yml
@ -124,6 +124,7 @@
  with_items:
      - a.txt
      - log.txt
+      - hello_world.gbk

 - name: Ensure '$' only matches the true end of the file with read_whole_file, not a line
  find:
@ -195,6 +196,51 @@
      that:
          - no_match_line_boundaries.matched == 0

+- name: read a gbk file by utf-8
+  find:
+    paths: "{{ remote_tmp_dir_test }}"
+    patterns: "*.gbk"
+    contains: "你好世界"
+    encoding: "utf-8"
+  register: fail_to_read_wrong_encoding_file
+
+- debug: var=fail_to_read_wrong_encoding_file
+
+- assert:
+      that:
+          - fail_to_read_wrong_encoding_file.msg == 'Not all paths examined, check warnings for details'
+          - >-
+              fail_to_read_wrong_encoding_file.skipped_paths[remote_tmp_dir_test] == 
+              ("Failed to read the file %s/hello_world.gbk due to an encoding error. current encoding: utf-8" % (remote_tmp_dir_test))
+
+- name: read a gbk file by gbk
+  find:
+    paths: "{{ remote_tmp_dir_test }}"
+    encoding: "gbk"
+    patterns: "*.gbk"
+    contains: "你好世界"
+  register: success_to_read_right_encoding_file
+
+- debug: var=success_to_read_right_encoding_file
+
+- assert:
+      that:
+          - success_to_read_right_encoding_file.matched == 1
+
+- name: read a gbk file by non-exists encoding
+  find:
+    paths: "{{ remote_tmp_dir_test }}"
+    encoding: "idontexist"
+    patterns: "*.gbk"
+    contains: "你好世界"
+  register: fail_to_search_file_by_non_exists_encoding
+
+- debug: var=fail_to_search_file_by_non_exists_encoding
+
+- assert:
+      that:
+          - fail_to_search_file_by_non_exists_encoding.skipped_paths[remote_tmp_dir_test] == "unknown encoding: idontexist"
+
 - block:
    - set_fact:
        mypath: /idontexist{{lookup('pipe', 'mktemp')}}
@ -221,8 +267,8 @@

    - assert:
        that:
-          - total_contents.matched == 18
-          - total_contents.examined == 18
+          - total_contents.matched == 19
+          - total_contents.examined == 19

    - name: Get files and directories with depth
      find:
@ -234,10 +280,10 @@

    - assert:
        that:
-          - contents_with_depth.matched == 8
+          - contents_with_depth.matched == 9
          # dir contents are considered until the depth exceeds the requested depth
          # there are 9 files/directories in the requested depth and 4 that exceed it by 1
-          - contents_with_depth.examined == 12
+          - contents_with_depth.examined == 13

    - name: Find files with depth
      find:
@ -248,10 +294,10 @@

    - assert:
        that:
-          - files_with_depth.matched == 4
+          - files_with_depth.matched == 5
          # dir contents are considered until the depth exceeds the requested depth
          # there are 9 files/directories in the requested depth and 4 that exceed it by 1
-          - files_with_depth.examined == 12
+          - files_with_depth.examined == 13

 - name: exclude with regex
  find:
--- a/test/sanity/ignore.txt
+++ b/test/sanity/ignore.txt
@ -197,3 +197,5 @@ README.md pymarkdown:line-length
 test/integration/targets/ansible-vault/invalid_format/README.md pymarkdown:no-bare-urls
 test/support/README.md pymarkdown:no-bare-urls
 test/units/cli/test_data/role_skeleton/README.md pymarkdown:line-length
+test/integration/targets/find/files/hello_world.gbk no-smart-quotes
+test/integration/targets/find/files/hello_world.gbk no-unwanted-characters