ansible-test - Improve container startup handling.

Also improve the ansible-test-container integration test:

- Add coverage for the no-probe code path.
- Add work-arounds for centos6 containers (to support backporting).
- Avoid systemd debug when the container doesn't use cgroup.
1 year ago · 04fc98c794
parent 69c874f478
commit 04fc98c794
3 changed files with 76 additions and 9 deletions
--- a/changelogs/fragments/ansible-test-container-management.yml
+++ b/changelogs/fragments/ansible-test-container-management.yml
@@ -56,6 +56,8 @@ bugfixes:
  - ansible-test - Detection for running in a Podman or Docker container has been fixed to detect more scenarios.
                   The new detection relies on ``/proc/self/mountinfo`` instead of ``/proc/self/cpuset``.
                   Detection now works with custom cgroups and private cgroup namespaces.
+  - ansible-test - Avoid using ``exec`` after container startup when possible.
+                   This improves container startup performance and avoids intermittent startup issues with some old containers.
 known_issues:
  - ansible-test - Using Docker on systems with SELinux may require setting SELinux to permissive mode.
                   Podman should work with SELinux in enforcing mode.
--- a/test/integration/targets/ansible-test-container/runme.py
+++ b/test/integration/targets/ansible-test-container/runme.py
@@ -149,10 +149,29 @@ def get_test_scenarios() -> list[TestScenario]:
        image = settings['image']
        cgroup = settings.get('cgroup', 'v1-v2')

+        if container_name == 'centos6' and os_release.id == 'alpine':
+            # Alpine kernels do not emulate vsyscall by default, which causes the centos6 container to fail during init.
+            # See: https://unix.stackexchange.com/questions/478387/running-a-centos-docker-image-on-arch-linux-exits-with-code-139
+            # Other distributions enable settings which trap vsyscall by default.
+            # See: https://www.kernelconfig.io/config_legacy_vsyscall_xonly
+            # See: https://www.kernelconfig.io/config_legacy_vsyscall_emulate
+            continue
+
        for engine in available_engines:
            # TODO: figure out how to get tests passing using docker without disabling selinux
            disable_selinux = os_release.id == 'fedora' and engine == 'docker' and cgroup != 'none'
            expose_cgroup_v1 = cgroup == 'v1-only' and get_docker_info(engine).cgroup_version != 1
+            debug_systemd = cgroup != 'none'
+
+            # The sleep+pkill used to support the cgroup probe causes problems with the centos6 container.
+            # It results in sshd connections being refused or reset for many, but not all, container instances.
+            # The underlying cause of this issue is unknown.
+            probe_cgroups = container_name != 'centos6'
+
+            # The default RHEL 9 crypto policy prevents use of SHA-1.
+            # This results in SSH errors with centos6 containers: ssh_dispatch_run_fatal: Connection to 1.2.3.4 port 22: error in libcrypto
+            # See: https://access.redhat.com/solutions/6816771
+            enable_sha1 = os_release.id == 'rhel' and os_release.version_id.startswith('9.') and container_name == 'centos6'

            if cgroup != 'none' and get_docker_info(engine).cgroup_version == 1 and not have_cgroup_systemd():
                expose_cgroup_v1 = True  # the host uses cgroup v1 but there is no systemd cgroup and the container requires cgroup support
@@ -182,6 +201,9 @@ def get_test_scenarios() -> list[TestScenario]:
                        image=image,
                        disable_selinux=disable_selinux,
                        expose_cgroup_v1=expose_cgroup_v1,
+                        enable_sha1=enable_sha1,
+                        debug_systemd=debug_systemd,
+                        probe_cgroups=probe_cgroups,
                    )
                )

@@ -195,11 +217,21 @@ def run_test(scenario: TestScenario) -> TestResult:
    start = time.monotonic()

    integration = ['ansible-test', 'integration', 'split']
-    integration_options = ['--target', f'docker:{scenario.container_name}', '--color', '--truncate', '0', '-v', '--dev-probe-cgroups', str(LOG_PATH),
-                           '--dev-systemd-debug']
+    integration_options = ['--target', f'docker:{scenario.container_name}', '--color', '--truncate', '0', '-v']
+    target_only_options = []
+
+    if scenario.debug_systemd:
+        integration_options.append('--dev-systemd-debug')
+
+    if scenario.probe_cgroups:
+        target_only_options = ['--dev-probe-cgroups', str(LOG_PATH)]

    commands = [
-        [*integration, *integration_options],
+        # The cgroup probe is only performed for the first test of the target.
+        # There's no need to repeat the probe again for the same target.
+        # The controller will be tested separately as a target.
+        # This ensures that both the probe and no-probe code paths are functional.
+        [*integration, *integration_options, *target_only_options],
        # For the split test we'll use alpine3 as the controller. There are two reasons for this:
        # 1) It doesn't require the cgroup v1 hack, so we can test a target that doesn't need that.
        # 2) It doesn't require disabling selinux, so we can test a target that doesn't need that.
@@ -260,12 +292,18 @@ def run_test(scenario: TestScenario) -> TestResult:
        if scenario.disable_selinux:
            run_command('setenforce', 'permissive')

+        if scenario.enable_sha1:
+            run_command('update-crypto-policies', '--set', 'DEFAULT:SHA1')
+
        for test_command in test_commands:
            retry_command(lambda: run_command(*test_command))
    except SubprocessError as ex:
        message = str(ex)
        display.error(f'{scenario} {message}')
    finally:
+        if scenario.enable_sha1:
+            run_command('update-crypto-policies', '--set', 'DEFAULT')
+
        if scenario.disable_selinux:
            run_command('setenforce', 'enforcing')

@@ -519,6 +557,9 @@ class TestScenario:
    image: str
    disable_selinux: bool
    expose_cgroup_v1: bool
+    enable_sha1: bool
+    debug_systemd: bool
+    probe_cgroups: bool

    @property
    def tags(self) -> tuple[str, ...]:
@@ -536,6 +577,9 @@ class TestScenario:
        if self.expose_cgroup_v1:
            tags.append('cgroup: v1')

+        if self.enable_sha1:
+            tags.append('sha1: enabled')
+
        return tuple(tags)

    @property
--- a/test/lib/ansible_test/_internal/host_profiles.py
+++ b/test/lib/ansible_test/_internal/host_profiles.py
@@ -411,6 +411,7 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        """Configuration details required to run the container init."""
        options: list[str]
        command: str
+        command_privileged: bool
        expected_mounts: tuple[CGroupMount, ...]

    @property
@@ -452,12 +453,12 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
            publish_ports=not self.controller,  # connections to the controller over SSH are not required
            options=init_config.options,
            cleanup=CleanupMode.NO,
-            cmd=self.build_sleep_command() if init_config.command or init_probe else None,
+            cmd=self.build_init_command(init_config, init_probe),
        )

        if not container:
            if self.args.prime_containers:
-                if init_config.command or init_probe:
+                if init_config.command_privileged or init_probe:
                    docker_pull(self.args, UTILITY_IMAGE)

            return
@@ -467,7 +468,7 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        try:
            options = ['--pid', 'host', '--privileged']

-            if init_config.command:
+            if init_config.command and init_config.command_privileged:
                init_command = init_config.command

                if not init_probe:
@@ -500,6 +501,7 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        """Return init config for running under Podman."""
        options = self.get_common_run_options()
        command: t.Optional[str] = None
+        command_privileged = False
        expected_mounts: tuple[CGroupMount, ...]

        cgroup_version = get_docker_info(self.args).cgroup_version
@@ -651,6 +653,7 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        return self.InitConfig(
            options=options,
            command=command,
+            command_privileged=command_privileged,
            expected_mounts=expected_mounts,
        )

@@ -658,6 +661,7 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        """Return init config for running under Docker."""
        options = self.get_common_run_options()
        command: t.Optional[str] = None
+        command_privileged = False
        expected_mounts: tuple[CGroupMount, ...]

        cgroup_version = get_docker_info(self.args).cgroup_version
@@ -724,7 +728,9 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        elif self.config.cgroup in (CGroupVersion.V1_V2, CGroupVersion.V2_ONLY) and cgroup_version == 2:
            # Docker hosts providing cgroup v2 will give each container a read-only cgroup mount.
            # It must be remounted read-write before systemd starts.
+            # This must be done in a privileged container, otherwise a "permission denied" error can occur.
            command = 'mount -o remount,rw /sys/fs/cgroup/'
+            command_privileged = True

            options.extend((
                # A private cgroup namespace is used to avoid exposing the host cgroup to the container.
@@ -768,12 +774,14 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do
        return self.InitConfig(
            options=options,
            command=command,
+            command_privileged=command_privileged,
            expected_mounts=expected_mounts,
        )

-    def build_sleep_command(self) -> list[str]:
+    def build_init_command(self, init_config: InitConfig, sleep: bool) -> t.Optional[list[str]]:
        """
-        Build and return the command to put the container to sleep.
+        Build and return the command to start in the container.
+        Returns None if the default command for the container should be used.

        The sleep duration below was selected to:

@@ -783,10 +791,23 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do

        NOTE: The container must have a POSIX-compliant default shell "sh" with a non-builtin "sleep" command.
        """
+        command = ''
+
+        if init_config.command and not init_config.command_privileged:
+            command += f'{init_config.command} && '
+
+        if sleep or init_config.command_privileged:
+            command += 'sleep 60 ; '
+
+        if not command:
+            return None
+
        docker_pull(self.args, self.config.image)
        inspect = docker_image_inspect(self.args, self.config.image)

-        return ['sh', '-c', f'sleep 60; exec {shlex.join(inspect.cmd)}']
+        command += f'exec {shlex.join(inspect.cmd)}'
+
+        return ['sh', '-c', command]

    @property
    def wake_command(self) -> list[str]: