Add timeout support to ansible-test. (#53302)

* Add timeout support to ansible-test. * Fix ansible-test tar filename filter bug. * Update timeouts used on Shippable. * Kill subprocesses when parent process terminates. * Require explicit use of env --show option.
6 years ago · a8d829d9c3
parent 44b347aef5
commit a8d829d9c3
9 changed files with 219 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@ -105,3 +105,4 @@ test/units/.coverage.*
 /test/integration/cloud-config-azure.yml
 /SYMLINK_CACHE.json
 changelogs/.plugin-cache.yaml
 .ansible-test-timeout.json
--- a/test/runner/lib/cli.py
+++ b/test/runner/lib/cli.py
@ -52,6 +52,7 @@ from lib.config import (
 from lib.env import (
    EnvConfig,
    command_env,
    configure_timeout,
 )
 from lib.sanity import (
@ -95,6 +96,7 @@ def main():
        display.color = config.color
        display.info_stderr = (isinstance(config, SanityConfig) and config.lint) or (isinstance(config, IntegrationConfig) and config.list_targets)
        check_startup()
        configure_timeout(config)
        display.info('RLIMIT_NOFILE: %s' % (CURRENT_RLIMIT_NOFILE,), verbosity=2)
        display.info('MAXFD: %d' % MAXFD, verbosity=2)
@ -509,6 +511,11 @@ def parse_args():
                     action='store_true',
                     help='dump environment to disk')
    env.add_argument('--timeout',
                     type=int,
                     metavar='MINUTES',
                     help='timeout for future ansible-test commands (0 clears)')
    if argcomplete:
        argcomplete.autocomplete(parser, always_complete_options=False, validator=lambda i, k: True)
--- a/test/runner/lib/constants.py
+++ b/test/runner/lib/constants.py
@ -3,3 +3,6 @@
 # Setting a low soft RLIMIT_NOFILE value will improve the performance of subprocess.Popen on Python 2.x when close_fds=True.
 # This will affect all Python subprocesses. It will also affect the current Python process if set before subprocess is imported for the first time.
 SOFT_RLIMIT_NOFILE = 1024
 # File used to track the ansible-test test execution timeout.
 TIMEOUT_PATH = '.ansible-test-timeout.json'
--- a/test/runner/lib/env.py
+++ b/test/runner/lib/env.py
@ -4,13 +4,17 @@ from __future__ import absolute_import, print_function
 import datetime
 import json
 import functools
 import os
 import platform
 import re
 import signal
 import sys
 import time
 from lib.config import (
    CommonConfig,
    TestConfig,
 )
 from lib.util import (
@ -34,6 +38,18 @@ from lib.docker_util import (
    docker_version
 )
 from lib.thread import (
    WrappedThread,
 )
 from lib.constants import (
    TIMEOUT_PATH,
 )
 from lib.test import (
    TestTimeout,
 )
 class EnvConfig(CommonConfig):
    """Configuration for the tools command."""
@ -43,14 +59,26 @@ class EnvConfig(CommonConfig):
        """
        super(EnvConfig, self).__init__(args, 'env')
-        self.show = args.show or not args.dump
+        self.show = args.show
        self.dump = args.dump
        self.timeout = args.timeout
 def command_env(args):
    """
    :type args: EnvConfig
    """
    show_dump_env(args)
    set_timeout(args)
 def show_dump_env(args):
    """
    :type args: EnvConfig
    """
    if not args.show and not args.dump:
        return
    data = dict(
        ansible=dict(
            version=get_ansible_version(args),
@ -84,6 +112,105 @@ def command_env(args):
            results_fd.write(json.dumps(data, sort_keys=True))
 def set_timeout(args):
    """
    :type args: EnvConfig
    """
    if args.timeout is None:
        return
    if args.timeout:
        deadline = (datetime.datetime.utcnow() + datetime.timedelta(minutes=args.timeout)).strftime('%Y-%m-%dT%H:%M:%SZ')
        display.info('Setting a %d minute test timeout which will end at: %s' % (args.timeout, deadline), verbosity=1)
    else:
        deadline = None
        display.info('Clearing existing test timeout.', verbosity=1)
    if args.explain:
        return
    if deadline:
        data = dict(
            duration=args.timeout,
            deadline=deadline,
        )
        with open(TIMEOUT_PATH, 'w') as timeout_fd:
            json.dump(data, timeout_fd, indent=4, sort_keys=True)
    elif os.path.exists(TIMEOUT_PATH):
        os.remove(TIMEOUT_PATH)
 def get_timeout():
    """
    :rtype: dict[str, any] | None
    """
    if not os.path.exists(TIMEOUT_PATH):
        return None
    with open(TIMEOUT_PATH, 'r') as timeout_fd:
        data = json.load(timeout_fd)
    data['deadline'] = datetime.datetime.strptime(data['deadline'], '%Y-%m-%dT%H:%M:%SZ')
    return data
 def configure_timeout(args):
    """
    :type args: CommonConfig
    """
    if isinstance(args, TestConfig):
        configure_test_timeout(args)  # only tests are subject to the timeout
 def configure_test_timeout(args):
    """
    :type args: TestConfig
    """
    timeout = get_timeout()
    if not timeout:
        return
    timeout_start = datetime.datetime.utcnow()
    timeout_duration = timeout['duration']
    timeout_deadline = timeout['deadline']
    timeout_remaining = timeout_deadline - timeout_start
    test_timeout = TestTimeout(timeout_duration)
    if timeout_remaining <= datetime.timedelta():
        test_timeout.write(args)
        raise ApplicationError('The %d minute test timeout expired %s ago at %s.' % (
            timeout_duration, timeout_remaining * -1, timeout_deadline))
    display.info('The %d minute test timeout expires in %s at %s.' % (
        timeout_duration, timeout_remaining, timeout_deadline), verbosity=1)
    def timeout_handler(_dummy1, _dummy2):
        """Runs when SIGUSR1 is received."""
        test_timeout.write(args)
        raise ApplicationError('Tests aborted after exceeding the %d minute time limit.' % timeout_duration)
    def timeout_waiter(timeout_seconds):
        """
        :type timeout_seconds: int
        """
        time.sleep(timeout_seconds)
        os.kill(os.getpid(), signal.SIGUSR1)
    signal.signal(signal.SIGUSR1, timeout_handler)
    instance = WrappedThread(functools.partial(timeout_waiter, timeout_remaining.seconds))
    instance.daemon = True
    instance.start()
 def show_dict(data, verbose, root_verbosity=0, path=None):
    """
    :type data: dict[str, any]
--- a/test/runner/lib/pytar.py
+++ b/test/runner/lib/pytar.py
@ -11,6 +11,10 @@ from lib.util import (
    ABC,
 )
 from lib.constants import (
    TIMEOUT_PATH,
 )
 # improve performance by disabling uid/gid lookups
 tarfile.pwd = None
 tarfile.grp = None
@ -45,6 +49,7 @@ class DefaultTarFilter(TarFilter):
        self.ignore_files = (
            '.gitignore',
            '.gitdir',
            TIMEOUT_PATH,
        )
        self.ignore_extensions = (
@ -58,7 +63,7 @@ class DefaultTarFilter(TarFilter):
        :rtype: tarfile.TarInfo | None
        """
        filename = os.path.basename(item.path)
-        name, ext = os.path.splitext(filename)
+        ext = os.path.splitext(filename)[1]
        dirs = os.path.split(item.path)
        if not item.isdir():
@ -68,7 +73,7 @@ class DefaultTarFilter(TarFilter):
            if item.path.startswith('./docs/docsite/_build/'):
                return None
-        if name in self.ignore_files:
+        if filename in self.ignore_files:
            return None
        if ext in self.ignore_extensions:
--- a/test/runner/lib/test.py
+++ b/test/runner/lib/test.py
@ -57,15 +57,17 @@ def calculate_confidence(path, line, metadata):
 class TestResult(object):
    """Base class for test results."""
-    def __init__(self, command, test, python_version=None):
+    def __init__(self, command, test, python_version=None, enable_junit=False):
        """
        :type command: str
        :type test: str
        :type python_version: str
        :type enable_junit: bool
        """
        self.command = command
        self.test = test
        self.python_version = python_version
        self.enable_junit = enable_junit
        self.name = self.test or self.command
        if self.python_version:
@ -88,7 +90,7 @@ class TestResult(object):
        if args.lint:
            self.write_lint()
-        if args.junit:
+        if args.junit or self.enable_junit:
            if self.junit:
                self.write_junit(args)
            else:
@ -159,6 +161,38 @@ class TestResult(object):
            xml.write(report.encode('utf-8', 'strict'))
 class TestTimeout(TestResult):
    """Test timeout."""
    def __init__(self, timeout_duration):
        """
        :type timeout_duration: int
        """
        super(TestTimeout, self).__init__(command='timeout', test='', enable_junit=True)
        self.timeout_duration = timeout_duration
    def write_junit(self, args):
        """
        :type args: TestConfig
        """
        message = 'Tests were aborted after exceeding the %d minute time limit.' % self.timeout_duration
        output = '''One or more of the following situations may be responsible:
 - Code changes have resulted in tests that hang or run for an excessive amount of time.
 - Tests have been added which exceed the time limit when combined with existing tests.
 - Test infrastructure and/or external dependencies are operating slower than normal.'''
        if args.coverage:
            output += '\n- Additional overhead from collecting code coverage has resulted in tests exceeding the time limit.'
        output += '\n\nConsult the console log for additional details on where the timeout occurred.'
        test_case = self.junit.TestCase(classname=self.command, name=self.name)
        test_case.add_error_info(message, output)
        self.save_junit(args, test_case)
 class TestSuccess(TestResult):
    """Test success."""
    def write_junit(self, args):
--- a/test/runner/lib/util.py
+++ b/test/runner/lib/util.py
@ -380,7 +380,9 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
        stderr = None
    start = time.time()
    process = None
    try:
        try:
            process = subprocess.Popen(cmd, env=env, stdin=stdin, stdout=stdout, stderr=stderr, cwd=cwd)
        except OSError as ex:
@ -397,6 +399,11 @@ def raw_command(cmd, capture=False, env=None, data=None, cwd=None, explain=False
        else:
            process.wait()
            stdout_text, stderr_text = None, None
    finally:
        if process and process.returncode is None:
            process.kill()
            display.info('')  # the process we're interrupting may have completed a partial line of output
            display.notice('Killed command to avoid an orphaned child process during handling of an unexpected exception.')
    status = process.returncode
    runtime = time.time() - start
--- a/test/utils/shippable/shippable.sh
+++ b/test/utils/shippable/shippable.sh
@ -116,6 +116,12 @@ function cleanup
 trap cleanup EXIT
-ansible-test env --dump --show --color -v
+if [[ "${COVERAGE:-}" ]]; then
    timeout=60
 else
    timeout=45
 fi
 ansible-test env --dump --show --timeout "${timeout}" --color -v
 "test/utils/shippable/${script}.sh" "${test}"
--- a/test/utils/shippable/units.sh
+++ b/test/utils/shippable/units.sh
@ -7,5 +7,13 @@ IFS='/:' read -ra args <<< "$1"
 version="${args[1]}"
 if [[ "${COVERAGE:-}" ]]; then
    timeout=90
 else
    timeout=10
 fi
 ansible-test env --timeout "${timeout}" --color -v
 # shellcheck disable=SC2086
 ansible-test units --color -v --docker default --python "${version}" ${COVERAGE:+"$COVERAGE"} ${CHANGED:+"$CHANGED"} \