From 2ae013667ef226635fe521be886efd1bf58cd46f Mon Sep 17 00:00:00 2001 From: Matt Martz Date: Wed, 22 Mar 2023 11:04:56 -0500 Subject: [PATCH] ansible-galaxy collection install retry improvements (#80180) * clog frag * Fix retries so that each explicit call to _call_galaxy is retried for the correct number of attempts. Fixes #80174 * Extend retry logic to common URL related connection errors. Fixes #80170 * Extend retries to downloading artifacts * Extend param docs for change * Rework the exception handling * Don't be overly broad, reduce to TimeoutError, and BadStatusLine for now * _download_file needs to raise AnsibleError.orig_exc * Remove unused import * Add IncompleteRead * Add socket.timeout for py39 * Add 502 to retry codes * Move http error code checking first * Use itertools.tee to replay the backoff_iterator instead of using a callable * Actually set a CLI default of 60s for timeout, to prevent implicit galaxy from using 10s as default from Request.open * Import typing * fix type hints * Use http.HTTPStatus instead of int HTTP error codes where feasible * Split exception handling Co-authored-by: Sviatoslav Sydorenko * Add missing import --------- Co-authored-by: Sviatoslav Sydorenko --- .../fragments/galaxy-improve-retries.yml | 3 ++ lib/ansible/cli/galaxy.py | 2 +- lib/ansible/galaxy/api.py | 36 ++++++++++++++----- .../collection/concrete_artifact_manager.py | 34 ++++++++++++++---- lib/ansible/module_utils/api.py | 15 +++++++- 5 files changed, 73 insertions(+), 17 deletions(-) create mode 100644 changelogs/fragments/galaxy-improve-retries.yml diff --git a/changelogs/fragments/galaxy-improve-retries.yml b/changelogs/fragments/galaxy-improve-retries.yml new file mode 100644 index 00000000000..ab0c7b216c7 --- /dev/null +++ b/changelogs/fragments/galaxy-improve-retries.yml @@ -0,0 +1,3 @@ +bugfixes: +- ansible-galaxy - Improve retries for collection installs, to properly retry, and extend retry logic to common URL related connection errors + (https://github.com/ansible/ansible/issues/80170 https://github.com/ansible/ansible/issues/80174) diff --git a/lib/ansible/cli/galaxy.py b/lib/ansible/cli/galaxy.py index d5501f893fc..96b071e0066 100755 --- a/lib/ansible/cli/galaxy.py +++ b/lib/ansible/cli/galaxy.py @@ -244,7 +244,7 @@ class GalaxyCLI(CLI): help='The Ansible Galaxy API key which can be found at ' 'https://galaxy.ansible.com/me/preferences.') common.add_argument('-c', '--ignore-certs', action='store_true', dest='ignore_certs', help='Ignore SSL certificate validation errors.', default=None) - common.add_argument('--timeout', dest='timeout', type=int, + common.add_argument('--timeout', dest='timeout', type=int, default=60, help="The time to wait for operations against the galaxy server, defaults to 60s.") opt_help.add_verbosity_options(common) diff --git a/lib/ansible/galaxy/api.py b/lib/ansible/galaxy/api.py index 8dea804953b..03d9104f5c9 100644 --- a/lib/ansible/galaxy/api.py +++ b/lib/ansible/galaxy/api.py @@ -11,12 +11,15 @@ import functools import hashlib import json import os +import socket import stat import tarfile import time import threading -from urllib.error import HTTPError +from http import HTTPStatus +from http.client import BadStatusLine, IncompleteRead +from urllib.error import HTTPError, URLError from urllib.parse import quote as urlquote, urlencode, urlparse, parse_qs, urljoin from ansible import constants as C @@ -34,10 +37,11 @@ from ansible.utils.path import makedirs_safe display = Display() _CACHE_LOCK = threading.Lock() COLLECTION_PAGE_SIZE = 100 -RETRY_HTTP_ERROR_CODES = [ # TODO: Allow user-configuration - 429, # Too Many Requests +RETRY_HTTP_ERROR_CODES = { # TODO: Allow user-configuration + HTTPStatus.TOO_MANY_REQUESTS, 520, # Galaxy rate limit error code (Cloudflare unknown error) -] + HTTPStatus.BAD_GATEWAY, # Common error from galaxy that may represent any number of transient backend issues +} def cache_lock(func): @@ -48,11 +52,24 @@ def cache_lock(func): return wrapped -def is_rate_limit_exception(exception): +def should_retry_error(exception): # Note: cloud.redhat.com masks rate limit errors with 403 (Forbidden) error codes. # Since 403 could reflect the actual problem (such as an expired token), we should # not retry by default. - return isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES + if isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES: + return True + + if isinstance(exception, AnsibleError) and (orig_exc := getattr(exception, 'orig_exc', None)): + # URLError is often a proxy for an underlying error, handle wrapped exceptions + if isinstance(orig_exc, URLError): + orig_exc = orig_exc.reason + + # Handle common URL related errors such as TimeoutError, and BadStatusLine + # Note: socket.timeout is only required for Py3.9 + if isinstance(orig_exc, (TimeoutError, BadStatusLine, IncompleteRead, socket.timeout)): + return True + + return False def g_connect(versions): @@ -327,7 +344,7 @@ class GalaxyAPI: @retry_with_delays_and_condition( backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40), - should_retry_error=is_rate_limit_exception + should_retry_error=should_retry_error ) def _call_galaxy(self, url, args=None, headers=None, method=None, auth_required=False, error_context_msg=None, cache=False, cache_key=None): @@ -385,7 +402,10 @@ class GalaxyAPI: except HTTPError as e: raise GalaxyError(e, error_context_msg) except Exception as e: - raise AnsibleError("Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e))) + raise AnsibleError( + "Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e)), + orig_exc=e + ) resp_data = to_text(resp.read(), errors='surrogate_or_strict') try: diff --git a/lib/ansible/galaxy/collection/concrete_artifact_manager.py b/lib/ansible/galaxy/collection/concrete_artifact_manager.py index aa3a190ce8f..4c60642770f 100644 --- a/lib/ansible/galaxy/collection/concrete_artifact_manager.py +++ b/lib/ansible/galaxy/collection/concrete_artifact_manager.py @@ -27,9 +27,12 @@ if t.TYPE_CHECKING: from ansible.errors import AnsibleError from ansible.galaxy import get_collections_galaxy_meta_info +from ansible.galaxy.api import should_retry_error from ansible.galaxy.dependency_resolution.dataclasses import _GALAXY_YAML from ansible.galaxy.user_agent import user_agent from ansible.module_utils._text import to_bytes, to_native, to_text +from ansible.module_utils.api import retry_with_delays_and_condition +from ansible.module_utils.api import generate_jittered_backoff from ansible.module_utils.common.process import get_bin_path from ansible.module_utils.common.yaml import yaml_load from ansible.module_utils.urls import open_url @@ -163,6 +166,16 @@ class ConcreteArtifactsManager: download_err=to_native(err), ), ) from err + except Exception as err: + raise AnsibleError( + 'Failed to download collection tar ' + "from '{coll_src!s}' due to the following unforeseen error: " + '{download_err!s}'. + format( + coll_src=to_native(collection.src), + download_err=to_native(err), + ), + ) from err else: display.vvv( "Collection '{coll!s}' obtained from " @@ -440,6 +453,10 @@ def _extract_collection_from_git(repo_url, coll_ver, b_path): # FIXME: use random subdirs while preserving the file names +@retry_with_delays_and_condition( + backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40), + should_retry_error=should_retry_error +) def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeout=60): # type: (str, bytes, t.Optional[str], bool, GalaxyToken, int) -> bytes # ^ NOTE: used in download and verify_collections ^ @@ -458,13 +475,16 @@ def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeo display.display("Downloading %s to %s" % (url, to_text(b_tarball_dir))) # NOTE: Galaxy redirects downloads to S3 which rejects the request # NOTE: if an Authorization header is attached so don't redirect it - resp = open_url( - to_native(url, errors='surrogate_or_strict'), - validate_certs=validate_certs, - headers=None if token is None else token.headers(), - unredirected_headers=['Authorization'], http_agent=user_agent(), - timeout=timeout - ) + try: + resp = open_url( + to_native(url, errors='surrogate_or_strict'), + validate_certs=validate_certs, + headers=None if token is None else token.headers(), + unredirected_headers=['Authorization'], http_agent=user_agent(), + timeout=timeout + ) + except Exception as err: + raise AnsibleError(to_native(err), orig_exc=err) with open(b_file_path, 'wb') as download_file: # type: t.BinaryIO actual_hash = _consume_file(resp, write_to=download_file) diff --git a/lib/ansible/module_utils/api.py b/lib/ansible/module_utils/api.py index e780ec6b50f..2de8a4efc14 100644 --- a/lib/ansible/module_utils/api.py +++ b/lib/ansible/module_utils/api.py @@ -26,11 +26,15 @@ The 'api' module provides the following common argument specs: from __future__ import (absolute_import, division, print_function) __metaclass__ = type +import copy import functools +import itertools import random import sys import time +import ansible.module_utils.compat.typing as t + def rate_limit_argument_spec(spec=None): """Creates an argument spec for working with rate limiting""" @@ -141,6 +145,15 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None): :param backoff_iterator: An iterable of delays in seconds. :param should_retry_error: A callable that takes an exception of the decorated function and decides whether to retry or not (returns a bool). """ + def _emit_isolated_iterator_copies(original_iterator): # type: (t.Iterable[t.Any]) -> t.Generator + # Ref: https://stackoverflow.com/a/30232619/595220 + _copiable_iterator, _first_iterator_copy = itertools.tee(original_iterator) + yield _first_iterator_copy + while True: + yield copy.copy(_copiable_iterator) + backoff_iterator_generator = _emit_isolated_iterator_copies(backoff_iterator) + del backoff_iterator # prevent accidental use elsewhere + if should_retry_error is None: should_retry_error = retry_never @@ -152,7 +165,7 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None): """ call_retryable_function = functools.partial(function, *args, **kwargs) - for delay in backoff_iterator: + for delay in next(backoff_iterator_generator): try: return call_retryable_function() except Exception as e: