ansible-galaxy collection install retry improvements (#80180)

* clog frag

* Fix retries so that each explicit call to _call_galaxy is retried for the correct number of attempts. Fixes #80174

* Extend retry logic to common URL related connection errors. Fixes #80170

* Extend retries to downloading artifacts

* Extend param docs for change

* Rework the exception handling

* Don't be overly broad, reduce to TimeoutError, and BadStatusLine for now

* _download_file needs to raise AnsibleError.orig_exc

* Remove unused import

* Add IncompleteRead

* Add socket.timeout for py39

* Add 502 to retry codes

* Move http error code checking first

* Use itertools.tee to replay the backoff_iterator instead of using a callable

* Actually set a CLI default of 60s for timeout, to prevent implicit galaxy from using 10s as default from Request.open

* Import typing

* fix type hints

* Use http.HTTPStatus instead of int HTTP error codes where feasible

* Split exception handling

Co-authored-by: Sviatoslav Sydorenko <wk.cvs.github@sydorenko.org.ua>

* Add missing import

---------

Co-authored-by: Sviatoslav Sydorenko <wk.cvs.github@sydorenko.org.ua>
pull/80278/head
Matt Martz 2 years ago committed by GitHub
parent cba3952434
commit 2ae013667e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,3 @@
bugfixes:
- ansible-galaxy - Improve retries for collection installs, to properly retry, and extend retry logic to common URL related connection errors
(https://github.com/ansible/ansible/issues/80170 https://github.com/ansible/ansible/issues/80174)

@ -244,7 +244,7 @@ class GalaxyCLI(CLI):
help='The Ansible Galaxy API key which can be found at '
'https://galaxy.ansible.com/me/preferences.')
common.add_argument('-c', '--ignore-certs', action='store_true', dest='ignore_certs', help='Ignore SSL certificate validation errors.', default=None)
common.add_argument('--timeout', dest='timeout', type=int,
common.add_argument('--timeout', dest='timeout', type=int, default=60,
help="The time to wait for operations against the galaxy server, defaults to 60s.")
opt_help.add_verbosity_options(common)

@ -11,12 +11,15 @@ import functools
import hashlib
import json
import os
import socket
import stat
import tarfile
import time
import threading
from urllib.error import HTTPError
from http import HTTPStatus
from http.client import BadStatusLine, IncompleteRead
from urllib.error import HTTPError, URLError
from urllib.parse import quote as urlquote, urlencode, urlparse, parse_qs, urljoin
from ansible import constants as C
@ -34,10 +37,11 @@ from ansible.utils.path import makedirs_safe
display = Display()
_CACHE_LOCK = threading.Lock()
COLLECTION_PAGE_SIZE = 100
RETRY_HTTP_ERROR_CODES = [ # TODO: Allow user-configuration
429, # Too Many Requests
RETRY_HTTP_ERROR_CODES = { # TODO: Allow user-configuration
HTTPStatus.TOO_MANY_REQUESTS,
520, # Galaxy rate limit error code (Cloudflare unknown error)
]
HTTPStatus.BAD_GATEWAY, # Common error from galaxy that may represent any number of transient backend issues
}
def cache_lock(func):
@ -48,11 +52,24 @@ def cache_lock(func):
return wrapped
def is_rate_limit_exception(exception):
def should_retry_error(exception):
# Note: cloud.redhat.com masks rate limit errors with 403 (Forbidden) error codes.
# Since 403 could reflect the actual problem (such as an expired token), we should
# not retry by default.
return isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES
if isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES:
return True
if isinstance(exception, AnsibleError) and (orig_exc := getattr(exception, 'orig_exc', None)):
# URLError is often a proxy for an underlying error, handle wrapped exceptions
if isinstance(orig_exc, URLError):
orig_exc = orig_exc.reason
# Handle common URL related errors such as TimeoutError, and BadStatusLine
# Note: socket.timeout is only required for Py3.9
if isinstance(orig_exc, (TimeoutError, BadStatusLine, IncompleteRead, socket.timeout)):
return True
return False
def g_connect(versions):
@ -327,7 +344,7 @@ class GalaxyAPI:
@retry_with_delays_and_condition(
backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40),
should_retry_error=is_rate_limit_exception
should_retry_error=should_retry_error
)
def _call_galaxy(self, url, args=None, headers=None, method=None, auth_required=False, error_context_msg=None,
cache=False, cache_key=None):
@ -385,7 +402,10 @@ class GalaxyAPI:
except HTTPError as e:
raise GalaxyError(e, error_context_msg)
except Exception as e:
raise AnsibleError("Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e)))
raise AnsibleError(
"Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e)),
orig_exc=e
)
resp_data = to_text(resp.read(), errors='surrogate_or_strict')
try:

@ -27,9 +27,12 @@ if t.TYPE_CHECKING:
from ansible.errors import AnsibleError
from ansible.galaxy import get_collections_galaxy_meta_info
from ansible.galaxy.api import should_retry_error
from ansible.galaxy.dependency_resolution.dataclasses import _GALAXY_YAML
from ansible.galaxy.user_agent import user_agent
from ansible.module_utils._text import to_bytes, to_native, to_text
from ansible.module_utils.api import retry_with_delays_and_condition
from ansible.module_utils.api import generate_jittered_backoff
from ansible.module_utils.common.process import get_bin_path
from ansible.module_utils.common.yaml import yaml_load
from ansible.module_utils.urls import open_url
@ -163,6 +166,16 @@ class ConcreteArtifactsManager:
download_err=to_native(err),
),
) from err
except Exception as err:
raise AnsibleError(
'Failed to download collection tar '
"from '{coll_src!s}' due to the following unforeseen error: "
'{download_err!s}'.
format(
coll_src=to_native(collection.src),
download_err=to_native(err),
),
) from err
else:
display.vvv(
"Collection '{coll!s}' obtained from "
@ -440,6 +453,10 @@ def _extract_collection_from_git(repo_url, coll_ver, b_path):
# FIXME: use random subdirs while preserving the file names
@retry_with_delays_and_condition(
backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40),
should_retry_error=should_retry_error
)
def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeout=60):
# type: (str, bytes, t.Optional[str], bool, GalaxyToken, int) -> bytes
# ^ NOTE: used in download and verify_collections ^
@ -458,6 +475,7 @@ def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeo
display.display("Downloading %s to %s" % (url, to_text(b_tarball_dir)))
# NOTE: Galaxy redirects downloads to S3 which rejects the request
# NOTE: if an Authorization header is attached so don't redirect it
try:
resp = open_url(
to_native(url, errors='surrogate_or_strict'),
validate_certs=validate_certs,
@ -465,6 +483,8 @@ def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeo
unredirected_headers=['Authorization'], http_agent=user_agent(),
timeout=timeout
)
except Exception as err:
raise AnsibleError(to_native(err), orig_exc=err)
with open(b_file_path, 'wb') as download_file: # type: t.BinaryIO
actual_hash = _consume_file(resp, write_to=download_file)

@ -26,11 +26,15 @@ The 'api' module provides the following common argument specs:
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
import copy
import functools
import itertools
import random
import sys
import time
import ansible.module_utils.compat.typing as t
def rate_limit_argument_spec(spec=None):
"""Creates an argument spec for working with rate limiting"""
@ -141,6 +145,15 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None):
:param backoff_iterator: An iterable of delays in seconds.
:param should_retry_error: A callable that takes an exception of the decorated function and decides whether to retry or not (returns a bool).
"""
def _emit_isolated_iterator_copies(original_iterator): # type: (t.Iterable[t.Any]) -> t.Generator
# Ref: https://stackoverflow.com/a/30232619/595220
_copiable_iterator, _first_iterator_copy = itertools.tee(original_iterator)
yield _first_iterator_copy
while True:
yield copy.copy(_copiable_iterator)
backoff_iterator_generator = _emit_isolated_iterator_copies(backoff_iterator)
del backoff_iterator # prevent accidental use elsewhere
if should_retry_error is None:
should_retry_error = retry_never
@ -152,7 +165,7 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None):
"""
call_retryable_function = functools.partial(function, *args, **kwargs)
for delay in backoff_iterator:
for delay in next(backoff_iterator_generator):
try:
return call_retryable_function()
except Exception as e:

Loading…
Cancel
Save