You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ansible/test/runner/lib/core_ci.py

598 lines
19 KiB
Python

"""Access Ansible Core CI remote services."""
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
import json
import os
import traceback
import uuid
import errno
import time
import shutil
from lib.http import (
HttpClient,
HttpResponse,
HttpError,
)
from lib.util import (
ApplicationError,
make_dirs,
display,
is_shippable,
)
from lib.util_common import (
run_command,
)
from lib.config import (
EnvironmentConfig,
)
AWS_ENDPOINTS = {
'us-east-1': 'https://14blg63h2i.execute-api.us-east-1.amazonaws.com',
'us-east-2': 'https://g5xynwbk96.execute-api.us-east-2.amazonaws.com',
}
class AnsibleCoreCI:
"""Client for Ansible Core CI services."""
def __init__(self, args, platform, version, stage='prod', persist=True, load=True, name=None, provider=None):
"""
:type args: EnvironmentConfig
:type platform: str
:type version: str
:type stage: str
:type persist: bool
:type load: bool
:type name: str
"""
self.args = args
self.platform = platform
self.version = version
self.stage = stage
self.client = HttpClient(args)
self.connection = None
self.instance_id = None
self.endpoint = None
self.max_threshold = 1
self.name = name if name else '%s-%s' % (self.platform, self.version)
self.ci_key = os.path.expanduser('~/.ansible-core-ci.key')
self.resource = 'jobs'
# Assign each supported platform to one provider.
# This is used to determine the provider from the platform when no provider is specified.
providers = dict(
aws=(
'aws',
'windows',
'freebsd',
'vyos',
'junos',
'ios',
'tower',
'rhel',
'hetzner',
),
azure=(
'azure',
),
parallels=(
'osx',
),
vmware=(
'vmware'
),
)
if provider:
# override default provider selection (not all combinations are valid)
self.provider = provider
else:
for candidate in providers:
if platform in providers[candidate]:
# assign default provider based on platform
self.provider = candidate
break
for candidate in providers:
if '%s/%s' % (platform, version) in providers[candidate]:
# assign default provider based on platform and version
self.provider = candidate
break
self.path = os.path.expanduser('~/.ansible/test/instances/%s-%s-%s' % (self.name, self.provider, self.stage))
if self.provider in ('aws', 'azure'):
if self.provider != 'aws':
self.resource = self.provider
if args.remote_aws_region:
# permit command-line override of region selection
region = args.remote_aws_region
# use a dedicated CI key when overriding the region selection
self.ci_key += '.%s' % args.remote_aws_region
elif is_shippable():
# split Shippable jobs across multiple regions to maximize use of launch credits
if self.platform == 'windows':
region = 'us-east-2'
else:
region = 'us-east-1'
else:
# send all non-Shippable jobs to us-east-1 to reduce api key maintenance
region = 'us-east-1'
self.path = "%s-%s" % (self.path, region)
self.endpoints = (AWS_ENDPOINTS[region],)
self.ssh_key = SshKey(args)
if self.platform == 'windows':
self.port = 5986
else:
self.port = 22
elif self.provider == 'parallels':
self.endpoints = self._get_parallels_endpoints()
self.max_threshold = 6
self.ssh_key = SshKey(args)
self.port = None
elif self.provider == 'vmware':
self.ssh_key = SshKey(args)
self.endpoints = ['https://access.ws.testing.ansible.com']
self.max_threshold = 1
else:
raise ApplicationError('Unsupported platform: %s' % platform)
if persist and load and self._load():
try:
display.info('Checking existing %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
self.connection = self.get(always_raise_on=[404])
display.info('Loaded existing %s/%s from: %s' % (self.platform, self.version, self._uri), verbosity=1)
except HttpError as ex:
if ex.status != 404:
raise
self._clear()
display.info('Cleared stale %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
self.instance_id = None
self.endpoint = None
elif not persist:
self.instance_id = None
self.endpoint = None
self._clear()
if self.instance_id:
self.started = True
else:
self.started = False
self.instance_id = str(uuid.uuid4())
self.endpoint = None
display.sensitive.add(self.instance_id)
def _get_parallels_endpoints(self):
"""
:rtype: tuple[str]
"""
client = HttpClient(self.args, always=True)
display.info('Getting available endpoints...', verbosity=1)
sleep = 3
for _ in range(1, 10):
response = client.get('https://s3.amazonaws.com/ansible-ci-files/ansible-test/parallels-endpoints.txt')
if response.status_code == 200:
endpoints = tuple(response.response.splitlines())
display.info('Available endpoints (%d):\n%s' % (len(endpoints), '\n'.join(' - %s' % endpoint for endpoint in endpoints)), verbosity=1)
return endpoints
display.warning('HTTP %d error getting endpoints, trying again in %d seconds.' % (response.status_code, sleep))
time.sleep(sleep)
raise ApplicationError('Unable to get available endpoints.')
def start(self):
"""Start instance."""
if self.started:
display.info('Skipping started %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
return None
if is_shippable():
return self.start_shippable()
return self.start_remote()
def start_remote(self):
"""Start instance for remote development/testing."""
with open(self.ci_key, 'r') as key_fd:
auth_key = key_fd.read().strip()
return self._start(dict(
remote=dict(
key=auth_key,
nonce=None,
),
))
def start_shippable(self):
"""Start instance on Shippable."""
return self._start(dict(
shippable=dict(
run_id=os.environ['SHIPPABLE_BUILD_ID'],
job_number=int(os.environ['SHIPPABLE_JOB_NUMBER']),
),
))
def stop(self):
"""Stop instance."""
if not self.started:
display.info('Skipping invalid %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
return
response = self.client.delete(self._uri)
if response.status_code == 404:
self._clear()
display.info('Cleared invalid %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
return
if response.status_code == 200:
self._clear()
display.info('Stopped running %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
return
raise self._create_http_error(response)
def get(self, tries=3, sleep=15, always_raise_on=None):
"""
Get instance connection information.
:type tries: int
:type sleep: int
:type always_raise_on: list[int] | None
:rtype: InstanceConnection
"""
if not self.started:
display.info('Skipping invalid %s/%s instance %s.' % (self.platform, self.version, self.instance_id),
verbosity=1)
return None
if not always_raise_on:
always_raise_on = []
if self.connection and self.connection.running:
return self.connection
while True:
tries -= 1
response = self.client.get(self._uri)
if response.status_code == 200:
break
error = self._create_http_error(response)
if not tries or response.status_code in always_raise_on:
raise error
display.warning('%s. Trying again after %d seconds.' % (error, sleep))
time.sleep(sleep)
if self.args.explain:
self.connection = InstanceConnection(
running=True,
hostname='cloud.example.com',
port=self.port or 12345,
username='username',
password='password' if self.platform == 'windows' else None,
)
else:
response_json = response.json()
status = response_json['status']
con = response_json['connection']
self.connection = InstanceConnection(
running=status == 'running',
hostname=con['hostname'],
port=int(con.get('port', self.port)),
username=con['username'],
password=con.get('password'),
)
if self.connection.password:
display.sensitive.add(str(self.connection.password))
status = 'running' if self.connection.running else 'starting'
display.info('Status update: %s/%s on instance %s is %s.' %
(self.platform, self.version, self.instance_id, status),
verbosity=1)
return self.connection
def wait(self):
"""Wait for the instance to become ready."""
for _ in range(1, 90):
if self.get().running:
return
time.sleep(10)
raise ApplicationError('Timeout waiting for %s/%s instance %s.' %
(self.platform, self.version, self.instance_id))
@property
def _uri(self):
return '%s/%s/%s/%s' % (self.endpoint, self.stage, self.resource, self.instance_id)
def _start(self, auth):
"""Start instance."""
display.info('Initializing new %s/%s instance %s.' % (self.platform, self.version, self.instance_id), verbosity=1)
if self.platform == 'windows':
with open('examples/scripts/ConfigureRemotingForAnsible.ps1', 'rb') as winrm_config_fd:
winrm_config = winrm_config_fd.read().decode('utf-8')
else:
winrm_config = None
data = dict(
config=dict(
platform=self.platform,
version=self.version,
public_key=self.ssh_key.pub_contents if self.ssh_key else None,
query=False,
winrm_config=winrm_config,
)
)
data.update(dict(auth=auth))
headers = {
'Content-Type': 'application/json',
}
response = self._start_try_endpoints(data, headers)
self.started = True
self._save()
display.info('Started %s/%s from: %s' % (self.platform, self.version, self._uri), verbosity=1)
if self.args.explain:
return {}
return response.json()
def _start_try_endpoints(self, data, headers):
"""
:type data: dict[str, any]
:type headers: dict[str, str]
:rtype: HttpResponse
"""
threshold = 1
while threshold <= self.max_threshold:
for self.endpoint in self.endpoints:
try:
return self._start_at_threshold(data, headers, threshold)
except CoreHttpError as ex:
if ex.status == 503:
display.info('Service Unavailable: %s' % ex.remote_message, verbosity=1)
continue
display.error(ex.remote_message)
except HttpError as ex:
display.error(u'%s' % ex)
time.sleep(3)
threshold += 1
raise ApplicationError('Maximum threshold reached and all endpoints exhausted.')
def _start_at_threshold(self, data, headers, threshold):
"""
:type data: dict[str, any]
:type headers: dict[str, str]
:type threshold: int
:rtype: HttpResponse | None
"""
tries = 3
sleep = 15
data['threshold'] = threshold
display.info('Trying endpoint: %s (threshold %d)' % (self.endpoint, threshold), verbosity=1)
while True:
tries -= 1
response = self.client.put(self._uri, data=json.dumps(data), headers=headers)
if response.status_code == 200:
return response
error = self._create_http_error(response)
if response.status_code == 503:
raise error
if not tries:
raise error
display.warning('%s. Trying again after %d seconds.' % (error, sleep))
time.sleep(sleep)
def _clear(self):
"""Clear instance information."""
try:
self.connection = None
os.remove(self.path)
except OSError as ex:
if ex.errno != errno.ENOENT:
raise
def _load(self):
"""Load instance information."""
try:
with open(self.path, 'r') as instance_fd:
data = instance_fd.read()
except IOError as ex:
if ex.errno != errno.ENOENT:
raise
return False
if not data.startswith('{'):
return False # legacy format
config = json.loads(data)
return self.load(config)
def load(self, config):
"""
:type config: dict[str, str]
:rtype: bool
"""
self.instance_id = str(config['instance_id'])
self.endpoint = config['endpoint']
self.started = True
display.sensitive.add(self.instance_id)
return True
def _save(self):
"""Save instance information."""
if self.args.explain:
return
config = self.save()
make_dirs(os.path.dirname(self.path))
with open(self.path, 'w') as instance_fd:
instance_fd.write(json.dumps(config, indent=4, sort_keys=True))
def save(self):
"""
:rtype: dict[str, str]
"""
return dict(
platform_version='%s/%s' % (self.platform, self.version),
instance_id=self.instance_id,
endpoint=self.endpoint,
)
@staticmethod
def _create_http_error(response):
"""
:type response: HttpResponse
:rtype: ApplicationError
"""
response_json = response.json()
stack_trace = ''
if 'message' in response_json:
message = response_json['message']
elif 'errorMessage' in response_json:
message = response_json['errorMessage'].strip()
if 'stackTrace' in response_json:
traceback_lines = response_json['stackTrace']
# AWS Lambda on Python 2.7 returns a list of tuples
# AWS Lambda on Python 3.7 returns a list of strings
if traceback_lines and isinstance(traceback_lines[0], list):
traceback_lines = traceback.format_list(traceback_lines)
trace = '\n'.join([x.rstrip() for x in traceback_lines])
stack_trace = ('\nTraceback (from remote server):\n%s' % trace)
else:
message = str(response_json)
return CoreHttpError(response.status_code, message, stack_trace)
class CoreHttpError(HttpError):
"""HTTP response as an error."""
def __init__(self, status, remote_message, remote_stack_trace):
"""
:type status: int
:type remote_message: str
:type remote_stack_trace: str
"""
super(CoreHttpError, self).__init__(status, '%s%s' % (remote_message, remote_stack_trace))
self.remote_message = remote_message
self.remote_stack_trace = remote_stack_trace
class SshKey:
"""Container for SSH key used to connect to remote instances."""
KEY_NAME = 'id_rsa'
PUB_NAME = 'id_rsa.pub'
def __init__(self, args):
"""
:type args: EnvironmentConfig
"""
cache_dir = 'test/cache'
self.key = os.path.join(cache_dir, self.KEY_NAME)
self.pub = os.path.join(cache_dir, self.PUB_NAME)
if not os.path.isfile(self.key) or not os.path.isfile(self.pub):
base_dir = os.path.expanduser('~/.ansible/test/')
key = os.path.join(base_dir, self.KEY_NAME)
pub = os.path.join(base_dir, self.PUB_NAME)
if not args.explain:
make_dirs(base_dir)
if not os.path.isfile(key) or not os.path.isfile(pub):
run_command(args, ['ssh-keygen', '-m', 'PEM', '-q', '-t', 'rsa', '-N', '', '-f', key])
if not args.explain:
shutil.copy2(key, self.key)
shutil.copy2(pub, self.pub)
if args.explain:
self.pub_contents = None
else:
with open(self.pub, 'r') as pub_fd:
self.pub_contents = pub_fd.read().strip()
class InstanceConnection:
"""Container for remote instance status and connection details."""
def __init__(self, running, hostname, port, username, password):
"""
:type running: bool
:type hostname: str
:type port: int
:type username: str
:type password: str | None
"""
self.running = running
self.hostname = hostname
self.port = port
self.username = username
self.password = password
def __str__(self):
if self.password:
return '%s:%s [%s:%s]' % (self.hostname, self.port, self.username, self.password)
return '%s:%s [%s]' % (self.hostname, self.port, self.username)