ec2_asg: fix #28087 and #35993 (#36679)

Fixes #35993 - Changes to update_size in commit eb4cc31 made it so the group dict passed into update_size was not modified. As a result, the 'replace' call does not see an updated min_size like it previously did and doesn't pause to wait for any new instances to spin up. Instead, it moves straight into terminating old instances. Fix is to add batch_size to min_size when calling wait_for_new_inst. Fixes #28087 - Make replace_all_instances and replace_instances behave exactly the same by setting replace_instances = current list of instances when replace_all_instances used. Root cause of issue was that without lc_check terminate_batch will terminate all instances passed to it and after updating the asg size we were querying the asg again for the list of instances - so terminate batch saw the list including new ones just spun up. When creating new asg with replace_all_instances: yes and lc_check: false the instances that are initially created are then subsequently replaced. This change makes it so replace only occurs if the asg already existed. Add integration tests for #28087 and #35993.
8 years ago · a2b3120e85
parent ce416f247f
commit a2b3120e85
2 changed files with 132 additions and 2 deletions
--- a/lib/ansible/modules/cloud/amazon/ec2_asg.py
+++ b/lib/ansible/modules/cloud/amazon/ec2_asg.py
@ -1191,6 +1191,7 @@ def replace(connection):
    desired_capacity = module.params.get('desired_capacity')
    lc_check = module.params.get('lc_check')
    replace_instances = module.params.get('replace_instances')
    replace_all_instances = module.params.get('replace_all_instances')
    as_group = describe_autoscaling_groups(connection, group_name)[0]
    if desired_capacity is None:
@ -1199,6 +1200,10 @@ def replace(connection):
    wait_for_new_inst(connection, group_name, wait_timeout, as_group['MinSize'], 'viable_instances')
    props = get_properties(as_group)
    instances = props['instances']
    if replace_all_instances:
        # If replacing all instances, then set replace_instances to current set
        # This allows replace_instances and replace_all_instances to behave same
        replace_instances = instances
    if replace_instances:
        instances = replace_instances
    # check to see if instances are replaceable if checking launch configs
@ -1235,7 +1240,7 @@ def replace(connection):
    as_group = describe_autoscaling_groups(connection, group_name)[0]
    update_size(connection, as_group, max_size + batch_size, min_size + batch_size, desired_capacity + batch_size)
-    wait_for_new_inst(connection, group_name, wait_timeout, as_group['MinSize'], 'viable_instances')
+    wait_for_new_inst(connection, group_name, wait_timeout, as_group['MinSize'] + batch_size, 'viable_instances')
    wait_for_elb(connection, group_name)
    wait_for_target_group(connection, group_name)
    as_group = describe_autoscaling_groups(connection, group_name)[0]
@ -1414,6 +1419,12 @@ def wait_for_new_inst(connection, group_name, wait_timeout, desired_size, prop):
    return props
 def asg_exists(connection):
    group_name = module.params.get('name')
    as_group = describe_autoscaling_groups(connection, group_name)
    return bool(len(as_group))
 def main():
    argument_spec = ec2_argument_spec()
    argument_spec.update(
@ -1483,13 +1494,16 @@ def main():
                            endpoint=ec2_url,
                            **aws_connect_params)
    changed = create_changed = replace_changed = False
    exists = asg_exists(connection)
    if state == 'present':
        create_changed, asg_properties = create_autoscaling_group(connection)
    elif state == 'absent':
        changed = delete_autoscaling_group(connection)
        module.exit_json(changed=changed)
-    if replace_all_instances or replace_instances:
+
    # Only replace instances if asg existed at start of call
    if exists and (replace_all_instances or replace_instances):
        replace_changed, asg_properties = replace(connection)
    if create_changed or replace_changed:
        changed = True
--- a/test/integration/targets/ec2_asg/tasks/main.yml
+++ b/test/integration/targets/ec2_asg/tasks/main.yml
@ -385,6 +385,122 @@
        that:
          - "output.viable_instances == 3"
    # ============================================================
    # perform rolling replace with new launch configuration and lc_check:false
    # Note - this is done async so we can query asg_facts during
    # the execution. Issues #28087 and #35993 result in correct
    # end result, but spin up extraneous instances during execution.
    - name: "perform rolling update to new AMI with lc_check: false"
      ec2_asg:
        name: "{{ resource_prefix }}-asg"
        launch_config_name: "{{ resource_prefix }}-lc-2"
        health_check_type: EC2
        desired_capacity: 3
        min_size: 1
        max_size: 5
        health_check_period: 900
        load_balancers: []
        vpc_zone_identifier: "{{ testing_subnet.subnet.id }}"
        wait_for_instances: yes
        replace_all_instances: yes
        replace_batch_size: 3
        lc_check: false
        wait_timeout: 1800
        state: present
        <<: *aws_connection_info
      async: 1800
      poll: 0
      register: asg_job
    - name: get ec2_asg facts for 3 minutes
      ec2_asg_facts:
        name: "{{ resource_prefix }}-asg"
        <<: *aws_connection_info
      register: output
      loop_control:
          pause: 15
      with_sequence: count=12
    - set_fact:
        inst_id_json_query: 'results[*].results[*].instances[*].instance_id'
    # Since we started with 3 servers and replace all of them.
    # We should see 6 servers total.
    - assert:
        that:
          - "lookup('flattened',output|json_query(inst_id_json_query)).split(',')|unique|length == 6"
    - name: Ensure ec2_asg task completes
      async_status: jid="{{ asg_job.ansible_job_id }}"
      register: status
      until: status.finished
      retries: 200
      delay: 15
    # ============================================================
    - name: kill asg
      ec2_asg:
        name: "{{ resource_prefix }}-asg"
        state: absent
        <<: *aws_connection_info
      async: 300
    # Create new asg with replace_all_instances and lc_check:false
    # Note - this is done async so we can query asg_facts during
    # the execution. Issues #28087 results in correct
    # end result, but spin up extraneous instances during execution.
    - name: "new asg with lc_check: false"
      ec2_asg:
        name: "{{ resource_prefix }}-asg"
        launch_config_name: "{{ resource_prefix }}-lc"
        health_check_type: EC2
        desired_capacity: 3
        min_size: 1
        max_size: 5
        health_check_period: 900
        load_balancers: []
        vpc_zone_identifier: "{{ testing_subnet.subnet.id }}"
        wait_for_instances: yes
        replace_all_instances: yes
        replace_batch_size: 3
        lc_check: false
        wait_timeout: 1800
        state: present
        <<: *aws_connection_info
      async: 1800
      poll: 0
      register: asg_job
    # Collect ec2_asg_facts for 3 minutes
    - name: get ec2_asg facts
      ec2_asg_facts:
        name: "{{ resource_prefix }}-asg"
        <<: *aws_connection_info
      register: output
      loop_control:
          pause: 15
      with_sequence: count=12
    - set_fact:
        inst_id_json_query: 'results[*].results[*].instances[*].instance_id'
    # Get all instance_ids we saw and assert we saw number expected
    # Should only see 3 (don't replace instances we just created)
    - assert:
        that:
          - "lookup('flattened',output|json_query(inst_id_json_query)).split(',')|unique|length == 3"
    - name: Ensure ec2_asg task completes
      async_status: jid="{{ asg_job.ansible_job_id }}"
      register: status
      until: status.finished
      retries: 200
      delay: 15
 # ============================================================
  always: