""" Certbot Integration Test Tool - Launches EC2 instances with a given list of AMIs for different distros - Copies certbot repo and puts it on the instances - Runs certbot tests (bash scripts) on all of these - Logs execution and success/fail for debugging Notes: - Some AWS images, e.g. official CentOS and FreeBSD images require acceptance of user terms on the AWS marketplace website. This can't be automated. - AWS EC2 has a default limit of 20 t2/t1 instances, if more are needed, they need to be requested via online webform. Usage: - Requires AWS IAM secrets to be set up with aws cli - Requires an AWS associated keyfile .pem >aws configure --profile HappyHacker [interactive: enter secrets for IAM role] >aws ec2 create-key-pair --profile HappyHacker --key-name MyKeyPair \ --query 'KeyMaterial' --output text > MyKeyPair.pem then: >python multitester.py targets.yaml MyKeyPair.pem HappyHacker scripts/test_leauto_upgrades.sh see: https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html https://docs.aws.amazon.com/cli/latest/userguide/cli-ec2-keypairs.html """ import argparse import multiprocessing as mp from multiprocessing import Manager import os import socket import sys import time import traceback import urllib.error as urllib_error import urllib.request as urllib_request import boto3 from botocore.exceptions import ClientError import yaml from fabric import Config from fabric import Connection # Command line parser #------------------------------------------------------------------------------- parser = argparse.ArgumentParser(description='Builds EC2 cluster for testing.') parser.add_argument('config_file', help='yaml configuration file for AWS server cluster') parser.add_argument('key_file', help='key file (.pem) for AWS') parser.add_argument('aws_profile', help='profile for AWS (i.e. as in ~/.aws/certificates)') parser.add_argument('test_script', default='test_letsencrypt_auto_certonly_standalone.sh', help='path of bash script in to deploy and run') parser.add_argument('--repo', default='https://github.com/letsencrypt/letsencrypt.git', help='certbot git repo to use') parser.add_argument('--branch', default='~', help='certbot git branch to trial') parser.add_argument('--pull_request', default='~', help='letsencrypt/letsencrypt pull request to trial') parser.add_argument('--merge_master', action='store_true', help="if set merges PR into master branch of letsencrypt/letsencrypt") parser.add_argument('--saveinstances', action='store_true', help="don't kill EC2 instances after run, useful for debugging") parser.add_argument('--alt_pip', default='', help="server from which to pull candidate release packages") cl_args = parser.parse_args() # Credential Variables #------------------------------------------------------------------------------- # assumes naming: = .pem KEYFILE = cl_args.key_file KEYNAME = os.path.split(cl_args.key_file)[1].split('.pem')[0] PROFILE = None if cl_args.aws_profile == 'SET_BY_ENV' else cl_args.aws_profile # Globals #------------------------------------------------------------------------------- SECURITY_GROUP_NAME = 'certbot-security-group' SENTINEL = None #queue kill signal SUBNET_NAME = 'certbot-subnet' class Status: """Possible statuses of client tests.""" PASS = 'pass' FAIL = 'fail' # Boto3/AWS automation functions #------------------------------------------------------------------------------- def should_use_subnet(subnet): """Should we use the given subnet for these tests? We should if it is the default subnet for the availability zone or the subnet is named "certbot-subnet". """ if not subnet.map_public_ip_on_launch: return False if subnet.default_for_az: return True for tag in subnet.tags: if tag['Key'] == 'Name' and tag['Value'] == SUBNET_NAME: return True return False def make_security_group(vpc): """Creates a security group in the given VPC.""" # will fail if security group of GroupName already exists # cannot have duplicate SGs of the same name mysg = vpc.create_security_group(GroupName=SECURITY_GROUP_NAME, Description='security group for automated testing') mysg.authorize_ingress(IpProtocol="tcp", CidrIp="0.0.0.0/0", FromPort=22, ToPort=22) # for mosh mysg.authorize_ingress(IpProtocol="udp", CidrIp="0.0.0.0/0", FromPort=60000, ToPort=61000) return mysg def make_instance(ec2_client, instance_name, ami_id, keyname, security_group_id, subnet_id, self_destruct, machine_type='t2.micro'): """Creates an instance using the given parameters. If self_destruct is True, the instance will be configured to shutdown after 1 hour and to terminate itself on shutdown. """ block_device_mappings = _get_block_device_mappings(ec2_client, ami_id) tags = [{'Key': 'Name', 'Value': instance_name}] tag_spec = [{'ResourceType': 'instance', 'Tags': tags}] kwargs = { 'BlockDeviceMappings': block_device_mappings, 'ImageId': ami_id, 'SecurityGroupIds': [security_group_id], 'SubnetId': subnet_id, 'KeyName': keyname, 'MinCount': 1, 'MaxCount': 1, 'InstanceType': machine_type, 'TagSpecifications': tag_spec } if self_destruct: kwargs['InstanceInitiatedShutdownBehavior'] = 'terminate' kwargs['UserData'] = '#!/bin/bash\nshutdown -P +60\n' return ec2_client.create_instances(**kwargs)[0] def _get_block_device_mappings(ec2_client, ami_id): """Returns the list of block device mappings to ensure cleanup. This list sets connected EBS volumes to be deleted when the EC2 instance is terminated. """ # Not all devices use EBS, but the default value for DeleteOnTermination # when the device does use EBS is true. See: # * https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-blockdev-mapping.html # * https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-blockdev-template.html return [{'DeviceName': mapping['DeviceName'], 'Ebs': {'DeleteOnTermination': True}} for mapping in ec2_client.Image(ami_id).block_device_mappings if not mapping.get('Ebs', {}).get('DeleteOnTermination', True)] # Helper Routines #------------------------------------------------------------------------------- def block_until_ssh_open(ipstring, wait_time=10, timeout=120): "Blocks until server at ipstring has an open port 22" reached = False t_elapsed = 0 while not reached and t_elapsed < timeout: try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect((ipstring, 22)) reached = True except socket.error as err: time.sleep(wait_time) t_elapsed += wait_time sock.close() def block_until_instance_ready(booting_instance, extra_wait_time=20): "Blocks booting_instance until AWS EC2 instance is ready to accept SSH connections" booting_instance.wait_until_running() # The instance needs to be reloaded to update its local attributes. See # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Instance.reload. booting_instance.reload() # After waiting for the instance to be running and reloading the instance # state, we should have an IP address. assert booting_instance.public_ip_address is not None block_until_ssh_open(booting_instance.public_ip_address) time.sleep(extra_wait_time) return booting_instance # Fabric Routines #------------------------------------------------------------------------------- def local_git_clone(local_cxn, repo_url, log_dir): """clones master of repo_url""" local_cxn.local('cd %s && if [ -d letsencrypt ]; then rm -rf letsencrypt; fi' % log_dir) local_cxn.local('cd %s && git clone %s letsencrypt'% (log_dir, repo_url)) local_cxn.local('cd %s && tar czf le.tar.gz letsencrypt'% log_dir) def local_git_branch(local_cxn, repo_url, branch_name, log_dir): """clones branch of repo_url""" local_cxn.local('cd %s && if [ -d letsencrypt ]; then rm -rf letsencrypt; fi' % log_dir) local_cxn.local('cd %s && git clone %s letsencrypt --branch %s --single-branch'% (log_dir, repo_url, branch_name)) local_cxn.local('cd %s && tar czf le.tar.gz letsencrypt' % log_dir) def local_git_PR(local_cxn, repo_url, PRnumstr, log_dir, merge_master=True): """clones specified pull request from repo_url and optionally merges into master""" local_cxn.local('cd %s && if [ -d letsencrypt ]; then rm -rf letsencrypt; fi' % log_dir) local_cxn.local('cd %s && git clone %s letsencrypt' % (log_dir, repo_url)) local_cxn.local('cd %s && cd letsencrypt && ' 'git fetch origin pull/%s/head:lePRtest' % (log_dir, PRnumstr)) local_cxn.local('cd %s && cd letsencrypt && git checkout lePRtest' % log_dir) if merge_master: local_cxn.local('cd %s && cd letsencrypt && git remote update origin' % log_dir) local_cxn.local('cd %s && cd letsencrypt && ' 'git merge origin/master -m "testmerge"' % log_dir) local_cxn.local('cd %s && tar czf le.tar.gz letsencrypt' % log_dir) def local_repo_to_remote(cxn, log_dir): """copies local tarball of repo to remote""" filename = 'le.tar.gz' local_path = os.path.join(log_dir, filename) cxn.put(local=local_path, remote='') cxn.run('tar xzf %s' % filename) def local_repo_clean(local_cxn, log_dir): """delete tarball""" filename = 'le.tar.gz' local_path = os.path.join(log_dir, filename) local_cxn.local('rm %s' % local_path) def deploy_script(cxn, scriptpath, *args): """copies to remote and executes local script""" cxn.put(local=scriptpath, remote='', preserve_mode=True) scriptfile = os.path.split(scriptpath)[1] args_str = ' '.join(args) cxn.run('./'+scriptfile+' '+args_str) def install_and_launch_certbot(cxn, instance, target, log_dir): local_repo_to_remote(cxn, log_dir) # This needs to be like this, I promise. 1) The env argument to run doesn't work. # See https://github.com/fabric/fabric/issues/1744. 2) prefix() sticks an && between # the commands, so it needs to be exports rather than no &&s in between for the script subshell. with cxn.prefix('export PUBLIC_IP=%s && export PRIVATE_IP=%s && ' 'export PUBLIC_HOSTNAME=%s && export PIP_EXTRA_INDEX_URL=%s && ' 'export OS_TYPE=%s' % (instance.public_ip_address, instance.private_ip_address, instance.public_dns_name, cl_args.alt_pip, target['type'])): deploy_script(cxn, cl_args.test_script) def grab_certbot_log(cxn): "grabs letsencrypt.log via cat into logged stdout" cxn.sudo('/bin/bash -l -i -c \'if [ -f "/var/log/letsencrypt/letsencrypt.log" ]; then ' + 'cat "/var/log/letsencrypt/letsencrypt.log"; else echo "[novarlog]"; fi\'') # fallback file if /var/log is unwriteable...? correct? cxn.sudo('/bin/bash -l -i -c \'if [ -f ./certbot.log ]; then ' + 'cat ./certbot.log; else echo "[nolocallog]"; fi\'') def create_client_instance(ec2_client, target, security_group_id, subnet_id, self_destruct): """Create a single client instance for running tests.""" if 'machine_type' in target: machine_type = target['machine_type'] elif target['virt'] == 'hvm': machine_type = 't2.medium' else: # 32 bit systems machine_type = 'c1.medium' name = 'le-%s'%target['name'] print(name, end=" ") return make_instance(ec2_client, name, target['ami'], KEYNAME, machine_type=machine_type, security_group_id=security_group_id, subnet_id=subnet_id, self_destruct=self_destruct) def test_client_process(fab_config, inqueue, outqueue, log_dir): cur_proc = mp.current_process() for inreq in iter(inqueue.get, SENTINEL): ii, instance_id, target = inreq # Each client process is given its own session due to the suggestion at # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html?highlight=multithreading#multithreading-multiprocessing. aws_session = boto3.session.Session(profile_name=PROFILE) ec2_client = aws_session.resource('ec2') instance = ec2_client.Instance(id=instance_id) #save all stdout to log file sys.stdout = open(log_dir+'/'+'%d_%s.log'%(ii,target['name']), 'w') print("[%s : client %d %s %s]" % (cur_proc.name, ii, target['ami'], target['name'])) instance = block_until_instance_ready(instance) print("server %s at %s"%(instance, instance.public_ip_address)) host_string = "%s@%s"%(target['user'], instance.public_ip_address) print(host_string) with Connection(host_string, config=fab_config) as cxn: try: install_and_launch_certbot(cxn, instance, target, log_dir) outqueue.put((ii, target, Status.PASS)) print("%s - %s SUCCESS"%(target['ami'], target['name'])) except: outqueue.put((ii, target, Status.FAIL)) print("%s - %s FAIL"%(target['ami'], target['name'])) traceback.print_exc(file=sys.stdout) pass # append server certbot.log to each per-machine output log print("\n\ncertbot.log\n" + "-"*80 + "\n") try: grab_certbot_log(cxn) except: print("log fail\n") traceback.print_exc(file=sys.stdout) pass def cleanup(cl_args, instances, targetlist, log_dir): print('Logs in ', log_dir) # If lengths of instances and targetlist aren't equal, instances failed to # start before running tests so leaving instances running for debugging # isn't very useful. Let's cleanup after ourselves instead. if len(instances) != len(targetlist) or not cl_args.saveinstances: print('Terminating EC2 Instances') for instance in instances: instance.terminate() else: # print login information for the boxes for debugging for ii, target in enumerate(targetlist): print(target['name'], target['ami'], "%s@%s"%(target['user'], instances[ii].public_ip_address)) def main(): # Fabric library controlled through global env parameters fab_config = Config(overrides={ "connect_kwargs": { "key_filename": [KEYFILE], # https://github.com/fabric/fabric/issues/2007 }, "run": { "echo": True, "pty": True, }, "timeouts": { "connect": 10, }, }) # no network connection, so don't worry about closing this one. local_cxn = Connection('localhost', config=fab_config) # Set up local copy of git repo #------------------------------------------------------------------------------- log_dir = "letest-%d"%int(time.time()) #points to logging / working directory print("Making local dir for test repo and logs: %s"%log_dir) local_cxn.local('mkdir %s'%log_dir) try: # figure out what git object to test and locally create it in log_dir print("Making local git repo") if cl_args.pull_request != '~': print('Testing PR %s ' % cl_args.pull_request, "MERGING into master" if cl_args.merge_master else "") local_git_PR(local_cxn, cl_args.repo, cl_args.pull_request, log_dir, cl_args.merge_master) elif cl_args.branch != '~': print('Testing branch %s of %s' % (cl_args.branch, cl_args.repo)) local_git_branch(local_cxn, cl_args.repo, cl_args.branch, log_dir) else: print('Testing current branch of %s' % cl_args.repo, log_dir) local_git_clone(local_cxn, cl_args.repo, log_dir) except BaseException: print("FAIL: trouble with git repo") traceback.print_exc() exit(1) # Set up EC2 instances #------------------------------------------------------------------------------- configdata = yaml.safe_load(open(cl_args.config_file, 'r')) targetlist = configdata['targets'] print('Testing against these images: [%d total]'%len(targetlist)) for target in targetlist: print(target['ami'], target['name']) print("Connecting to EC2 using\n profile %s\n keyname %s\n keyfile %s"%(PROFILE, KEYNAME, KEYFILE)) aws_session = boto3.session.Session(profile_name=PROFILE) ec2_client = aws_session.resource('ec2') print("Determining Subnet") for subnet in ec2_client.subnets.all(): if should_use_subnet(subnet): subnet_id = subnet.id vpc_id = subnet.vpc.id break else: print("No usable subnet exists!") print("Please create a VPC with a subnet named {0}".format(SUBNET_NAME)) print("that maps public IPv4 addresses to instances launched in the subnet.") sys.exit(1) print("Making Security Group") vpc = ec2_client.Vpc(vpc_id) sg_exists = False for sg in vpc.security_groups.all(): if sg.group_name == SECURITY_GROUP_NAME: security_group_id = sg.id sg_exists = True print(" %s already exists"%SECURITY_GROUP_NAME) if not sg_exists: security_group_id = make_security_group(vpc).id time.sleep(30) instances = [] try: print("Creating instances: ", end="") # If we want to preserve instances, do not have them self-destruct. self_destruct = not cl_args.saveinstances for target in targetlist: instances.append( create_client_instance(ec2_client, target, security_group_id, subnet_id, self_destruct) ) print() # Install and launch client scripts in parallel #------------------------------------------------------------------------------- print("Uploading and running test script in parallel: %s"%cl_args.test_script) print("Output routed to log files in %s"%log_dir) # (Advice: always use Manager.Queue, never regular multiprocessing.Queue # the latter has implementation flaws that deadlock it in some circumstances) manager = Manager() outqueue = manager.Queue() inqueue = manager.Queue() # launch as many processes as clients to test num_processes = len(targetlist) jobs = [] #keep a reference to current procs # initiate process execution client_process_args=(fab_config, inqueue, outqueue, log_dir) for i in range(num_processes): p = mp.Process(target=test_client_process, args=client_process_args) jobs.append(p) p.daemon = True # kills subprocesses if parent is killed p.start() # fill up work queue for ii, target in enumerate(targetlist): inqueue.put((ii, instances[ii].id, target)) # add SENTINELs to end client processes for i in range(num_processes): inqueue.put(SENTINEL) print('Waiting on client processes', end='') for p in jobs: while p.is_alive(): p.join(5 * 60) # Regularly print output to keep Travis happy print('.', end='') sys.stdout.flush() print() # add SENTINEL to output queue outqueue.put(SENTINEL) # clean up local_repo_clean(local_cxn, log_dir) # print and save summary results results_file = open(log_dir+'/results', 'w') outputs = [outq for outq in iter(outqueue.get, SENTINEL)] outputs.sort(key=lambda x: x[0]) failed = False results_msg = "" for outq in outputs: ii, target, status = outq if status == Status.FAIL: failed = True with open(log_dir+'/'+'%d_%s.log'%(ii,target['name']), 'r') as f: print(target['name'] + " test failed. Test log:") print(f.read()) results_msg = results_msg + '%d %s %s\n'%(ii, target['name'], status) results_file.write('%d %s %s\n'%(ii, target['name'], status)) print(results_msg) if len(outputs) != num_processes: failed = True failure_message = 'FAILURE: Some target machines failed to run and were not tested. ' +\ 'Tests should be rerun.' print(failure_message) results_file.write(failure_message + '\n') results_file.close() if failed: sys.exit(1) finally: cleanup(cl_args, instances, targetlist, log_dir) if __name__ == '__main__': main()