certbot/tests/letstest/multitester.py

594 lines
24 KiB
Python
Raw Permalink Normal View History

2015-12-04 09:32:24 -05:00
"""
Certbot Integration Test Tool

- Configures (canned) boulder server
- Launches EC2 instances with a given list of AMIs for different distros
- Copies certbot repo and puts it on the instances
- Runs certbot tests (bash scripts) on all of these
- Logs execution and success/fail for debugging

Notes:
  - Some AWS images, e.g. official CentOS and FreeBSD images,
    require acceptance of user terms on the AWS marketplace
    website. This can't be automated.
  - AWS EC2 has a default limit of 20 t2/t1 instances; if more
    are needed, they need to be requested via online webform.

Usage:
  - Requires AWS IAM secrets to be set up with aws cli
  - Requires an AWS associated keyfile <keyname>.pem

>aws configure --profile HappyHacker
[interactive: enter secrets for IAM role]
>aws ec2 create-key-pair --profile HappyHacker --key-name MyKeyPair \
 --query 'KeyMaterial' --output text > MyKeyPair.pem
then:
>python multitester.py targets.yaml MyKeyPair.pem HappyHacker scripts/test_leauto_upgrades.sh

see:
  https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html
  https://docs.aws.amazon.com/cli/latest/userguide/cli-ec2-keypairs.html
"""
from __future__ import print_function
from __future__ import with_statement
import sys, os, time, argparse, socket, traceback
2015-12-04 09:32:24 -05:00
import multiprocessing as mp
from multiprocessing import Manager
import urllib2
import yaml
import boto3
from botocore.exceptions import ClientError
2015-12-04 09:32:24 -05:00
import fabric
from fabric.api import run, execute, local, env, sudo, cd, lcd
from fabric.operations import get, put
from fabric.context_managers import shell_env
# Command line parser
#-------------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Builds EC2 cluster for testing.')
parser.add_argument('config_file',
                    help='yaml configuration file for AWS server cluster')
parser.add_argument('key_file',
                    help='key file (<keyname>.pem) for AWS')
parser.add_argument('aws_profile',
                    help='profile for AWS (i.e. as in ~/.aws/credentials)')
# nargs='?' is required for the default to take effect: argparse ignores
# `default` on a positional argument that must always be supplied.
parser.add_argument('test_script',
                    nargs='?',
                    default='test_letsencrypt_auto_certonly_standalone.sh',
                    help='path of bash script to deploy and run')
#parser.add_argument('--script_args',
#                    nargs='+',
#                    help='space-delimited list of arguments to pass to the bash test script',
#                    required=False)
parser.add_argument('--repo',
                    default='https://github.com/letsencrypt/letsencrypt.git',
                    help='certbot git repo to use')
# '~' is the "not given" marker for --branch and --pull_request
parser.add_argument('--branch',
                    default='~',
                    help='certbot git branch to trial')
parser.add_argument('--pull_request',
                    default='~',
                    help='letsencrypt/letsencrypt pull request to trial')
parser.add_argument('--merge_master',
                    action='store_true',
                    help="if set merges PR into master branch of letsencrypt/letsencrypt")
parser.add_argument('--saveinstances',
                    action='store_true',
                    help="don't kill EC2 instances after run, useful for debugging")
parser.add_argument('--alt_pip',
                    default='',
                    help="server from which to pull candidate release packages")
parser.add_argument('--killboulder',
                    action='store_true',
                    help="do not leave a persistent boulder server running")
parser.add_argument('--boulderonly',
                    action='store_true',
                    help="only make a boulder server")
parser.add_argument('--fast',
                    action='store_true',
                    help="use larger instance types to run faster (saves about a minute, probably not worth it)")
# Parsed at import time; the rest of the module reads options from cl_args.
cl_args = parser.parse_args()
# Credential Variables
#-------------------------------------------------------------------------------
# The AWS key pair name is taken from the key file name: <key_filename> = <keyname>.pem
KEYFILE = cl_args.key_file
KEYNAME = os.path.basename(KEYFILE).split('.pem')[0]
# The special profile value 'SET_BY_ENV' means "pass no profile name to boto3".
PROFILE = cl_args.aws_profile if cl_args.aws_profile != 'SET_BY_ENV' else None
2015-12-04 09:32:24 -05:00
# Globals
#-------------------------------------------------------------------------------
# premade shared boulder AMI 18.04LTS us-east-1
BOULDER_AMI = 'ami-072a9534772bec854'
# logging / working directory, made unique per run by the start timestamp
LOGDIR = "letest-%d" % (int(time.time()),)
SECURITY_GROUP_NAME = 'certbot-security-group'
# queue kill signal
SENTINEL = None
SUBNET_NAME = 'certbot-subnet'
2015-12-04 09:32:24 -05:00
class Status(object):
    """Outcome labels a client test run can report."""

    PASS, FAIL = 'pass', 'fail'
2015-12-04 09:32:24 -05:00
# Boto3/AWS automation functions
#-------------------------------------------------------------------------------
def should_use_subnet(subnet):
    """Should we use the given subnet for these tests?

    A subnet is usable only if it maps public IPs on launch and, in
    addition, is either the default subnet for its availability zone or
    carries a "Name" tag equal to SUBNET_NAME ("certbot-subnet").

    subnet -- object exposing map_public_ip_on_launch, default_for_az and
              tags (a list of {'Key': ..., 'Value': ...} dicts, or None)

    Returns True if the subnet should be used, False otherwise.
    """
    if not subnet.map_public_ip_on_launch:
        return False
    if subnet.default_for_az:
        return True
    # An untagged subnet reports tags as None rather than an empty list,
    # which must not be iterated directly.
    for tag in subnet.tags or []:
        if tag['Key'] == 'Name' and tag['Value'] == SUBNET_NAME:
            return True
    return False
def make_security_group(vpc):
    """Create the test security group in the given VPC and open its ports.

    The group is named SECURITY_GROUP_NAME. Per the original note, creation
    fails if a group of that name already exists, because duplicate group
    names are not allowed.

    Returns the newly created security group object.
    """
    group = vpc.create_security_group(
        GroupName=SECURITY_GROUP_NAME,
        Description='security group for automated testing')
    # (protocol, first port, last port), opened to the world in this order
    ingress_rules = (
        ("tcp", 22, 22),
        ("tcp", 80, 80),
        ("tcp", 443, 443),
        ("tcp", 4000, 4000),    # boulder wfe (http) server
        ("udp", 60000, 61000),  # mosh
    )
    for protocol, first_port, last_port in ingress_rules:
        group.authorize_ingress(IpProtocol=protocol, CidrIp="0.0.0.0/0",
                                FromPort=first_port, ToPort=last_port)
    return group
def make_instance(ec2_client,
                  instance_name,
                  ami_id,
                  keyname,
                  security_group_id,
                  subnet_id,
                  machine_type='t2.micro',
                  userdata=""):
    """Launch exactly one EC2 instance and return it.

    ec2_client        -- EC2 resource used to create the instance
    instance_name     -- value of the instance's "Name" tag
    ami_id            -- image to boot
    keyname           -- name of the key pair to install
    security_group_id -- security group to attach
    subnet_id         -- subnet to launch into
    machine_type      -- instance type
    userdata          -- bash or cloud-init script handed to the instance
    """
    launch_request = {
        'BlockDeviceMappings': _get_block_device_mappings(ec2_client, ami_id),
        'ImageId': ami_id,
        'SecurityGroupIds': [security_group_id],
        'SubnetId': subnet_id,
        'KeyName': keyname,
        'MinCount': 1,
        'MaxCount': 1,
        'UserData': userdata,
        'InstanceType': machine_type,
        'TagSpecifications': [{
            'ResourceType': 'instance',
            'Tags': [{'Key': 'Name', 'Value': instance_name}],
        }],
    }
    created = ec2_client.create_instances(**launch_request)
    # MinCount == MaxCount == 1, so the single launched instance is first
    return created[0]
def _get_block_device_mappings(ec2_client, ami_id):
    """Return block device mapping overrides that ensure volume cleanup.

    For every device of the image whose EBS settings explicitly disable
    DeleteOnTermination, the result holds an entry switching it on, so the
    attached EBS volumes are deleted when the EC2 instance is terminated.
    """
    # Not all devices use EBS, but the default value for DeleteOnTermination
    # when the device does use EBS is true, so those devices need no entry.
    # See:
    #  * https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-blockdev-mapping.html
    #  * https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-blockdev-template.html
    overrides = []
    for device in ec2_client.Image(ami_id).block_device_mappings:
        ebs_settings = device.get('Ebs', {})
        if ebs_settings.get('DeleteOnTermination', True):
            continue
        overrides.append({'DeviceName': device['DeviceName'],
                          'Ebs': {'DeleteOnTermination': True}})
    return overrides
2015-12-04 09:32:24 -05:00
# Helper Routines
#-------------------------------------------------------------------------------
def block_until_http_ready(urlstring, wait_time=10, timeout=240):
    """Block until the server at urlstring can respond to http requests.

    One request is attempted per round, followed by a pause of wait_time
    seconds. The function returns (without raising) once a request succeeds
    or once the accumulated pauses reach timeout seconds. A '.' is written
    to stdout for every attempt.
    """
    server_ready = False
    t_elapsed = 0
    while not server_ready and t_elapsed < timeout:
        sys.stdout.write('.')
        sys.stdout.flush()
        try:
            response = urllib2.urlopen(urllib2.Request(urlstring))
        except urllib2.URLError:
            # server not answering yet; try again after the pause
            pass
        else:
            # Any response that does not raise counts as "ready". Close it
            # so the underlying connection is not left open.
            response.close()
            server_ready = True
        time.sleep(wait_time)
        t_elapsed += wait_time
def block_until_ssh_open(ipstring, wait_time=10, timeout=120):
    """Block until the server at ipstring has an open port 22.

    A TCP connection to port 22 is attempted; after each failure the
    function sleeps wait_time seconds. It returns (without raising) once a
    connection succeeds or once the accumulated sleeps reach timeout
    seconds. With timeout <= 0 no attempt is made at all.
    """
    reached = False
    t_elapsed = 0
    while not reached and t_elapsed < timeout:
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.connect((ipstring, 22))
            reached = True
        except socket.error:
            pass
        finally:
            # Each attempt uses its own socket, so each one must be closed
            # here rather than only the last one after the loop.
            if sock is not None:
                sock.close()
        if not reached:
            time.sleep(wait_time)
            t_elapsed += wait_time
def block_until_instance_ready(booting_instance, wait_time=5, extra_wait_time=20):
    """Block until the EC2 instance is ready to accept SSH connections.

    Polls every wait_time seconds until the instance reports state
    'running' and has a public IP address, then waits for port 22 to open
    and finally sleeps extra_wait_time seconds more.

    Returns the same (refreshed) instance object.
    """
    while True:
        state = booting_instance.state['Name']
        ip = booting_instance.public_ip_address
        if state == 'running' and ip is not None:
            break
        time.sleep(wait_time)
        # The instance needs to be reloaded to update its local attributes. See
        # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Instance.reload.
        booting_instance.reload()
    block_until_ssh_open(ip)
    time.sleep(extra_wait_time)
    return booting_instance
2015-12-04 09:32:24 -05:00
# Fabric Routines
#-------------------------------------------------------------------------------
def local_git_clone(repo_url):
    """Clone repo_url into LOGDIR/letsencrypt and pack it as le.tar.gz.

    No branch is requested, so the clone holds whatever the remote checks
    out by default. Any previous checkout is removed first.
    """
    with lcd(LOGDIR):
        remove_stale = 'if [ -d letsencrypt ]; then rm -rf letsencrypt; fi'
        local(remove_stale)
        clone_command = 'git clone %s letsencrypt'% repo_url
        local(clone_command)
        local('tar czf le.tar.gz letsencrypt')
def local_git_branch(repo_url, branch_name):
    """Clone only branch branch_name of repo_url and pack it as le.tar.gz.

    The checkout lives in LOGDIR/letsencrypt; any previous checkout is
    removed first.
    """
    with lcd(LOGDIR):
        remove_stale = 'if [ -d letsencrypt ]; then rm -rf letsencrypt; fi'
        local(remove_stale)
        clone_command = ('git clone %s letsencrypt --branch %s --single-branch'
                         % (repo_url, branch_name))
        local(clone_command)
        local('tar czf le.tar.gz letsencrypt')
def local_git_PR(repo_url, PRnumstr, merge_master=True):
    """Clone repo_url, check out pull request PRnumstr, pack it as le.tar.gz.

    The pull request head is fetched into a local branch named lePRtest.
    When merge_master is true, origin/master is merged into that branch
    before the tarball is made.
    """
    in_checkout = 'cd letsencrypt && '
    with lcd(LOGDIR):
        local('if [ -d letsencrypt ]; then rm -rf letsencrypt; fi')
        local('git clone %s letsencrypt'% repo_url)
        local(in_checkout + 'git fetch origin pull/%s/head:lePRtest'%PRnumstr)
        local(in_checkout + 'git checkout lePRtest')
        if merge_master:
            for git_step in ('git remote update origin',
                             'git merge origin/master -m "testmerge"'):
                local(in_checkout + git_step)
        local('tar czf le.tar.gz letsencrypt')
def local_repo_to_remote():
    """Upload the repo tarball from LOGDIR to the remote host and unpack it."""
    tarball = 'le.tar.gz'
    with lcd(LOGDIR):
        # the local path is resolved relative to LOGDIR because of lcd
        put(local_path=tarball, remote_path='')
        run('tar xzf le.tar.gz')
def local_repo_clean():
    """Delete the local repo tarball (le.tar.gz) from LOGDIR."""
    remove_tarball = 'rm le.tar.gz'
    with lcd(LOGDIR):
        local(remove_tarball)
2015-12-04 18:18:51 -05:00
def deploy_script(scriptpath, *args):
    """Copy a local script to the remote host and execute it there.

    scriptpath -- local path of the script; its file mode is mirrored on
                  the remote copy
    args       -- strings appended, space separated, to the command line
    """
    put(local_path=scriptpath, remote_path='', mirror_local_mode=True)
    script_name = os.path.basename(scriptpath)
    command = './%s %s' % (script_name, ' '.join(args))
    run(command)
2015-12-04 09:32:24 -05:00
def run_boulder():
    """Start boulder on the remote host with docker-compose, detached."""
    boulder_checkout = '$GOPATH/src/github.com/letsencrypt/boulder'
    with cd(boulder_checkout):
        run('sudo docker-compose up -d')
2015-12-04 09:32:24 -05:00
def config_and_launch_boulder(instance):
    """Run the boulder config script on the current host, then start boulder.

    The instance argument is accepted but not used; the target host is the
    one fabric is currently configured for.
    """
    boulder_setup_script = 'scripts/boulder_config.sh'
    execute(deploy_script, boulder_setup_script)
    execute(run_boulder)
2016-04-14 20:10:27 -04:00
def install_and_launch_certbot(instance, boulder_url, target):
    """Upload the repo to the current host and run the test script there.

    The script named by cl_args.test_script is run with environment
    variables describing the boulder server, the instance's addresses, the
    alternative pip index and the target's OS type.
    """
    execute(local_repo_to_remote)
    remote_env = dict(
        BOULDER_URL=boulder_url,
        PUBLIC_IP=instance.public_ip_address,
        PRIVATE_IP=instance.private_ip_address,
        PUBLIC_HOSTNAME=instance.public_dns_name,
        PIP_EXTRA_INDEX_URL=cl_args.alt_pip,
        OS_TYPE=target['type'])
    with shell_env(**remote_env):
        execute(deploy_script, cl_args.test_script)
2016-04-14 20:10:27 -04:00
def grab_certbot_log():
    """Cat the remote certbot logs so they land in the logged stdout.

    For each candidate log file a placeholder marker is echoed instead
    when the file does not exist.
    """
    cat_or_marker = 'if [ -f %s ]; then cat %s; else echo "[%s]"; fi'
    candidates = (
        ('/var/log/letsencrypt/letsencrypt.log', 'novarlog'),
        # fallback file if /var/log is unwriteable...? correct?
        ('./certbot.log', 'nolocallog'),
    )
    for log_path, missing_marker in candidates:
        sudo(cat_or_marker % (log_path, log_path, missing_marker))
2015-12-04 09:32:24 -05:00
def create_client_instance(ec2_client, target, security_group_id, subnet_id):
    """Create a single client instance for running tests.

    The instance type comes from target['machine_type'] when present;
    otherwise it depends on target['virt'] and on the --fast option. The
    instance is named 'le-<target name>', and that name is printed.
    """
    if 'machine_type' in target:
        machine_type = target['machine_type']
    else:
        if target['virt'] == 'hvm':
            fast_type, regular_type = 't2.medium', 't2.micro'
        else:
            # 32 bit systems
            fast_type, regular_type = 'c1.medium', 't1.micro'
        machine_type = fast_type if cl_args.fast else regular_type
    userdata = target.get('userdata', '')
    name = 'le-%s'%target['name']
    print(name, end=" ")
    return make_instance(ec2_client,
                         name,
                         target['ami'],
                         KEYNAME,
                         machine_type=machine_type,
                         security_group_id=security_group_id,
                         subnet_id=subnet_id,
                         userdata=userdata)
def test_client_process(inqueue, outqueue, boulder_url):
    """Worker loop: test every instance taken from inqueue until SENTINEL.

    inqueue     -- queue of (index, instance_id, target) requests
    outqueue    -- queue receiving (index, target, Status.PASS/FAIL)
    boulder_url -- boulder directory URL handed to the test script

    While a request is processed, stdout is redirected to
    LOGDIR/<index>_<target name>.log; the file is closed and stdout is
    restored when the request is done.
    """
    cur_proc = mp.current_process()
    for ii, instance_id, target in iter(inqueue.get, SENTINEL):
        # Each client process is given its own session due to the suggestion at
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html?highlight=multithreading#multithreading-multiprocessing.
        aws_session = boto3.session.Session(profile_name=PROFILE)
        ec2_client = aws_session.resource('ec2')
        instance = ec2_client.Instance(id=instance_id)

        # save all stdout to log file
        original_stdout = sys.stdout
        logfile = open(LOGDIR+'/'+'%d_%s.log'%(ii, target['name']), 'w')
        sys.stdout = logfile
        try:
            print("[%s : client %d %s %s]" % (cur_proc.name, ii, target['ami'], target['name']))
            instance = block_until_instance_ready(instance)
            print("server %s at %s"%(instance, instance.public_ip_address))
            env.host_string = "%s@%s"%(target['user'], instance.public_ip_address)
            print(env.host_string)

            # SystemExit is caught too: it is what fabric raises on failure
            # unless an abort exception has been configured.
            try:
                install_and_launch_certbot(instance, boulder_url, target)
                outqueue.put((ii, target, Status.PASS))
                print("%s - %s SUCCESS"%(target['ami'], target['name']))
            except (Exception, SystemExit):
                outqueue.put((ii, target, Status.FAIL))
                print("%s - %s FAIL"%(target['ami'], target['name']))
                traceback.print_exc(file=sys.stdout)

            # append server certbot.log to each per-machine output log
            print("\n\ncertbot.log\n" + "-"*80 + "\n")
            try:
                execute(grab_certbot_log)
            except (Exception, SystemExit):
                print("log fail\n")
                traceback.print_exc(file=sys.stdout)
        finally:
            # one log file per request: flush and release it, and do not
            # leave stdout pointing at a closed file
            sys.stdout = original_stdout
            logfile.close()
def cleanup(cl_args, instances, targetlist, boulder_server=None):
    """Terminate the test instances, or print how to log into them.

    cl_args        -- parsed command line options (saveinstances, killboulder)
    instances      -- client instances created for this run
    targetlist     -- targets from the config file, parallel to instances
    boulder_server -- boulder instance to terminate when --killboulder is
                      set; when None the boulder server is left alone
    """
    print('Logs in ', LOGDIR)
    # If lengths of instances and targetlist aren't equal, instances failed to
    # start before running tests so leaving instances running for debugging
    # isn't very useful. Let's cleanup after ourselves instead.
    if len(instances) != len(targetlist) or not cl_args.saveinstances:
        print('Terminating EC2 Instances')
        if cl_args.killboulder:
            if boulder_server is not None:
                boulder_server.terminate()
            else:
                print('No boulder server given, leaving it running')
        for instance in instances:
            instance.terminate()
    else:
        # print login information for the boxes for debugging
        for target, instance in zip(targetlist, instances):
            print(target['name'],
                  target['ami'],
                  "%s@%s"%(target['user'], instance.public_ip_address))


def main():
    """Run the whole integration test.

    Steps: prepare a local tarball of the git revision under test, find a
    subnet and security group, find or start the boulder server, start one
    client instance per target, run the test script on all clients in
    parallel worker processes, write a summary to LOGDIR/results and clean
    up. Exits with status 1 when the repo cannot be prepared, no subnet is
    usable, or any target failed or was not tested.
    """
    # Fabric library controlled through global env parameters
    env.key_filename = KEYFILE
    env.shell = '/bin/bash -l -i -c'
    env.connection_attempts = 5
    env.timeout = 10
    # replace default SystemExit thrown by fabric during trouble
    class FabricException(Exception):
        pass
    env['abort_exception'] = FabricException

    # Set up local copy of git repo
    #-------------------------------------------------------------------------------
    print("Making local dir for test repo and logs: %s"%LOGDIR)
    local('mkdir %s'%LOGDIR)

    # figure out what git object to test and locally create it in LOGDIR
    print("Making local git repo")
    try:
        if cl_args.pull_request != '~':
            print('Testing PR %s '%cl_args.pull_request,
                  "MERGING into master" if cl_args.merge_master else "")
            execute(local_git_PR, cl_args.repo, cl_args.pull_request, cl_args.merge_master)
        elif cl_args.branch != '~':
            print('Testing branch %s of %s'%(cl_args.branch, cl_args.repo))
            execute(local_git_branch, cl_args.repo, cl_args.branch)
        else:
            print('Testing master of %s'%cl_args.repo)
            execute(local_git_clone, cl_args.repo)
    except FabricException:
        print("FAIL: trouble with git repo")
        traceback.print_exc()
        # a failed setup must not look like a successful run
        sys.exit(1)

    # Set up EC2 instances
    #-------------------------------------------------------------------------------
    # safe_load: the config is plain data, and yaml.load without an explicit
    # Loader is rejected by current PyYAML releases.
    with open(cl_args.config_file, 'r') as config_file:
        configdata = yaml.safe_load(config_file)
    targetlist = configdata['targets']
    print('Testing against these images: [%d total]'%len(targetlist))
    for target in targetlist:
        print(target['ami'], target['name'])

    print("Connecting to EC2 using\n profile %s\n keyname %s\n keyfile %s"%(PROFILE, KEYNAME, KEYFILE))
    aws_session = boto3.session.Session(profile_name=PROFILE)
    ec2_client = aws_session.resource('ec2')

    print("Determining Subnet")
    for subnet in ec2_client.subnets.all():
        if should_use_subnet(subnet):
            subnet_id = subnet.id
            vpc_id = subnet.vpc.id
            break
    else:
        print("No usable subnet exists!")
        print("Please create a VPC with a subnet named {0}".format(SUBNET_NAME))
        print("that maps public IPv4 addresses to instances launched in the subnet.")
        sys.exit(1)

    print("Making Security Group")
    vpc = ec2_client.Vpc(vpc_id)
    sg_exists = False
    for sg in vpc.security_groups.all():
        if sg.group_name == SECURITY_GROUP_NAME:
            security_group_id = sg.id
            sg_exists = True
            print(" %s already exists"%SECURITY_GROUP_NAME)
    if not sg_exists:
        security_group_id = make_security_group(vpc).id
        # pause after creating the group before it is used for launches
        time.sleep(30)

    boulder_preexists = False
    boulder_servers = ec2_client.instances.filter(Filters=[
        {'Name': 'tag:Name', 'Values': ['le-boulderserver']},
        {'Name': 'instance-state-name', 'Values': ['running']}])

    boulder_server = next(iter(boulder_servers), None)

    print("Requesting Instances...")
    if boulder_server:
        print("Found existing boulder server:", boulder_server)
        boulder_preexists = True
    else:
        print("Can't find a boulder server, starting one...")
        boulder_server = make_instance(ec2_client,
                                       'le-boulderserver',
                                       BOULDER_AMI,
                                       KEYNAME,
                                       machine_type='t2.micro',
                                       #machine_type='t2.medium',
                                       security_group_id=security_group_id,
                                       subnet_id=subnet_id)

    instances = []
    try:
        if not cl_args.boulderonly:
            print("Creating instances: ", end="")
            for target in targetlist:
                instances.append(
                    create_client_instance(ec2_client, target,
                                           security_group_id, subnet_id)
                )
            print()

        # Configure and launch boulder server
        #-------------------------------------------------------------------------------
        print("Waiting on Boulder Server")
        boulder_server = block_until_instance_ready(boulder_server)
        print(" server %s"%boulder_server)

        # env.host_string defines the ssh user and host for connection
        env.host_string = "ubuntu@%s"%boulder_server.public_ip_address
        print("Boulder Server at (SSH):", env.host_string)
        if not boulder_preexists:
            print("Configuring and Launching Boulder")
            config_and_launch_boulder(boulder_server)
            # blocking often unnecessary, but cheap EC2 VMs can get very slow
            block_until_http_ready('http://%s:4000'%boulder_server.public_ip_address,
                                   wait_time=10, timeout=500)

        boulder_url = "http://%s:4000/directory"%boulder_server.private_ip_address
        print("Boulder Server at (public ip): http://%s:4000/directory"%boulder_server.public_ip_address)
        print("Boulder Server at (EC2 private ip): %s"%boulder_url)

        if cl_args.boulderonly:
            sys.exit(0)

        # Install and launch client scripts in parallel
        #-------------------------------------------------------------------------------
        print("Uploading and running test script in parallel: %s"%cl_args.test_script)
        print("Output routed to log files in %s"%LOGDIR)
        # (Advice: always use Manager.Queue, never regular multiprocessing.Queue
        # the latter has implementation flaws that deadlock it in some circumstances)
        manager = Manager()
        outqueue = manager.Queue()
        inqueue = manager.Queue()

        # launch as many processes as clients to test
        num_processes = len(targetlist)
        jobs = []  # keep a reference to current procs

        # initiate process execution
        for _ in range(num_processes):
            p = mp.Process(target=test_client_process, args=(inqueue, outqueue, boulder_url))
            jobs.append(p)
            p.daemon = True  # kills subprocesses if parent is killed
            p.start()

        # fill up work queue
        for ii, target in enumerate(targetlist):
            inqueue.put((ii, instances[ii].id, target))

        # add SENTINELs to end client processes
        for _ in range(num_processes):
            inqueue.put(SENTINEL)
        print('Waiting on client processes', end='')
        for p in jobs:
            while p.is_alive():
                p.join(5 * 60)
                # Regularly print output to keep Travis happy
                print('.', end='')
                sys.stdout.flush()
        print()
        # add SENTINEL to output queue
        outqueue.put(SENTINEL)

        # clean up
        execute(local_repo_clean)

        # print and save summary results
        with open(LOGDIR+'/results', 'w') as results_file:
            outputs = list(iter(outqueue.get, SENTINEL))
            outputs.sort(key=lambda x: x[0])
            failed = False
            for ii, target, status in outputs:
                if status == Status.FAIL:
                    failed = True
                print('%d %s %s'%(ii, target['name'], status))
                results_file.write('%d %s %s\n'%(ii, target['name'], status))
            # a worker that died before reporting leaves its target untested
            if len(outputs) != num_processes:
                failed = True
                failure_message = 'FAILURE: Some target machines failed to run and were not tested. ' +\
                    'Tests should be rerun.'
                print(failure_message)
                results_file.write(failure_message + '\n')

        if failed:
            sys.exit(1)

    finally:
        # The boulder server is a local of this function, so it has to be
        # handed over for --killboulder to be able to terminate it.
        cleanup(cl_args, instances, targetlist, boulder_server)

        # kill any connections
        fabric.network.disconnect_all()
# Start the test run only when this file is executed as a script.
if __name__ == '__main__':
    main()