diff --git a/discovery/aws/ecs.go b/discovery/aws/ecs.go index d6b36a7980..1d5ff366de 100644 --- a/discovery/aws/ecs.go +++ b/discovery/aws/ecs.go @@ -28,6 +28,7 @@ import ( "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/credentials/stscreds" "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" + "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/ecs" "github.com/aws/aws-sdk-go-v2/service/ecs/types" "github.com/aws/aws-sdk-go-v2/service/sts" @@ -44,31 +45,37 @@ import ( ) const ( - ecsLabel = model.MetaLabelPrefix + "ecs_" - ecsLabelCluster = ecsLabel + "cluster" - ecsLabelClusterARN = ecsLabel + "cluster_arn" - ecsLabelService = ecsLabel + "service" - ecsLabelServiceARN = ecsLabel + "service_arn" - ecsLabelServiceStatus = ecsLabel + "service_status" - ecsLabelTaskGroup = ecsLabel + "task_group" - ecsLabelTaskARN = ecsLabel + "task_arn" - ecsLabelTaskDefinition = ecsLabel + "task_definition" - ecsLabelRegion = ecsLabel + "region" - ecsLabelAvailabilityZone = ecsLabel + "availability_zone" - ecsLabelAZID = ecsLabel + "availability_zone_id" - ecsLabelSubnetID = ecsLabel + "subnet_id" - ecsLabelIPAddress = ecsLabel + "ip_address" - ecsLabelLaunchType = ecsLabel + "launch_type" - ecsLabelDesiredStatus = ecsLabel + "desired_status" - ecsLabelLastStatus = ecsLabel + "last_status" - ecsLabelHealthStatus = ecsLabel + "health_status" - ecsLabelPlatformFamily = ecsLabel + "platform_family" - ecsLabelPlatformVersion = ecsLabel + "platform_version" - ecsLabelTag = ecsLabel + "tag_" - ecsLabelTagCluster = ecsLabelTag + "cluster_" - ecsLabelTagService = ecsLabelTag + "service_" - ecsLabelTagTask = ecsLabelTag + "task_" - ecsLabelSeparator = "," + ecsLabel = model.MetaLabelPrefix + "ecs_" + ecsLabelCluster = ecsLabel + "cluster" + ecsLabelClusterARN = ecsLabel + "cluster_arn" + ecsLabelService = ecsLabel + "service" + ecsLabelServiceARN = ecsLabel + "service_arn" + ecsLabelServiceStatus = ecsLabel + "service_status" + 
ecsLabelTaskGroup = ecsLabel + "task_group" + ecsLabelTaskARN = ecsLabel + "task_arn" + ecsLabelTaskDefinition = ecsLabel + "task_definition" + ecsLabelRegion = ecsLabel + "region" + ecsLabelAvailabilityZone = ecsLabel + "availability_zone" + ecsLabelSubnetID = ecsLabel + "subnet_id" + ecsLabelIPAddress = ecsLabel + "ip_address" + ecsLabelLaunchType = ecsLabel + "launch_type" + ecsLabelDesiredStatus = ecsLabel + "desired_status" + ecsLabelLastStatus = ecsLabel + "last_status" + ecsLabelHealthStatus = ecsLabel + "health_status" + ecsLabelPlatformFamily = ecsLabel + "platform_family" + ecsLabelPlatformVersion = ecsLabel + "platform_version" + ecsLabelTag = ecsLabel + "tag_" + ecsLabelTagCluster = ecsLabelTag + "cluster_" + ecsLabelTagService = ecsLabelTag + "service_" + ecsLabelTagTask = ecsLabelTag + "task_" + ecsLabelTagEC2 = ecsLabelTag + "ec2_" + ecsLabelNetworkMode = ecsLabel + "network_mode" + ecsLabelContainerInstanceARN = ecsLabel + "container_instance_arn" + ecsLabelEC2InstanceID = ecsLabel + "ec2_instance_id" + ecsLabelEC2InstanceType = ecsLabel + "ec2_instance_type" + ecsLabelEC2InstancePrivateIP = ecsLabel + "ec2_instance_private_ip" + ecsLabelEC2InstancePublicIP = ecsLabel + "ec2_instance_public_ip" + ecsLabelPublicIP = ecsLabel + "public_ip" ) // DefaultECSSDConfig is the default ECS SD configuration. 
@@ -153,6 +160,12 @@ type ecsClient interface { DescribeServices(context.Context, *ecs.DescribeServicesInput, ...func(*ecs.Options)) (*ecs.DescribeServicesOutput, error) ListTasks(context.Context, *ecs.ListTasksInput, ...func(*ecs.Options)) (*ecs.ListTasksOutput, error) DescribeTasks(context.Context, *ecs.DescribeTasksInput, ...func(*ecs.Options)) (*ecs.DescribeTasksOutput, error) + DescribeContainerInstances(context.Context, *ecs.DescribeContainerInstancesInput, ...func(*ecs.Options)) (*ecs.DescribeContainerInstancesOutput, error) +} +
+// ecsEC2Client is the subset of the EC2 API that ECS discovery depends on.
+// DescribeInstances is used to look up the EC2 hosts behind container
+// instances, and DescribeNetworkInterfaces to look up ENI public IPs. Keeping
+// it as a narrow interface lets tests substitute a mock for the real client.
+type ecsEC2Client interface {
+	DescribeInstances(context.Context, *ec2.DescribeInstancesInput, ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error)
+	DescribeNetworkInterfaces(context.Context, *ec2.DescribeNetworkInterfacesInput, ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error)
 } // ECSDiscovery periodically performs ECS-SD requests. It implements @@ -162,6 +175,7 @@ type ECSDiscovery struct { logger *slog.Logger cfg *ECSSDConfig ecs ecsClient + ec2 ecsEC2Client } // NewECSDiscovery returns a new ECSDiscovery which periodically refreshes its targets. 
@@ -191,7 +205,7 @@ func NewECSDiscovery(conf *ECSSDConfig, opts discovery.DiscovererOptions) (*ECSD } func (d *ECSDiscovery) initEcsClient(ctx context.Context) error { - if d.ecs != nil { + if d.ecs != nil && d.ec2 != nil { return nil } @@ -240,6 +254,10 @@ func (d *ECSDiscovery) initEcsClient(ctx context.Context) error { options.HTTPClient = client }) + d.ec2 = ec2.NewFromConfig(cfg, func(options *ec2.Options) { + options.HTTPClient = client + }) + // Test credentials by making a simple API call testCtx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() @@ -458,6 +476,113 @@ func (d *ECSDiscovery) describeTasks(ctx context.Context, clusterARN string, tas return tasks, errg.Wait() }
+
+// describeContainerInstances returns a map of container instance ARN to EC2 instance ID.
+//
+// refresh collects one ARN per task, so tasks that share a host repeat the same
+// ARN. The input is therefore de-duplicated (first-seen order is kept) before it
+// is split into batches of at most 100 ARNs, the DescribeContainerInstances
+// per-request limit. Container instances returned without an ARN or without an
+// EC2 instance ID are left out of the result. The first API error aborts the
+// lookup and is returned wrapped; the returned map is nil in that case.
+func (d *ECSDiscovery) describeContainerInstances(ctx context.Context, clusterARN string, containerInstanceARNs []string) (map[string]string, error) {
+	const batchSize = 100 // AWS API limit
+
+	// Drop repeated ARNs so that shared hosts do not inflate the number of API calls.
+	seen := make(map[string]struct{}, len(containerInstanceARNs))
+	uniqueARNs := make([]string, 0, len(containerInstanceARNs))
+	for _, arn := range containerInstanceARNs {
+		if _, dup := seen[arn]; dup {
+			continue
+		}
+		seen[arn] = struct{}{}
+		uniqueARNs = append(uniqueARNs, arn)
+	}
+
+	containerInstToEC2 := make(map[string]string, len(uniqueARNs))
+	if len(uniqueARNs) == 0 {
+		// Nothing to look up: return an empty (non-nil) map without calling the API.
+		return containerInstToEC2, nil
+	}
+
+	for _, batch := range batchSlice(uniqueARNs, batchSize) {
+		resp, err := d.ecs.DescribeContainerInstances(ctx, &ecs.DescribeContainerInstancesInput{
+			Cluster:            aws.String(clusterARN),
+			ContainerInstances: batch,
+		})
+		if err != nil {
+			return nil, fmt.Errorf("could not describe container instances: %w", err)
+		}
+
+		for _, ci := range resp.ContainerInstances {
+			if ci.ContainerInstanceArn == nil || ci.Ec2InstanceId == nil {
+				continue
+			}
+			containerInstToEC2[*ci.ContainerInstanceArn] = *ci.Ec2InstanceId
+		}
+	}
+
+	return containerInstToEC2, nil
+}
+
+// ec2InstanceInfo holds information retrieved from EC2 DescribeInstances.
+type ec2InstanceInfo struct {
+	// privateIP is the instance's PrivateIpAddress; always set for stored entries.
+	privateIP string
+	// publicIP is the instance's PublicIpAddress, or "" when the instance has none.
+	publicIP string
+	// subnetID is the instance's SubnetId, or "" when not reported.
+	subnetID string
+	// instanceType is the instance type as a plain string (e.g. "t3.medium").
+	instanceType string
+	// tags holds the instance tags whose key and value are both non-nil.
+	tags map[string]string
+}
+
+// describeEC2Instances returns a map of EC2 instance ID to instance information.
+//
+// All requested IDs are sent in one DescribeInstances query and any NextToken
+// in the response is followed, so a paginated answer is not silently
+// truncated. Instances that come back without an instance ID or without a
+// private IP address are skipped. An empty input yields an empty (non-nil) map
+// without calling the API; on an API error the error is returned wrapped and
+// the map is nil.
+func (d *ECSDiscovery) describeEC2Instances(ctx context.Context, instanceIDs []string) (map[string]ec2InstanceInfo, error) {
+	if len(instanceIDs) == 0 {
+		return make(map[string]ec2InstanceInfo), nil
+	}
+
+	instanceInfo := make(map[string]ec2InstanceInfo, len(instanceIDs))
+	input := &ec2.DescribeInstancesInput{InstanceIds: instanceIDs}
+
+	for {
+		resp, err := d.ec2.DescribeInstances(ctx, input)
+		if err != nil {
+			return nil, fmt.Errorf("could not describe EC2 instances: %w", err)
+		}
+
+		for _, reservation := range resp.Reservations {
+			for _, instance := range reservation.Instances {
+				if instance.InstanceId == nil || instance.PrivateIpAddress == nil {
+					continue
+				}
+
+				info := ec2InstanceInfo{
+					privateIP:    *instance.PrivateIpAddress,
+					instanceType: string(instance.InstanceType),
+					tags:         make(map[string]string, len(instance.Tags)),
+				}
+				if instance.PublicIpAddress != nil {
+					info.publicIP = *instance.PublicIpAddress
+				}
+				if instance.SubnetId != nil {
+					info.subnetID = *instance.SubnetId
+				}
+				// Collect EC2 instance tags, ignoring entries with a nil key or value.
+				for _, tag := range instance.Tags {
+					if tag.Key != nil && tag.Value != nil {
+						info.tags[*tag.Key] = *tag.Value
+					}
+				}
+				instanceInfo[*instance.InstanceId] = info
+			}
+		}
+
+		// A nil or empty token means this was the last page.
+		if resp.NextToken == nil || *resp.NextToken == "" {
+			break
+		}
+		input.NextToken = resp.NextToken
+	}
+
+	return instanceInfo, nil
+}
+
+// describeNetworkInterfaces returns a map of ENI ID to public IP address.
+//
+// All requested ENIs are sent in one DescribeNetworkInterfaces query and any
+// NextToken in the response is followed, so a paginated answer is not silently
+// truncated. ENIs without an ID or without an associated public IP are omitted
+// from the map. An empty input yields an empty (non-nil) map without calling
+// the API; on an API error the error is returned wrapped and the map is nil.
+func (d *ECSDiscovery) describeNetworkInterfaces(ctx context.Context, eniIDs []string) (map[string]string, error) {
+	if len(eniIDs) == 0 {
+		return make(map[string]string), nil
+	}
+
+	eniToPublicIP := make(map[string]string, len(eniIDs))
+	input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: eniIDs}
+
+	for {
+		resp, err := d.ec2.DescribeNetworkInterfaces(ctx, input)
+		if err != nil {
+			return nil, fmt.Errorf("could not describe network interfaces: %w", err)
+		}
+
+		for _, eni := range resp.NetworkInterfaces {
+			if eni.NetworkInterfaceId != nil && eni.Association != nil && eni.Association.PublicIp != nil {
+				eniToPublicIP[*eni.NetworkInterfaceId] = *eni.Association.PublicIp
+			}
+		}
+
+		// A nil or empty token means this was the last page.
+		if resp.NextToken == nil || *resp.NextToken == "" {
+			break
+		}
+		input.NextToken = resp.NextToken
+	}
+
+	return eniToPublicIP, nil
+}
+
 func batchSlice[T any](a []T, size int) [][]T { batches := make([][]T, 0, len(a)/size+1) for i := 0; i < len(a); i += size { @@ -554,8 +679,76 @@ func (d *ECSDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error if tasks, exists := serviceTaskMap[serviceArn]; exists { var serviceTargets []model.LabelSet + // Collect container instance ARNs for all EC2 tasks to get instance type + var containerInstanceARNs []string + taskToContainerInstance := make(map[string]string) + // Collect ENI IDs for awsvpc tasks to get public IPs + var eniIDs []string + taskToENI := make(map[string]string) + for _, task := range tasks { - // Find the ENI attachment to get the private IP address + // Collect container instance ARN for any task running on EC2 + if task.ContainerInstanceArn != nil { + containerInstanceARNs = append(containerInstanceARNs, *task.ContainerInstanceArn) + taskToContainerInstance[*task.TaskArn] = *task.ContainerInstanceArn + } + + // Collect ENI IDs from awsvpc tasks + for _, attachment := range task.Attachments { + if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { + for _, detail := range attachment.Details { + if detail.Name != nil && *detail.Name == "networkInterfaceId" && detail.Value != nil { + eniIDs = append(eniIDs, 
*detail.Value) + taskToENI[*task.TaskArn] = *detail.Value + break + } + } + break + } + } + } + + // Batch describe container instances and EC2 instances to get instance type and other metadata + var containerInstToEC2 map[string]string + var ec2InstInfo map[string]ec2InstanceInfo + if len(containerInstanceARNs) > 0 { + var err error + containerInstToEC2, err = d.describeContainerInstances(ctx, clusterArn, containerInstanceARNs) + if err != nil { + d.logger.Error("Failed to describe container instances", "cluster", clusterArn, "error", err) + // Continue processing tasks + } else { + // Collect unique EC2 instance IDs + ec2InstanceIDs := make([]string, 0, len(containerInstToEC2)) + for _, ec2ID := range containerInstToEC2 { + ec2InstanceIDs = append(ec2InstanceIDs, ec2ID) + } + + // Batch describe EC2 instances + ec2InstInfo, err = d.describeEC2Instances(ctx, ec2InstanceIDs) + if err != nil { + d.logger.Error("Failed to describe EC2 instances", "cluster", clusterArn, "error", err) + } + } + } + + // Batch describe ENIs to get public IPs for awsvpc tasks + var eniToPublicIP map[string]string + if len(eniIDs) > 0 { + var err error + eniToPublicIP, err = d.describeNetworkInterfaces(ctx, eniIDs) + if err != nil { + d.logger.Error("Failed to describe network interfaces", "cluster", clusterArn, "error", err) + // Continue processing without ENI public IPs + } + } + + for _, task := range tasks { + var ipAddress, subnetID, publicIP string + var networkMode string + var ec2InstanceID, ec2InstanceType, ec2InstancePrivateIP, ec2InstancePublicIP string + + // Try to get IP from ENI attachment (awsvpc mode) var eniAttachment *types.Attachment for _, attachment := range task.Attachments { if attachment.Type != nil && *attachment.Type == "ElasticNetworkInterface" { @@ -563,19 +756,65 @@ func (d *ECSDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error break } } - if eniAttachment == nil { - continue - } - var ipAddress, subnetID string - for _, detail := range 
eniAttachment.Details { - switch *detail.Name { - case "privateIPv4Address": - ipAddress = *detail.Value - case "subnetId": - subnetID = *detail.Value + if eniAttachment != nil { + // awsvpc networking mode - get IP from ENI + networkMode = "awsvpc" + for _, detail := range eniAttachment.Details { + switch *detail.Name { + case "privateIPv4Address": + ipAddress = *detail.Value + case "subnetId": + subnetID = *detail.Value + } + } + // Get public IP from ENI if available + if eniID, ok := taskToENI[*task.TaskArn]; ok { + if eniPublicIP, ok := eniToPublicIP[eniID]; ok { + publicIP = eniPublicIP + } + } + } else if task.ContainerInstanceArn != nil { + // bridge/host networking mode - need to get EC2 instance IP and subnet + networkMode = "bridge" + containerInstARN, ok := taskToContainerInstance[*task.TaskArn] + if ok { + ec2InstanceID, ok = containerInstToEC2[containerInstARN] + if ok { + info, ok := ec2InstInfo[ec2InstanceID] + if ok { + ipAddress = info.privateIP + publicIP = info.publicIP + subnetID = info.subnetID + ec2InstanceType = info.instanceType + ec2InstancePrivateIP = info.privateIP + ec2InstancePublicIP = info.publicIP + } else { + d.logger.Debug("EC2 instance info not found", "instance", ec2InstanceID, "task", *task.TaskArn) + } + } else { + d.logger.Debug("Container instance not found in map", "arn", containerInstARN, "task", *task.TaskArn) + } } } + + // Get EC2 instance metadata for awsvpc tasks running on EC2 + // We want the instance type and the host IPs for advanced use cases + if networkMode == "awsvpc" && task.ContainerInstanceArn != nil { + containerInstARN, ok := taskToContainerInstance[*task.TaskArn] + if ok { + ec2InstanceID, ok = containerInstToEC2[containerInstARN] + if ok { + info, ok := ec2InstInfo[ec2InstanceID] + if ok { + ec2InstanceType = info.instanceType + ec2InstancePrivateIP = info.privateIP + ec2InstancePublicIP = info.publicIP + } + } + } + } + if ipAddress == "" { continue } @@ -589,13 +828,38 @@ func (d *ECSDiscovery) 
refresh(ctx context.Context) ([]*targetgroup.Group, error ecsLabelTaskARN: model.LabelValue(*task.TaskArn), ecsLabelTaskDefinition: model.LabelValue(*task.TaskDefinitionArn), ecsLabelIPAddress: model.LabelValue(ipAddress), - ecsLabelSubnetID: model.LabelValue(subnetID), ecsLabelRegion: model.LabelValue(d.cfg.Region), ecsLabelLaunchType: model.LabelValue(task.LaunchType), ecsLabelAvailabilityZone: model.LabelValue(*task.AvailabilityZone), ecsLabelDesiredStatus: model.LabelValue(*task.DesiredStatus), ecsLabelLastStatus: model.LabelValue(*task.LastStatus), ecsLabelHealthStatus: model.LabelValue(task.HealthStatus), + ecsLabelNetworkMode: model.LabelValue(networkMode), + } + + // Add subnet ID when available (awsvpc mode from ENI, bridge/host from EC2 instance) + if subnetID != "" { + labels[ecsLabelSubnetID] = model.LabelValue(subnetID) + } + + // Add container instance and EC2 instance info for EC2 launch type + if task.ContainerInstanceArn != nil { + labels[ecsLabelContainerInstanceARN] = model.LabelValue(*task.ContainerInstanceArn) + } + if ec2InstanceID != "" { + labels[ecsLabelEC2InstanceID] = model.LabelValue(ec2InstanceID) + } + if ec2InstanceType != "" { + labels[ecsLabelEC2InstanceType] = model.LabelValue(ec2InstanceType) + } + if ec2InstancePrivateIP != "" { + labels[ecsLabelEC2InstancePrivateIP] = model.LabelValue(ec2InstancePrivateIP) + } + if ec2InstancePublicIP != "" { + labels[ecsLabelEC2InstancePublicIP] = model.LabelValue(ec2InstancePublicIP) + } + if publicIP != "" { + labels[ecsLabelPublicIP] = model.LabelValue(publicIP) } if task.PlatformFamily != nil { @@ -634,6 +898,15 @@ func (d *ECSDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, error } } + // Add EC2 instance tags (if running on EC2) + if ec2InstanceID != "" { + if info, ok := ec2InstInfo[ec2InstanceID]; ok { + for tagKey, tagValue := range info.tags { + labels[model.LabelName(ecsLabelTagEC2+strutil.SanitizeLabelName(tagKey))] = model.LabelValue(tagValue) + } + } + } + 
serviceTargets = append(serviceTargets, labels) } diff --git a/discovery/aws/ecs_test.go b/discovery/aws/ecs_test.go index 60138a01c7..1cb48b27fa 100644 --- a/discovery/aws/ecs_test.go +++ b/discovery/aws/ecs_test.go @@ -17,6 +17,8 @@ import ( "context" "testing" + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2Types "github.com/aws/aws-sdk-go-v2/service/ec2/types" "github.com/aws/aws-sdk-go-v2/service/ecs" ecsTypes "github.com/aws/aws-sdk-go-v2/service/ecs/types" "github.com/prometheus/common/model" @@ -29,9 +31,12 @@ import ( type ecsDataStore struct { region string - clusters []ecsTypes.Cluster - services []ecsTypes.Service - tasks []ecsTypes.Task + clusters []ecsTypes.Cluster + services []ecsTypes.Service + tasks []ecsTypes.Task + containerInstances []ecsTypes.ContainerInstance + ec2Instances map[string]ec2InstanceInfo // EC2 instance ID to instance info + eniPublicIPs map[string]string // ENI ID to public IP } func TestECSDiscoveryListClusterARNs(t *testing.T) { @@ -716,6 +721,7 @@ func TestECSDiscoveryRefresh(t *testing.T) { Details: []ecsTypes.KeyValuePair{ {Name: strptr("subnetId"), Value: strptr("subnet-12345")}, {Name: strptr("privateIPv4Address"), Value: strptr("10.0.1.100")}, + {Name: strptr("networkInterfaceId"), Value: strptr("eni-fargate-123")}, }, }, }, @@ -724,6 +730,9 @@ func TestECSDiscoveryRefresh(t *testing.T) { }, }, }, + eniPublicIPs: map[string]string{ + "eni-fargate-123": "52.1.2.3", + }, }, expected: []*targetgroup.Group{ { @@ -749,6 +758,8 @@ func TestECSDiscoveryRefresh(t *testing.T) { "__meta_ecs_health_status": model.LabelValue("HEALTHY"), "__meta_ecs_platform_family": model.LabelValue("Linux"), "__meta_ecs_platform_version": model.LabelValue("1.4.0"), + "__meta_ecs_network_mode": model.LabelValue("awsvpc"), + "__meta_ecs_public_ip": model.LabelValue("52.1.2.3"), "__meta_ecs_tag_cluster_Environment": model.LabelValue("test"), "__meta_ecs_tag_service_App": model.LabelValue("web"), "__meta_ecs_tag_task_Version": 
model.LabelValue("v1.0"), @@ -825,14 +836,345 @@ func TestECSDiscoveryRefresh(t *testing.T) { }, }, }, + { + name: "TaskWithBridgeNetworking", + ecsData: &ecsDataStore{ + region: "us-west-2", + clusters: []ecsTypes.Cluster{ + { + ClusterName: strptr("test-cluster"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), + }, + }, + services: []ecsTypes.Service{ + { + ServiceName: strptr("bridge-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/bridge-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + Status: strptr("ACTIVE"), + }, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-bridge"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/bridge-task:1"), + Group: strptr("service:bridge-service"), + LaunchType: ecsTypes.LaunchTypeEc2, + LastStatus: strptr("RUNNING"), + DesiredStatus: strptr("RUNNING"), + HealthStatus: ecsTypes.HealthStatusHealthy, + AvailabilityZone: strptr("us-west-2a"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + Attachments: []ecsTypes.Attachment{}, + }, + }, + containerInstances: []ecsTypes.ContainerInstance{ + { + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + Ec2InstanceId: strptr("i-1234567890abcdef0"), + Status: strptr("ACTIVE"), + }, + }, + ec2Instances: map[string]ec2InstanceInfo{ + "i-1234567890abcdef0": { + privateIP: "10.0.1.50", + publicIP: "54.1.2.3", + subnetID: "subnet-bridge-1", + instanceType: "t3.medium", + tags: map[string]string{ + "Name": "ecs-host-1", + "Environment": "production", + }, + }, + }, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + 
model.AddressLabel: model.LabelValue("10.0.1.50:80"), + "__meta_ecs_cluster": model.LabelValue("test-cluster"), + "__meta_ecs_cluster_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:cluster/test-cluster"), + "__meta_ecs_service": model.LabelValue("bridge-service"), + "__meta_ecs_service_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:service/test-cluster/bridge-service"), + "__meta_ecs_service_status": model.LabelValue("ACTIVE"), + "__meta_ecs_task_group": model.LabelValue("service:bridge-service"), + "__meta_ecs_task_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/task-bridge"), + "__meta_ecs_task_definition": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task-definition/bridge-task:1"), + "__meta_ecs_region": model.LabelValue("us-west-2"), + "__meta_ecs_availability_zone": model.LabelValue("us-west-2a"), + "__meta_ecs_ip_address": model.LabelValue("10.0.1.50"), + "__meta_ecs_subnet_id": model.LabelValue("subnet-bridge-1"), + "__meta_ecs_launch_type": model.LabelValue("EC2"), + "__meta_ecs_desired_status": model.LabelValue("RUNNING"), + "__meta_ecs_last_status": model.LabelValue("RUNNING"), + "__meta_ecs_health_status": model.LabelValue("HEALTHY"), + "__meta_ecs_network_mode": model.LabelValue("bridge"), + "__meta_ecs_container_instance_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:container-instance/test-cluster/abc123"), + "__meta_ecs_ec2_instance_id": model.LabelValue("i-1234567890abcdef0"), + "__meta_ecs_ec2_instance_type": model.LabelValue("t3.medium"), + "__meta_ecs_ec2_instance_private_ip": model.LabelValue("10.0.1.50"), + "__meta_ecs_ec2_instance_public_ip": model.LabelValue("54.1.2.3"), + "__meta_ecs_public_ip": model.LabelValue("54.1.2.3"), + "__meta_ecs_tag_ec2_Name": model.LabelValue("ecs-host-1"), + "__meta_ecs_tag_ec2_Environment": model.LabelValue("production"), + }, + }, + }, + }, + }, + { + name: "MixedNetworkingModes", + ecsData: &ecsDataStore{ + region: "us-west-2", + 
clusters: []ecsTypes.Cluster{ + { + ClusterName: strptr("mixed-cluster"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + Status: strptr("ACTIVE"), + }, + }, + services: []ecsTypes.Service{ + { + ServiceName: strptr("mixed-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/mixed-cluster/mixed-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + Status: strptr("ACTIVE"), + }, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/mixed-cluster/task-awsvpc"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/awsvpc-task:1"), + Group: strptr("service:mixed-service"), + LaunchType: ecsTypes.LaunchTypeFargate, + LastStatus: strptr("RUNNING"), + DesiredStatus: strptr("RUNNING"), + HealthStatus: ecsTypes.HealthStatusHealthy, + AvailabilityZone: strptr("us-west-2a"), + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("subnetId"), Value: strptr("subnet-12345")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.2.100")}, + {Name: strptr("networkInterfaceId"), Value: strptr("eni-mixed-awsvpc")}, + }, + }, + }, + }, + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/mixed-cluster/task-bridge"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/bridge-task:1"), + Group: strptr("service:mixed-service"), + LaunchType: ecsTypes.LaunchTypeEc2, + LastStatus: strptr("RUNNING"), + DesiredStatus: strptr("RUNNING"), + HealthStatus: ecsTypes.HealthStatusHealthy, + AvailabilityZone: strptr("us-west-2b"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/mixed-cluster/xyz789"), + 
Attachments: []ecsTypes.Attachment{}, + }, + }, + containerInstances: []ecsTypes.ContainerInstance{ + { + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/mixed-cluster/xyz789"), + Ec2InstanceId: strptr("i-0987654321fedcba0"), + Status: strptr("ACTIVE"), + }, + }, + ec2Instances: map[string]ec2InstanceInfo{ + "i-0987654321fedcba0": { + privateIP: "10.0.1.75", + publicIP: "54.2.3.4", + subnetID: "subnet-bridge-2", + instanceType: "t3.large", + tags: map[string]string{ + "Name": "mixed-host", + "Team": "platform", + }, + }, + }, + eniPublicIPs: map[string]string{ + "eni-mixed-awsvpc": "52.2.3.4", + }, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("10.0.2.100:80"), + "__meta_ecs_cluster": model.LabelValue("mixed-cluster"), + "__meta_ecs_cluster_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + "__meta_ecs_service": model.LabelValue("mixed-service"), + "__meta_ecs_service_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:service/mixed-cluster/mixed-service"), + "__meta_ecs_service_status": model.LabelValue("ACTIVE"), + "__meta_ecs_task_group": model.LabelValue("service:mixed-service"), + "__meta_ecs_task_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task/mixed-cluster/task-awsvpc"), + "__meta_ecs_task_definition": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task-definition/awsvpc-task:1"), + "__meta_ecs_region": model.LabelValue("us-west-2"), + "__meta_ecs_availability_zone": model.LabelValue("us-west-2a"), + "__meta_ecs_ip_address": model.LabelValue("10.0.2.100"), + "__meta_ecs_subnet_id": model.LabelValue("subnet-12345"), + "__meta_ecs_launch_type": model.LabelValue("FARGATE"), + "__meta_ecs_desired_status": model.LabelValue("RUNNING"), + "__meta_ecs_last_status": model.LabelValue("RUNNING"), + "__meta_ecs_health_status": model.LabelValue("HEALTHY"), + 
"__meta_ecs_network_mode": model.LabelValue("awsvpc"), + "__meta_ecs_public_ip": model.LabelValue("52.2.3.4"), + }, + { + model.AddressLabel: model.LabelValue("10.0.1.75:80"), + "__meta_ecs_cluster": model.LabelValue("mixed-cluster"), + "__meta_ecs_cluster_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:cluster/mixed-cluster"), + "__meta_ecs_service": model.LabelValue("mixed-service"), + "__meta_ecs_service_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:service/mixed-cluster/mixed-service"), + "__meta_ecs_service_status": model.LabelValue("ACTIVE"), + "__meta_ecs_task_group": model.LabelValue("service:mixed-service"), + "__meta_ecs_task_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task/mixed-cluster/task-bridge"), + "__meta_ecs_task_definition": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task-definition/bridge-task:1"), + "__meta_ecs_region": model.LabelValue("us-west-2"), + "__meta_ecs_availability_zone": model.LabelValue("us-west-2b"), + "__meta_ecs_ip_address": model.LabelValue("10.0.1.75"), + "__meta_ecs_subnet_id": model.LabelValue("subnet-bridge-2"), + "__meta_ecs_launch_type": model.LabelValue("EC2"), + "__meta_ecs_desired_status": model.LabelValue("RUNNING"), + "__meta_ecs_last_status": model.LabelValue("RUNNING"), + "__meta_ecs_health_status": model.LabelValue("HEALTHY"), + "__meta_ecs_network_mode": model.LabelValue("bridge"), + "__meta_ecs_container_instance_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:container-instance/mixed-cluster/xyz789"), + "__meta_ecs_ec2_instance_id": model.LabelValue("i-0987654321fedcba0"), + "__meta_ecs_ec2_instance_type": model.LabelValue("t3.large"), + "__meta_ecs_ec2_instance_private_ip": model.LabelValue("10.0.1.75"), + "__meta_ecs_ec2_instance_public_ip": model.LabelValue("54.2.3.4"), + "__meta_ecs_public_ip": model.LabelValue("54.2.3.4"), + "__meta_ecs_tag_ec2_Name": model.LabelValue("mixed-host"), + "__meta_ecs_tag_ec2_Team": model.LabelValue("platform"), + }, 
+ }, + }, + }, + }, + { + name: "EC2WithAwsvpcNetworking", + ecsData: &ecsDataStore{ + region: "us-west-2", + clusters: []ecsTypes.Cluster{ + { + ClusterName: strptr("ec2-awsvpc-cluster"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/ec2-awsvpc-cluster"), + Status: strptr("ACTIVE"), + }, + }, + services: []ecsTypes.Service{ + { + ServiceName: strptr("ec2-awsvpc-service"), + ServiceArn: strptr("arn:aws:ecs:us-west-2:123456789012:service/ec2-awsvpc-cluster/ec2-awsvpc-service"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/ec2-awsvpc-cluster"), + Status: strptr("ACTIVE"), + }, + }, + tasks: []ecsTypes.Task{ + { + TaskArn: strptr("arn:aws:ecs:us-west-2:123456789012:task/ec2-awsvpc-cluster/task-ec2-awsvpc"), + ClusterArn: strptr("arn:aws:ecs:us-west-2:123456789012:cluster/ec2-awsvpc-cluster"), + TaskDefinitionArn: strptr("arn:aws:ecs:us-west-2:123456789012:task-definition/ec2-awsvpc-task:1"), + Group: strptr("service:ec2-awsvpc-service"), + LaunchType: ecsTypes.LaunchTypeEc2, + LastStatus: strptr("RUNNING"), + DesiredStatus: strptr("RUNNING"), + HealthStatus: ecsTypes.HealthStatusHealthy, + AvailabilityZone: strptr("us-west-2c"), + ContainerInstanceArn: strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/ec2-awsvpc-cluster/def456"), + // Has BOTH ENI attachment AND container instance ARN - should use ENI + Attachments: []ecsTypes.Attachment{ + { + Type: strptr("ElasticNetworkInterface"), + Details: []ecsTypes.KeyValuePair{ + {Name: strptr("subnetId"), Value: strptr("subnet-99999")}, + {Name: strptr("privateIPv4Address"), Value: strptr("10.0.3.200")}, + {Name: strptr("networkInterfaceId"), Value: strptr("eni-ec2-awsvpc")}, + }, + }, + }, + }, + }, + eniPublicIPs: map[string]string{ + "eni-ec2-awsvpc": "52.3.4.5", + }, + // Container instance data - IP should NOT be used, but instance type SHOULD be used + containerInstances: []ecsTypes.ContainerInstance{ + { + ContainerInstanceArn: 
strptr("arn:aws:ecs:us-west-2:123456789012:container-instance/ec2-awsvpc-cluster/def456"), + Ec2InstanceId: strptr("i-ec2awsvpcinstance"), + Status: strptr("ACTIVE"), + }, + }, + ec2Instances: map[string]ec2InstanceInfo{ + "i-ec2awsvpcinstance": { + privateIP: "10.0.9.99", // This IP should NOT be used (ENI IP is used instead) + publicIP: "54.3.4.5", // This public IP SHOULD be exposed + subnetID: "subnet-wrong", // This subnet should NOT be used (ENI subnet is used instead) + instanceType: "c5.2xlarge", // This instance type SHOULD be used + tags: map[string]string{ + "Name": "ec2-awsvpc-host", + "Owner": "team-a", + }, + }, + }, + }, + expected: []*targetgroup.Group{ + { + Source: "us-west-2", + Targets: []model.LabelSet{ + { + model.AddressLabel: model.LabelValue("10.0.3.200:80"), + "__meta_ecs_cluster": model.LabelValue("ec2-awsvpc-cluster"), + "__meta_ecs_cluster_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:cluster/ec2-awsvpc-cluster"), + "__meta_ecs_service": model.LabelValue("ec2-awsvpc-service"), + "__meta_ecs_service_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:service/ec2-awsvpc-cluster/ec2-awsvpc-service"), + "__meta_ecs_service_status": model.LabelValue("ACTIVE"), + "__meta_ecs_task_group": model.LabelValue("service:ec2-awsvpc-service"), + "__meta_ecs_task_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task/ec2-awsvpc-cluster/task-ec2-awsvpc"), + "__meta_ecs_task_definition": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:task-definition/ec2-awsvpc-task:1"), + "__meta_ecs_region": model.LabelValue("us-west-2"), + "__meta_ecs_availability_zone": model.LabelValue("us-west-2c"), + "__meta_ecs_ip_address": model.LabelValue("10.0.3.200"), + "__meta_ecs_subnet_id": model.LabelValue("subnet-99999"), + "__meta_ecs_launch_type": model.LabelValue("EC2"), + "__meta_ecs_desired_status": model.LabelValue("RUNNING"), + "__meta_ecs_last_status": model.LabelValue("RUNNING"), + "__meta_ecs_health_status": 
model.LabelValue("HEALTHY"), + "__meta_ecs_network_mode": model.LabelValue("awsvpc"), + "__meta_ecs_container_instance_arn": model.LabelValue("arn:aws:ecs:us-west-2:123456789012:container-instance/ec2-awsvpc-cluster/def456"), + "__meta_ecs_ec2_instance_id": model.LabelValue("i-ec2awsvpcinstance"), + "__meta_ecs_ec2_instance_type": model.LabelValue("c5.2xlarge"), + "__meta_ecs_ec2_instance_private_ip": model.LabelValue("10.0.9.99"), + "__meta_ecs_ec2_instance_public_ip": model.LabelValue("54.3.4.5"), + "__meta_ecs_public_ip": model.LabelValue("52.3.4.5"), + "__meta_ecs_tag_ec2_Name": model.LabelValue("ec2-awsvpc-host"), + "__meta_ecs_tag_ec2_Owner": model.LabelValue("team-a"), + }, + }, + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - client := newMockECSClient(tt.ecsData) + ecsClient := newMockECSClient(tt.ecsData) + ec2Client := newMockECSEC2Client(tt.ecsData.ec2Instances, tt.ecsData.eniPublicIPs) d := &ECSDiscovery{ - ecs: client, + ecs: ecsClient, + ec2: ec2Client, cfg: &ECSSDConfig{ Region: tt.ecsData.region, Port: 80, @@ -951,3 +1293,91 @@ func (m *mockECSClient) DescribeTasks(_ context.Context, input *ecs.DescribeTask Tasks: tasks, }, nil } + +func (m *mockECSClient) DescribeContainerInstances(_ context.Context, input *ecs.DescribeContainerInstancesInput, _ ...func(*ecs.Options)) (*ecs.DescribeContainerInstancesOutput, error) { + var containerInstances []ecsTypes.ContainerInstance + for _, ciArn := range input.ContainerInstances { + for _, ci := range m.ecsData.containerInstances { + if *ci.ContainerInstanceArn == ciArn { + containerInstances = append(containerInstances, ci) + break + } + } + } + + return &ecs.DescribeContainerInstancesOutput{ + ContainerInstances: containerInstances, + }, nil +} + +// Mock EC2 client wrapper for ECS tests. 
+type mockECSEC2Client struct { + ec2Instances map[string]ec2InstanceInfo /* fixture instances, looked up by EC2 instance ID in DescribeInstances */ + eniPublicIPs map[string]string /* looked up by network interface ID in DescribeNetworkInterfaces; the value is the public IP, "" meaning no association */ +} +/* newMockECSEC2Client wraps the two fixture maps in a mockECSEC2Client. The maps are stored as given, not copied. */ +func newMockECSEC2Client(ec2Instances map[string]ec2InstanceInfo, eniPublicIPs map[string]string) *mockECSEC2Client { + return &mockECSEC2Client{ + ec2Instances: ec2Instances, + eniPublicIPs: eniPublicIPs, + } +} +/* DescribeInstances is a test double for the EC2 API call. Only input.InstanceIds is read: for every requested ID present in m.ec2Instances, in request order, it returns one Reservation holding one Instance. IDs missing from the map are skipped and the returned error is always nil. PrivateIpAddress is always set, even when the fixture value is empty; PublicIpAddress, SubnetId and InstanceType are set only for non-empty fixture values. */ +func (m *mockECSEC2Client) DescribeInstances(_ context.Context, input *ec2.DescribeInstancesInput, _ ...func(*ec2.Options)) (*ec2.DescribeInstancesOutput, error) { + var reservations []ec2Types.Reservation + + for _, instanceID := range input.InstanceIds { + if info, ok := m.ec2Instances[instanceID]; ok { + instance := ec2Types.Instance{ + InstanceId: &instanceID, /* NOTE(review): this and the tagKey/tagValue pointers below take the address of range variables; they stay distinct only with per-iteration loop variables (Go 1.22+) - confirm the module's Go version */ + PrivateIpAddress: &info.privateIP, + } + if info.publicIP != "" { + instance.PublicIpAddress = &info.publicIP + } + if info.subnetID != "" { + instance.SubnetId = &info.subnetID + } + if info.instanceType != "" { + instance.InstanceType = ec2Types.InstanceType(info.instanceType) + } + // Add tags + for tagKey, tagValue := range info.tags { /* ranging over a map: the order of the resulting Tags slice is unspecified */ + instance.Tags = append(instance.Tags, ec2Types.Tag{ + Key: &tagKey, + Value: &tagValue, + }) + } + reservation := ec2Types.Reservation{ + Instances: []ec2Types.Instance{instance}, + } + reservations = append(reservations, reservation) + } + } + + return &ec2.DescribeInstancesOutput{ + Reservations: reservations, + }, nil +} +/* DescribeNetworkInterfaces is a test double for the EC2 API call. Only input.NetworkInterfaceIds is read: for every requested ID present in m.eniPublicIPs, in request order, it returns a NetworkInterface carrying that ID. An Association with the public IP is attached only when the mapped value is non-empty. Unknown IDs are skipped and the returned error is always nil. */ +func (m *mockECSEC2Client) DescribeNetworkInterfaces(_ context.Context, input *ec2.DescribeNetworkInterfacesInput, _ ...func(*ec2.Options)) (*ec2.DescribeNetworkInterfacesOutput, error) { + var networkInterfaces []ec2Types.NetworkInterface + + for _, eniID := range input.NetworkInterfaceIds { + if publicIP, ok := m.eniPublicIPs[eniID]; ok { + eni := ec2Types.NetworkInterface{ + NetworkInterfaceId: &eniID, + } + if publicIP != "" { + eni.Association = &ec2Types.NetworkInterfaceAssociation{ + PublicIp: &publicIP, + } + } + networkInterfaces = append(networkInterfaces, eni) + } + } + + return
&ec2.DescribeNetworkInterfacesOutput{ + NetworkInterfaces: networkInterfaces, + }, nil +} diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index a539ee9461..3b71f26fc2 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -919,11 +919,16 @@ The following meta labels are available on targets during [relabeling](#relabel_ #### `ecs` -The `ecs` role discovers targets from AWS ECS containers. The private IP address is used by default, but may be changed to -the public IP address with relabeling. +The `ecs` role discovers targets from AWS ECS containers. -The IAM credentials used must have the following permissions to discover -scrape targets: +ECS service discovery supports all ECS networking modes: +- **awsvpc mode** (Fargate and EC2 with ENI): Uses the task's private IP address from its elastic network interface +- **bridge mode** (EC2): Uses the EC2 host instance's private IP address +- **host mode** (EC2): Uses the EC2 host instance's private IP address + +The private IP address is used by default, but may be changed to the public IP address with relabeling.
+ +The IAM credentials used must have the following permissions to discover scrape targets: - `ecs:ListClusters` - `ecs:DescribeClusters` @@ -931,6 +936,9 @@ scrape targets: - `ecs:DescribeServices` - `ecs:ListTasks` - `ecs:DescribeTasks` +- `ecs:DescribeContainerInstances` (required for EC2 launch type tasks) +- `ec2:DescribeInstances` (required for EC2 launch type tasks) +- `ec2:DescribeNetworkInterfaces` (required to get public IP for awsvpc mode tasks) The following meta labels are available on targets during [relabeling](#relabel_config): @@ -952,9 +960,17 @@ The following meta labels are available on targets during [relabeling](#relabel_ * `__meta_ecs_subnet_id`: the subnet ID where the task is running * `__meta_ecs_availability_zone`: the availability zone where the task is running * `__meta_ecs_region`: the AWS region +* `__meta_ecs_public_ip`: the public IP address (from ENI for awsvpc mode, from EC2 instance for bridge/host mode), if available +* `__meta_ecs_network_mode`: the network mode of the task (awsvpc or bridge) +* `__meta_ecs_container_instance_arn`: the ARN of the container instance (EC2 launch type only) +* `__meta_ecs_ec2_instance_id`: the EC2 instance ID (EC2 launch type only) +* `__meta_ecs_ec2_instance_type`: the EC2 instance type (EC2 launch type only) +* `__meta_ecs_ec2_instance_private_ip`: the private IP address of the EC2 instance (EC2 launch type only) +* `__meta_ecs_ec2_instance_public_ip`: the public IP address of the EC2 instance, if available (EC2 launch type only) * `__meta_ecs_tag_cluster_<tagkey>`: each cluster tag value, keyed by tag name * `__meta_ecs_tag_service_<tagkey>`: each service tag value, keyed by tag name * `__meta_ecs_tag_task_<tagkey>`: each task tag value, keyed by tag name +* `__meta_ecs_tag_ec2_<tagkey>`: each EC2 instance tag value, keyed by tag name (EC2 launch type only) See below for the configuration options for AWS discovery: