diff --git a/Jenkinsfile b/Jenkinsfile index 0401ad0..ce76bab 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,8 +19,6 @@ pipeline { TF_BACKEND_PREFIX = 'ecs/terraform.tfstate' TF_DDB_TABLE = 'nvhi-atsila-locks' - SSH_CRED_ID = 'jenkins-ssh' - // Application variables TF_VAR_cluster_name = 'nvhi-atsila-cluster' TF_VAR_vpc_cidr = '10.0.0.0/16' @@ -78,7 +76,7 @@ pipeline { echo " • Commit: ${gitCommit.take(8)}" echo " • Author: ${gitAuthor}" echo " • Container Registry: ECR (AWS-native, secure)" - echo " • Architecture: Direct ECS access (appropriate for microservice demo)" + echo " • Architecture: SSM-based ECS access (secure, keyless)" echo " • Security Model: Principle of Least Privilege" echo " • Timestamp: ${new Date()}" @@ -91,7 +89,7 @@ pipeline { "git_author": "${gitAuthor}", "infrastructure_files_changed": "${infrastructureFiles}", "container_registry": "ECR", - "architecture": "direct_ecs_access", + "architecture": "ssm_based_ecs_access", "security_model": "principle_of_least_privilege", "timestamp": "${new Date()}" }""" @@ -326,7 +324,7 @@ pipeline { dir('terraform') { script { echo "🚨 SECURITY NOTICE: Infrastructure deployment requested" - echo "🏗️ ARCHITECTURE: Deploying ECS Cluster with direct access (optimal for microservice demo)" + echo "🏗️ ARCHITECTURE: Deploying ECS Cluster with SSM access (secure, keyless)" echo "🔐 In production: This would require infrastructure-admin role" echo "🚀 Attempting infrastructure deployment..." 
@@ -356,271 +354,155 @@ pipeline { stage('Configure & Deploy Application') { parallel { - stage('Configure EC2 Instance') { + stage('Configure EC2 Instance via SSM') { steps { - script { - def ec2_ip = "" - try { - sh "test -d terraform || (echo 'Terraform directory not found' && exit 1)" - ec2_ip = sh( - script: "cd terraform && terraform output -raw ecs_instance_public_ip", + withCredentials([[ + $class: 'AmazonWebServicesCredentialsBinding', + credentialsId: env.AWS_CRED_ID + ]]) { + script { + echo "🔧 ENTERPRISE: Configuring EC2 instance via SSM (no SSH required)" + + // Get instance ID from Terraform output + def instanceId = "" + def ec2_ip = "" + + try { + sh "test -d terraform || (echo 'Terraform directory not found' && exit 1)" + instanceId = sh( + script: "cd terraform && terraform output -raw ecs_instance_id", + returnStdout: true + ).trim() + ec2_ip = sh( + script: "cd terraform && terraform output -raw ecs_instance_public_ip", + returnStdout: true + ).trim() + } catch (Exception e) { + echo "⚠️ Could not get instance details: ${e.getMessage()}" + throw new Exception("ENTERPRISE: Cannot proceed - instance details not available") + } + + echo "📍 Target Instance: ${instanceId} (${ec2_ip})" + + // Wait for SSM agent to be ready + echo "⏳ Waiting for SSM agent to be ready..." + timeout(time: 10, unit: 'MINUTES') { + waitUntil { + script { + def ssmStatus = sh( + script: """ + aws ssm describe-instance-information \\ + --filters "Key=InstanceIds,Values=${instanceId}" \\ + --region ${AWS_REGION} \\ + --query 'InstanceInformationList[0].PingStatus' \\ + --output text 2>/dev/null || echo "Offline" + """, + returnStdout: true + ).trim() + + if (ssmStatus == "Online") { + echo "✅ SSM agent is online" + return true + } else { + echo "⏳ Waiting for SSM agent... (Status: ${ssmStatus})" + sleep(20) + return false + } + } + } + } + + // Configure ECS agent via SSM + echo "🔧 Configuring ECS agent via SSM..." 
+ def commandId = sh( + script: """ + aws ssm send-command \\ + --instance-ids ${instanceId} \\ + --document-name "AWS-RunShellScript" \\ + --parameters 'commands=[ + "echo \\"=== ECS Configuration via SSM ===\\"", + "echo \\"Cluster: ${TF_VAR_cluster_name}\\"", + "echo \\"Time: \$(date)\\"", + "echo \\"Instance: \$(hostname)\\"", + "sudo systemctl status ecs --no-pager", + "sudo systemctl status docker --no-pager", + "curl -s http://localhost:51678/v1/metadata || echo \\"ECS agent not ready\\"", + "sudo systemctl restart ecs", + "sleep 15", + "sudo systemctl status ecs --no-pager", + "curl -s http://localhost:51678/v1/metadata || echo \\"ECS agent still starting\\"", + "echo \\"=== Configuration completed ===\\"" + ]' \\ + --region ${AWS_REGION} \\ + --output text \\ + --query 'Command.CommandId' + """, returnStdout: true ).trim() - } catch (Exception e) { - echo "⚠️ Could not get EC2 IP - terraform output failed: ${e.getMessage()}" - ec2_ip = "unknown" - } - - echo "🔧 ENTERPRISE: Configuring EC2 instance for ECS agent: ${ec2_ip}" - echo "🔐 ARCHITECTURE: Using Jenkins credential store for AWS key pair" - - if (ec2_ip != "unknown") { - // STEP 1: ENTERPRISE INSTANCE READINESS CHECK - echo "🔍 ENTERPRISE: Validating EC2 instance readiness..." + echo "📋 SSM Command ID: ${commandId}" - timeout(time: 15, unit: 'MINUTES') { - waitUntil { - script { - // Test network connectivity - def pingResult = sh( - script: "ping -c 1 -W 5 ${ec2_ip} >/dev/null 2>&1 && echo 'ping_ok' || echo 'ping_failed'", - returnStdout: true - ).trim() - - if (pingResult != "ping_ok") { - echo "⏳ ENTERPRISE: Instance not responding to ping, waiting..." - sleep(20) - return false - } - - // Test SSH port availability - def sshResult = sh( - script: "nc -z -w5 ${ec2_ip} 22 >/dev/null 2>&1 && echo 'ssh_ready' || echo 'ssh_not_ready'", - returnStdout: true - ).trim() - - if (sshResult != "ssh_ready") { - echo "⏳ ENTERPRISE: SSH service not ready, waiting..." 
-                                                sleep(20)
-                                                return false
-                                            }
-
-                                            echo "✅ ENTERPRISE: Instance is ready for SSH connection"
-                                            return true
-                                        }
-                                    }
-                                }
+                                // Wait for completion. The waiter exits non-zero when the command does not succeed or polling runs out, so tolerate that here and let the output and status checks below report the failure
+                                echo "⏳ Waiting for SSM command completion..."
+                                sh """
+                                    aws ssm wait command-executed \\
+                                        --command-id ${commandId} \\
+                                        --instance-id ${instanceId} \\
+                                        --region ${AWS_REGION} || echo "SSM waiter exited non-zero; checking invocation status"
+                                """
 
-                                // STEP 2: ENTERPRISE SSH AUTHENTICATION TEST
-                                echo "🔐 ENTERPRISE: Testing SSH authentication with Jenkins credentials..."
+                                // Get command output (printed even when the command failed, to aid debugging)
+                                echo "📋 SSM Command Output:"
+                                sh """
+                                    aws ssm get-command-invocation \\
+                                        --command-id ${commandId} \\
+                                        --instance-id ${instanceId} \\
+                                        --region ${AWS_REGION} \\
+                                        --query 'StandardOutputContent' \\
+                                        --output text
+                                """
 
-                                def authSuccessful = false
+                                // Read the final invocation status; the check that follows fails the stage on anything but Success
+                                def commandStatus = sh(
+                                    script: """
+                                        aws ssm get-command-invocation \\
+                                            --command-id ${commandId} \\
+                                            --instance-id ${instanceId} \\
+                                            --region ${AWS_REGION} \\
+                                            --query 'Status' \\
+                                            --output text
+                                    """,
+                                    returnStdout: true
+                                ).trim()
 
-                                timeout(time: 5, unit: 'MINUTES') {
-                                    waitUntil {
-                                        script {
-                                            try {
-                                                withCredentials([sshUserPrivateKey(credentialsId: 'jenkins-ssh', keyFileVariable: 'SSH_KEY', usernameVariable: 'SSH_USER')]) {
-                                                    def authTest = sh(
-                                                        script: """
-                                                            ssh -o ConnectTimeout=10 \\
-                                                                -o StrictHostKeyChecking=no \\
-                                                                -o UserKnownHostsFile=/dev/null \\
-                                                                -o BatchMode=yes \\
-                                                                -i \${SSH_KEY} \\
-                                                                ec2-user@${ec2_ip} \\
-                                                                'echo "ENTERPRISE_AUTH_SUCCESS"' 2>/dev/null || echo "auth_failed"
-                                                        """,
-                                                        returnStdout: true
-                                                    ).trim()
-
-                                                    if (authTest.contains("ENTERPRISE_AUTH_SUCCESS")) {
-                                                        echo "✅ ENTERPRISE: SSH authentication successful with Jenkins credentials"
-                                                        authSuccessful = true
-                                                        return true
-                                                    } else {
-                                                        echo "⏳ ENTERPRISE: SSH authentication not ready, retrying..."
-                                                        sleep(15)
-                                                        return false
-                                                    }
-                                                }
-                                            } catch (Exception e) {
-                                                echo "⏳ ENTERPRISE: SSH test failed, retrying... 
(${e.getMessage()})" - sleep(15) - return false - } - } - } - } - - if (authSuccessful) { - - // STEP 3: ENTERPRISE CONFIGURATION DEPLOYMENT - SIMPLIFIED APPROACH - echo "🎯 ENTERPRISE: Deploying ECS configuration via direct SSH (most reliable)..." - - // Skip Ansible entirely - use direct SSH which is more reliable - withCredentials([sshUserPrivateKey(credentialsId: 'jenkins-ssh', keyFileVariable: 'SSH_KEY')]) { - sh """ - ssh -o StrictHostKeyChecking=no \\ - -o UserKnownHostsFile=/dev/null \\ - -o ConnectTimeout=30 \\ - -i \${SSH_KEY} \\ - ec2-user@${ec2_ip} \\ - ' - set -e - echo "=== ENTERPRISE ECS CONFIGURATION STARTING ===" - echo "Target: \$(hostname)" - echo "Cluster: ${TF_VAR_cluster_name}" - echo "Time: \$(date)" - - # Update system packages - echo "📦 Updating system packages..." - sudo yum update -y - - # Install Docker (may already be installed) - echo "🐳 Installing Docker..." - sudo yum install -y docker || echo "Docker already installed" - - # Install ECS initialization - echo "🚀 Installing ECS initialization..." - sudo yum install -y ecs-init - - # Configure ECS cluster settings - echo "⚙️ Configuring ECS cluster settings..." - sudo tee /etc/ecs/ecs.config << EOF -ECS_CLUSTER=${TF_VAR_cluster_name} -ECS_ENABLE_CONTAINER_METADATA=true -ECS_ENABLE_TASK_IAM_ROLE=true -ECS_ENABLE_SPOT_INSTANCE_DRAINING=true -ECS_CONTAINER_STOP_TIMEOUT=30s -ECS_CONTAINER_START_TIMEOUT=3m -ECS_DISABLE_IMAGE_CLEANUP=false -ECS_AVAILABLE_LOGGING_DRIVERS=["json-file","awslogs"] -EOF - - # Start required services - echo "🚀 Starting Docker and ECS services..." - sudo service docker start - sudo start ecs - - # Enable services for auto-start - echo "🔧 Enabling services for auto-start..." - sudo chkconfig docker on - sudo chkconfig ecs on - - # Verify services are running - echo "✅ Verifying service status..." 
- sudo service docker status - sudo service ecs status - - echo "=== ENTERPRISE ECS CONFIGURATION COMPLETED ===" - echo "Instance ready for ECS workloads" - ' - """ - } - echo "✅ ENTERPRISE: Direct SSH configuration completed successfully" - - // STEP 4: POST-CONFIGURATION VALIDATION - echo "🔍 ENTERPRISE: Performing post-configuration validation..." - - withCredentials([sshUserPrivateKey(credentialsId: 'jenkins-ssh', keyFileVariable: 'SSH_KEY')]) { - sh """ - ssh -o StrictHostKeyChecking=no \\ - -o ConnectTimeout=10 \\ - -i \${SSH_KEY} \\ - ec2-user@${ec2_ip} \\ - ' - echo "=== ENTERPRISE VALIDATION REPORT ===" - echo "Instance: \$(hostname)" - echo "Date: \$(date)" - echo "" - - echo "Docker Service Status:" - sudo service docker status || echo "Docker service check failed" - echo "" - - echo "ECS Service Status:" - sudo service ecs status || echo "ECS service check failed" - echo "" - - echo "ECS Configuration:" - cat /etc/ecs/ecs.config || echo "ECS config file not found" - echo "" - - echo "ECS Agent Metadata (if available):" - timeout 10 curl -s http://localhost:51678/v1/metadata 2>/dev/null | head -10 || echo "ECS metadata not yet available" - echo "" - - echo "=== VALIDATION COMPLETED ===" - ' - """ - } - - echo "✅ ENTERPRISE: Configuration and validation completed successfully" - - } else { - // ENTERPRISE DIAGNOSTICS FOR FAILED AUTHENTICATION - echo "❌ ENTERPRISE: SSH authentication failed - collecting diagnostics..." 
- + if (commandStatus != "Success") { + echo "❌ SSM Command failed with status: ${commandStatus}" + // Get error output sh """ - echo "=== ENTERPRISE DIAGNOSTIC REPORT ===" - echo "Authentication Method: Jenkins Credential Store" - echo "Credential ID: jenkins-ssh" - echo "Target Instance: ${ec2_ip}" - echo "Expected User: ec2-user" - echo "Jenkins Server: \$(hostname)" - echo "Time: \$(date)" - echo "" - - echo "=== Network Connectivity Tests ===" - echo "Ping Test:" - ping -c 3 ${ec2_ip} || echo "Ping failed" - echo "" - - echo "SSH Port Test:" - nc -z -v ${ec2_ip} 22 || echo "SSH port not accessible" - echo "" - - echo "=== AWS Instance Information ===" - aws ec2 describe-instances \\ - --filters "Name=ip-address,Values=${ec2_ip}" \\ - --query 'Reservations[*].Instances[*].[InstanceId,State.Name,KeyName,LaunchTime]' \\ - --output table 2>/dev/null || echo "Could not retrieve instance information" - echo "" - - echo "=== Security Group Analysis ===" - INSTANCE_SG=\$(aws ec2 describe-instances \\ - --filters "Name=ip-address,Values=${ec2_ip}" \\ - --query 'Reservations[*].Instances[*].SecurityGroups[0].GroupId' \\ - --output text 2>/dev/null || echo "unknown") - - if [ "\$INSTANCE_SG" != "unknown" ]; then - echo "Instance Security Group: \$INSTANCE_SG" - aws ec2 describe-security-groups \\ - --group-ids \$INSTANCE_SG \\ - --query 'SecurityGroups[*].IpPermissions[*]' \\ - --output table 2>/dev/null || echo "Could not retrieve security group rules" - else - echo "Could not determine instance security group" - fi - echo "" - - echo "=== TROUBLESHOOTING RECOMMENDATIONS ===" - echo "1. Verify Jenkins credential 'jenkins-ssh' contains correct private key" - echo "2. Confirm AWS key pair 'nvhi-atsila-deployer' matches Jenkins credential" - echo "3. Check security group allows SSH (port 22) from Jenkins server IP: 38.110.1.139" - echo "4. Ensure EC2 instance has completed initialization" - echo "5. 
Verify IAM permissions for EC2 operations" - - echo "=== END DIAGNOSTIC REPORT ===" + echo "Error Output:" + aws ssm get-command-invocation \\ + --command-id ${commandId} \\ + --instance-id ${instanceId} \\ + --region ${AWS_REGION} \\ + --query 'StandardErrorContent' \\ + --output text """ - - throw new Exception("ENTERPRISE: SSH authentication failed - see diagnostic report for troubleshooting") + throw new Exception("SSM configuration command failed") } - } else { - throw new Exception("ENTERPRISE: Cannot proceed - EC2 instance IP address not available") + echo "✅ ENTERPRISE: EC2 instance configured via SSM successfully" + echo """ + 🔐 SSM Session Manager Access: + + To connect to the instance for troubleshooting: + + aws ssm start-session \\ + --target ${instanceId} \\ + --region ${AWS_REGION} + + Instance ID: ${instanceId} + Instance IP: ${ec2_ip} + """ } } } @@ -661,7 +543,7 @@ EOF {"name": "GIT_COMMIT", "value": "${gitCommitHash}"}, {"name": "DEPLOYMENT_TIME", "value": "${new Date().format('yyyy-MM-dd HH:mm:ss')}"}, {"name": "CONTAINER_REGISTRY", "value": "ECR"}, - {"name": "ARCHITECTURE", "value": "direct_ecs_access"} + {"name": "ARCHITECTURE", "value": "ssm_based_ecs_access"} ] }]""" @@ -677,15 +559,15 @@ EOF --region ${AWS_REGION} """ - // FIXED: Check if service exists and create/update accordingly + // Check if service exists and create/update accordingly def serviceExists = sh( - script: ''' - if aws ecs describe-services --cluster nvhi-atsila-cluster --services nvhi-atsila-cluster-service --region us-east-2 2>/dev/null | grep -q "ACTIVE"; then + script: """ + if aws ecs describe-services --cluster ${TF_VAR_cluster_name} --services ${TF_VAR_cluster_name}-service --region ${AWS_REGION} 2>/dev/null | grep -q "ACTIVE"; then echo "true" else echo "false" fi - ''', + """, returnStdout: true ).trim() @@ -713,14 +595,16 @@ EOF """ } - sh """ - # Wait for deployment to stabilize with security monitoring - echo "⏳ Waiting for secure service deployment to 
stabilize..." - aws ecs wait services-stable \\ - --cluster ${TF_VAR_cluster_name} \\ - --services ${TF_VAR_cluster_name}-service \\ - --region ${AWS_REGION} - """ + // Wait for deployment with better timeout handling + echo "⏳ Waiting for secure service deployment to stabilize..." + timeout(time: 10, unit: 'MINUTES') { + sh """ + aws ecs wait services-stable \\ + --cluster ${TF_VAR_cluster_name} \\ + --services ${TF_VAR_cluster_name}-service \\ + --region ${AWS_REGION} + """ + } echo "✅ SECURITY: Application deployed successfully with ECR integration" } @@ -747,7 +631,7 @@ EOF } echo "🏥 SECURITY: Running health validation on http://${ec2_ip}:8080/health" - echo "🔗 ARCHITECTURE: Direct access appropriate for microservice demonstration" + echo "🔗 ARCHITECTURE: Direct access with SSM management (secure and efficient)" if (ec2_ip != "unknown") { timeout(time: 5, unit: 'MINUTES') { @@ -804,7 +688,7 @@ EOF echo "🛡️ SECURITY: Validating network security and access controls..." echo " Testing only allowed ports are accessible" echo " Verifying ECR integration working correctly" - echo " Confirming direct access security model" + echo " Confirming SSM-based access security model" echo "✅ SECURITY: All smoke tests and security validations passed" """ @@ -823,7 +707,7 @@ EOF script { echo "📊 SECURITY: Collecting deployment artifacts and performing secure cleanup..." 
- // Archive comprehensive deployment artifacts for audit (skip ansible/hosts since we're not creating it) + // Archive comprehensive deployment artifacts for audit archiveArtifacts artifacts: 'deployment-audit.json,task-definition.json', allowEmptyArchive: true // Secure workspace cleanup @@ -836,6 +720,7 @@ EOF success { script { def ec2_ip = "" + def instanceId = "" def gitCommitHash = "" try { sh "test -d terraform || echo 'Terraform directory not found'" @@ -843,29 +728,39 @@ EOF script: "cd terraform && terraform output -raw ecs_instance_public_ip 2>/dev/null || echo 'unknown'", returnStdout: true ).trim() + instanceId = sh( + script: "cd terraform && terraform output -raw ecs_instance_id 2>/dev/null || echo 'unknown'", + returnStdout: true + ).trim() gitCommitHash = sh(script: 'git rev-parse HEAD 2>/dev/null || echo "unknown"', returnStdout: true).trim().take(8) } catch (Exception e) { ec2_ip = "unknown" + instanceId = "unknown" gitCommitHash = "unknown" } - echo "🎉 OPTIMAL ARCHITECTURE DEPLOYMENT SUCCESSFUL!" + echo "🎉 SSM-BASED SECURE DEPLOYMENT SUCCESSFUL!" 
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - echo "📋 DEPLOYMENT SUMMARY (OPTIMIZED FOR INTERVIEW):" + echo "📋 DEPLOYMENT SUMMARY (SSM-OPTIMIZED FOR SECURITY):" echo " • Container Registry: ECR (AWS-native, secure) ✅" - echo " • Architecture: Direct ECS access (appropriate for microservice) ✅" - echo " • Infrastructure: ECS + VPC + Security Groups (cost-optimized) ✅" + echo " • Architecture: SSM-based ECS access (keyless, secure) ✅" + echo " • Infrastructure: ECS + VPC + Security Groups (SSM-enabled) ✅" echo " • Application Version: ${IMAGE_TAG}" echo " • Application URL: http://${ec2_ip}:8080" echo " • Health Endpoint: http://${ec2_ip}:8080/health" echo " • ECR Image: ${env.AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPO}:${IMAGE_TAG}" - echo " • Security Compliance: ✅ PASSED" + echo " • Security Compliance: ✅ PASSED (No SSH keys required)" echo " • Git Commit: ${gitCommitHash}" - echo " • Deployment Method: Jenkins + Terraform + Direct SSH ✅" + echo " • Deployment Method: Jenkins + Terraform + SSM ✅" + echo " • Instance Access: SSM Session Manager (${instanceId}) ✅" echo " • Cost Optimization: Free tier friendly ✅" + echo "" + echo "🔐 SSM ACCESS COMMANDS:" + echo " • Connect to instance: aws ssm start-session --target ${instanceId} --region ${AWS_REGION}" + echo " • View logs: aws ssm send-command --instance-ids ${instanceId} --document-name AWS-RunShellScript --parameters 'commands=[\"tail -f /var/log/ecs/ecs-agent.log\"]'" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - currentBuild.description = "✅ ECR | Direct Access | ${IMAGE_TAG} | ${ec2_ip}" + currentBuild.description = "✅ SSM | ECR | ${IMAGE_TAG} | ${ec2_ip}" } } @@ -875,6 +770,11 @@ EOF echo "🔍 Check the logs for issues with ECR authentication or ECS deployment" echo "💡 Security audit trail: deployment-audit.json" echo "🔒 State backup available: secure-state-backup-${BUILD_NUMBER}.json" + echo "" + echo "🔧 SSM 
TROUBLESHOOTING:"
+                echo " • If instance is available, connect via: aws ssm start-session --target <instance-id> --region ${AWS_REGION}"
+                echo " • Check SSM agent status on instance: sudo systemctl status amazon-ssm-agent"
+                echo " • Verify IAM permissions include AmazonSSMManagedInstanceCore policy"
 
                 currentBuild.description = "❌ Failed: ${env.DEPLOYMENT_TYPE} | ${env.STAGE_NAME}"
             }
diff --git a/terraform/main.tf b/terraform/main.tf
index 40a83ea..1ecc46d 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -57,20 +57,13 @@ resource "aws_route_table_association" "public" {
   route_table_id = aws_route_table.public.id
 }
 
-# Security Group
+# Security Group - Updated for SSM (removed SSH, kept application access)
 resource "aws_security_group" "ecs_sg" {
   name        = "${var.cluster_name}-sg"
-  description = "Allow SSH & HTTP to ECS"
+  description = "Allow HTTP to ECS and HTTPS outbound for SSM/ECR"
   vpc_id      = aws_vpc.main.id
 
-  ingress {
-    description = "SSH from Jenkins"
-    from_port   = 22
-    to_port     = 22
-    protocol    = "tcp"
-    cidr_blocks = [var.jenkins_ip_cidr]
-  }
-
+  # HTTP access for application
   ingress {
     description = "HTTP from anywhere"
     from_port   = 8080
@@ -79,11 +72,30 @@ resource "aws_security_group" "ecs_sg" {
     cidr_blocks = ["0.0.0.0/0"]
   }
 
+  # HTTPS outbound for SSM, ECR, and AWS services
   egress {
-    description = "All outbound traffic"
-    from_port   = 0
-    to_port     = 0
-    protocol    = "-1"
+    description = "HTTPS outbound for AWS services"
+    from_port   = 443
+    to_port     = 443
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  # HTTP outbound for package updates
+  egress {
+    description = "HTTP outbound for package updates"
+    from_port   = 80
+    to_port     = 80
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  # DNS resolution
+  egress {
+    description = "DNS resolution"
+    from_port   = 53
+    to_port     = 53
+    protocol    = "udp"
     cidr_blocks = ["0.0.0.0/0"]
   }
 
@@ -92,7 +104,7 @@ resource "aws_security_group" "ecs_sg" {
   }
 }
 
-# Key Pair
+# Key Pair (keeping for compatibility, but not needed for 
SSM) resource "aws_key_pair" "deployer" { key_name = var.key_pair_name public_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDDFBAOogBj/GHKXQs6FLROGQfXkZe2uKbRron0We7ZOLgt6e1bI7U8IMe+DIH250CHSi4R5DBYFQF5Bk1TkS5cgMtPIAb87vRUGI3sLs29DQA/kllYiZlQi9ejxcEz2+TRWn10Q/Kltlb6ESNLnnnTsIUUxKUeY3MKFFd+V13FleSVLGYondwPWYwD/XJ6a3VwSTJ1wFKO+lpKknSjDl2ZOgYpWFALPH+EwMlRGVMrUXAB604zqR1XOzYXAAWnhmmC9IGgCzU/5JnEgFyhfZbR3kpEH8SmSXahvdFZERp+3j9d3ROjchqnf0Z0zZ7vzX+G+jvzT/jGOkzH9tx0/OqIO9f47OFF8iUfZgUtJU1QGbepdsmQqognhxfJQfMZbVtKUw7zt+mzJz3A0XcRp7IwVHaqJ2QW2dpXi4UbWtejtZqROg6byWq2FpvFGNIT3eiKTf+EpCoOec6YGSrRQlj73Ob0+FhmsyQ6e8KKncaRYx38PqtnWsI3UnLtdKmEJmDBPI0ipxJzmKJKtb0vtJPVYvFEpgiXSwnDX883rAUQrXR/EhOMmbMwk7JSes6/GXH9rWN10JHh1/i1LLpl+rg6VyktFgVBHzVw++y29QSfFixeTvFkkTS5kl//CpKd1GDQb9ZBH6SPgkgOjmASPUo+p5e/NiN/SIBSpYpMjOKs7Q== jacques@Xochiquetzal" @@ -140,12 +152,18 @@ resource "aws_iam_role" "ecs_instance_role" { } } -# IAM Role Policy Attachment +# IAM Role Policy Attachment for ECS resource "aws_iam_role_policy_attachment" "ecs_instance_role_policy" { role = aws_iam_role.ecs_instance_role.name policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" } +# IAM Role Policy Attachment for SSM +resource "aws_iam_role_policy_attachment" "ecs_instance_ssm_policy" { + role = aws_iam_role.ecs_instance_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + # IAM Instance Profile resource "aws_iam_instance_profile" "ecs_instance_profile" { name = "${var.cluster_name}-ecs-instance-profile" @@ -170,17 +188,11 @@ resource "aws_ecs_cluster" "main" { } } -# User data script for ECS instance +# User data script for ECS instance with SSM locals { - user_data = base64encode(<<-EOF - #!/bin/bash - yum update -y - yum install -y ecs-init - echo ECS_CLUSTER=${var.cluster_name} >> /etc/ecs/ecs.config - service docker start - start ecs - EOF - ) + user_data = base64encode(templatefile("${path.module}/user_data.sh", { + cluster_name = var.cluster_name + 
}))
 }
 
 # EC2 Instance for ECS
@@ -241,6 +253,11 @@ output "ecs_instance_public_ip" {
   value       = aws_instance.ecs_instance.public_ip
 }
 
+output "ecs_instance_id" {
+  description = "Instance ID for SSM access"
+  value       = aws_instance.ecs_instance.id
+}
+
 output "ecs_cluster_name" {
   description = "Name of the ECS cluster"
   value       = aws_ecs_cluster.main.name
diff --git a/terraform/user_data.sh b/terraform/user_data.sh
new file mode 100644
index 0000000..d0db0fc
--- /dev/null
+++ b/terraform/user_data.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# EC2 user data for the ECS container instance, rendered by Terraform's
+# templatefile() with the cluster_name variable substituted before boot.
+# Starts the SSM agent, writes /etc/ecs/ecs.config, starts Docker and the ECS
+# agent, then logs service status. Output is mirrored to
+# /var/log/user-data.log, syslog (tag "user-data") and the console.
+exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
+
+echo "=== Starting EC2 User Data Script ==="
+echo "Timestamp: $(date)"
+# NOTE(review): plain metadata GET without a session token; it prints an empty
+# ID if the instance enforces IMDSv2 - confirm the instance metadata options.
+echo "Instance ID: $(curl -s http://169.254.169.254/latest/meta-data/instance-id)"
+echo "Cluster Name: ${cluster_name}"
+
+# Update system
+echo "=== Updating system packages ==="
+yum update -y
+
+# Install and configure SSM agent (should already be installed on Amazon Linux 2)
+echo "=== Configuring SSM Agent ==="
+yum install -y amazon-ssm-agent
+systemctl enable amazon-ssm-agent
+systemctl start amazon-ssm-agent
+
+# Install ECS agent
+echo "=== Installing ECS Agent ==="
+yum install -y ecs-init
+
+# Configure ECS cluster
+echo "=== Configuring ECS Cluster ==="
+cat > /etc/ecs/ecs.config << EOF
+ECS_CLUSTER=${cluster_name}
+ECS_ENABLE_LOGGING=true
+ECS_LOGLEVEL=info
+ECS_ENABLE_CONTAINER_METADATA=true
+ECS_ENABLE_TASK_IAM_ROLE=true
+ECS_AVAILABLE_LOGGING_DRIVERS=["json-file","awslogs"]
+ECS_CONTAINER_STOP_TIMEOUT=30s
+ECS_CONTAINER_START_TIMEOUT=3m
+ECS_DISABLE_IMAGE_CLEANUP=false
+EOF
+
+# Start Docker and ECS
+echo "=== Starting Docker and ECS services ==="
+systemctl enable docker
+systemctl start docker
+# Queue the ECS start instead of blocking on it: AWS guidance for user data is
+# to use --no-block, because a blocking start can hang when the ecs unit is
+# ordered after the cloud-init stage that is running this script.
+systemctl enable ecs
+systemctl start --no-block ecs
+
+# Wait for services to be ready
+echo "=== Waiting for services to initialize ==="
+sleep 30
+
+# Verify services (informational only: the ECS agent may not be up until this
+# script has finished, so a failed check is logged and never aborts the boot)
+echo "=== Service Status Check ==="
+echo "SSM Agent Status:"
+systemctl status amazon-ssm-agent --no-pager || echo "SSM agent status check failed"
+
+echo "Docker Status:"
+systemctl status docker --no-pager || echo "Docker status check failed"
+
+echo "ECS Status:"
+systemctl status ecs --no-pager || echo "ECS status check failed"
+
+# Check ECS agent connection
+echo "=== ECS Agent Status ==="
+for i in {1..5}; do
+  if curl -s http://localhost:51678/v1/metadata; then
+    echo "ECS agent is responding"
+    break
+  else
+    echo "ECS agent not ready yet, attempt $i/5"
+    sleep 10
+  fi
+done
+
+echo "=== User Data Script Completed ==="
+echo "Timestamp: $(date)"
\ No newline at end of file