Our AWS infrastructure was a mess: changes were made by hand in the console, configuration drifted between environments, and nothing was under version control.

We migrated to Terraform. The result: 100+ resources managed as code, deployment time cut from 2 hours to 10 minutes, and zero configuration drift.

Table of Contents

The Problem

Before Terraform:

  • 100+ AWS resources
  • Manual console changes
  • No version control
  • Configuration drift
  • Deployment: 2 hours
  • Rollback: Impossible

Basic Setup

# main.tf
# Root settings: pins the Terraform CLI and AWS provider versions and keeps
# state in a remote S3 backend.
terraform {
  # Minimum Terraform CLI version accepted by this configuration.
  required_version = ">= 0.13"

  required_providers {
    aws = {
      version = "~> 3.0"
      source  = "hashicorp/aws"
    }
  }

  # Encrypted remote state in S3; the DynamoDB table is used for state locking.
  backend "s3" {
    region         = "us-east-1"
    bucket         = "my-terraform-state"
    key            = "prod/terraform.tfstate"
    dynamodb_table = "terraform-locks"
    encrypt        = true
  }
}

# The AWS region is supplied by the caller through var.aws_region.
provider "aws" {
  region = var.aws_region
}

VPC Module

# modules/vpc/main.tf
# The VPC itself, with DNS resolution and DNS hostnames both enabled.
resource "aws_vpc" "main" {
  cidr_block = var.vpc_cidr

  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Environment = var.environment
    Name        = format("%s-vpc", var.environment)
  }
}

# One public subnet per CIDR in var.public_subnets. The subnet at index i is
# placed in var.availability_zones[i]; instances launched here get a public IP.
resource "aws_subnet" "public" {
  count = length(var.public_subnets)

  vpc_id                  = aws_vpc.main.id
  availability_zone       = var.availability_zones[count.index]
  cidr_block              = var.public_subnets[count.index]
  map_public_ip_on_launch = true

  tags = {
    # count.index is zero-based; names are numbered from 1.
    Name = format("%s-public-%d", var.environment, count.index + 1)
  }
}

# One private subnet per CIDR in var.private_subnets. The subnet at index i is
# placed in var.availability_zones[i]; no public IPs are assigned on launch.
resource "aws_subnet" "private" {
  count = length(var.private_subnets)

  vpc_id            = aws_vpc.main.id
  availability_zone = var.availability_zones[count.index]
  cidr_block        = var.private_subnets[count.index]

  tags = {
    # count.index is zero-based; names are numbered from 1.
    Name = format("%s-private-%d", var.environment, count.index + 1)
  }
}

# Internet gateway attached to the VPC.
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = {
    Name = format("%s-igw", var.environment)
  }
}

# One NAT gateway per public subnet, each bound to the Elastic IP with the
# same index.
resource "aws_nat_gateway" "main" {
  count         = length(var.public_subnets)
  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = {
    Name = "${var.environment}-nat-${count.index + 1}"
  }

  # The NAT gateway does not reference the internet gateway, so Terraform
  # cannot infer this ordering. Declaring it makes the IGW exist before the
  # NAT gateway is created and be destroyed only after it.
  depends_on = [aws_internet_gateway.main]
}

# One VPC-scoped Elastic IP per public subnet, consumed by the NAT gateways.
resource "aws_eip" "nat" {
  count = length(var.public_subnets)
  # `vpc = true` is the form used by the 3.x AWS provider this project pins.
  vpc = true

  tags = {
    Name = "${var.environment}-nat-eip-${count.index + 1}"
  }

  # An EIP may require the VPC's internet gateway to exist before it can be
  # associated. Nothing here references the IGW, so state the dependency
  # explicitly.
  depends_on = [aws_internet_gateway.main]
}

EKS Cluster

# modules/eks/main.tf
# EKS control plane. Both the private and the public API endpoint are enabled.
resource "aws_eks_cluster" "main" {
  name     = format("%s-eks", var.environment)
  version  = var.kubernetes_version
  role_arn = aws_iam_role.cluster.arn

  vpc_config {
    endpoint_public_access  = true
    endpoint_private_access = true
    subnet_ids              = var.subnet_ids
  }

  # Explicit ordering: the cluster role's policy attachments must be in place
  # before the cluster is created.
  depends_on = [
    aws_iam_role_policy_attachment.cluster_policy,
    aws_iam_role_policy_attachment.service_policy,
  ]
}

# Managed node group for the cluster above; sizes and instance types come
# from module variables.
resource "aws_eks_node_group" "main" {
  node_group_name = format("%s-node-group", var.environment)
  cluster_name    = aws_eks_cluster.main.name
  node_role_arn   = aws_iam_role.node.arn

  subnet_ids     = var.subnet_ids
  instance_types = var.instance_types

  scaling_config {
    min_size     = var.min_size
    desired_size = var.desired_size
    max_size     = var.max_size
  }

  # Explicit ordering: the node role's policy attachments must be in place
  # before the node group is created.
  depends_on = [
    aws_iam_role_policy_attachment.node_policy,
    aws_iam_role_policy_attachment.cni_policy,
    aws_iam_role_policy_attachment.registry_policy,
  ]
}

RDS Database

# modules/rds/main.tf
# PostgreSQL instance with encrypted storage, 7-day automated backups and a
# final snapshot taken on destroy.
resource "aws_db_instance" "main" {
  identifier = format("%s-db", var.environment)

  # Engine
  engine         = "postgres"
  engine_version = "12.5"
  instance_class = var.instance_class

  # Storage
  storage_encrypted     = true
  allocated_storage     = var.allocated_storage
  max_allocated_storage = var.max_allocated_storage

  # Initial database and master credentials.
  # NOTE(review): confirm the pinned AWS provider version accepts `db_name`;
  # older provider releases spelled this argument `name`.
  db_name  = var.database_name
  username = var.master_username
  password = var.master_password

  # Networking
  db_subnet_group_name   = aws_db_subnet_group.main.name
  vpc_security_group_ids = [aws_security_group.db.id]

  # Backups and maintenance
  backup_retention_period = 7
  backup_window           = "03:00-04:00"
  maintenance_window      = "mon:04:00-mon:05:00"

  # Take a final snapshot when the instance is destroyed.
  skip_final_snapshot       = false
  final_snapshot_identifier = format("%s-db-final-snapshot", var.environment)

  tags = {
    Environment = var.environment
    Name        = format("%s-db", var.environment)
  }
}

# Subnet group that determines which subnets the database instance may use.
resource "aws_db_subnet_group" "main" {
  subnet_ids = var.subnet_ids
  name       = format("%s-db-subnet-group", var.environment)

  tags = {
    Name = format("%s-db-subnet-group", var.environment)
  }
}

Environment Configuration

# environments/prod/main.tf
# Production network: a 10.0.0.0/16 VPC with two public and two private /24
# subnets spread over two availability zones.
module "vpc" {
  source = "../../modules/vpc"

  environment = "prod"
  vpc_cidr    = "10.0.0.0/16"

  availability_zones = [
    "us-east-1a",
    "us-east-1b",
  ]

  public_subnets = [
    "10.0.1.0/24",
    "10.0.2.0/24",
  ]

  private_subnets = [
    "10.0.10.0/24",
    "10.0.20.0/24",
  ]
}

# Production EKS cluster, placed in the VPC module's private subnets.
module "eks" {
  source = "../../modules/eks"

  environment        = "prod"
  kubernetes_version = "1.18"
  subnet_ids         = module.vpc.private_subnet_ids

  # Node group sizing: 3 to 10 nodes, starting at 3.
  instance_types = ["t3.large"]
  min_size       = 3
  desired_size   = 3
  max_size       = 10
}

# Production database, placed in the VPC module's private subnets.
# Credentials are passed in through variables rather than hard-coded here.
module "rds" {
  source = "../../modules/rds"

  environment    = "prod"
  subnet_ids     = module.vpc.private_subnet_ids
  instance_class = "db.t3.large"

  allocated_storage     = 100
  max_allocated_storage = 1000

  database_name   = "myapp"
  master_username = var.db_username
  master_password = var.db_password
}

Deployment Workflow

#!/bin/bash
# deploy.sh
#
# Plan and apply the Terraform configuration for one environment.
#
# Usage: ./deploy.sh <environment>
#   <environment>  name of a directory under environments/ (e.g. prod)
#
# Exits non-zero if no environment is given, if its directory does not exist,
# or if any terraform step fails.

# -e: stop on the first failing command; -u: treat unset variables as errors;
# pipefail: a pipeline fails if any command in it fails.
set -euo pipefail

# ${1:-} keeps the missing-argument case from tripping `set -u`, so the usage
# message below is what the user sees.
ENVIRONMENT="${1:-}"

if [ -z "$ENVIRONMENT" ]; then
  echo "Usage: ./deploy.sh <environment>"
  exit 1
fi

ENV_DIR="environments/$ENVIRONMENT"

if [ ! -d "$ENV_DIR" ]; then
  echo "Error: environment directory '$ENV_DIR' not found" >&2
  exit 1
fi

# Quoted so names containing spaces or glob characters are handled literally.
cd "$ENV_DIR"

# Clean up: remove the saved plan on every exit path, including failures.
# The plan file can contain sensitive values and must not be left on disk.
trap 'rm -f tfplan' EXIT

# Initialize
terraform init

# Plan
terraform plan -out=tfplan

# Apply exactly the plan that was just produced
terraform apply tfplan

State Management

# backend.tf
# Bucket that stores Terraform state. Versioning keeps earlier state
# revisions, objects are encrypted at rest with AES256, and prevent_destroy
# makes Terraform refuse any plan that would delete the bucket.
resource "aws_s3_bucket" "terraform_state" {
  bucket = "my-terraform-state"

  versioning {
    enabled = true
  }

  server_side_encryption_configuration {
    rule {
      apply_server_side_encryption_by_default {
        sse_algorithm = "AES256"
      }
    }
  }

  lifecycle {
    prevent_destroy = true
  }
}

# State files can contain secrets in plain text (for example database
# passwords passed to resources), so block every form of public access to
# the bucket.
resource "aws_s3_bucket_public_access_block" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

# On-demand DynamoDB table keyed by the string attribute "LockID"; its name
# matches the dynamodb_table configured for the S3 backend.
resource "aws_dynamodb_table" "terraform_locks" {
  name     = "terraform-locks"
  hash_key = "LockID"

  # Pay per request instead of provisioning capacity.
  billing_mode = "PAY_PER_REQUEST"

  attribute {
    type = "S"
    name = "LockID"
  }
}

Results

Deployment:

  • Time: 2h → 10min (-92%)
  • Errors: 50% → 0%
  • Rollback: Impossible → 2min

Infrastructure:

  • Resources managed: 100+
  • Configuration drift: 0
  • Version control: ✅
  • Reproducibility: 100%

Team Productivity:

  • Infrastructure changes: 5x faster
  • Onboarding time: -70%
  • Documentation: Auto-generated

Lessons Learned

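  1. Modules are essential: they make infrastructure reusable across environments
  2. State management is critical: keep state in S3 and lock it with DynamoDB
  3. Plan before apply: reviewing the plan catches errors before they are applied
  4. Version everything: keep infrastructure code in Git
  5. Migrate gradually: don’t rush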

Conclusion

Terraform transformed our infrastructure management: 100+ resources are managed as code, deployments take 10 minutes instead of 2 hours, and configuration drift is gone.

Key takeaways:

  1. Deployment: 2h → 10min (-92%)
  2. Configuration drift: 0
  3. Resources managed: 100+
  4. Rollback: 2min
  5. Team productivity: 5x

Use Terraform. Infrastructure as code works.