AI-Powered Testing: From 60% to 95% Code Coverage in One Week
Our test coverage was stuck at 60%. Writing tests manually was slow and boring. I experimented with GPT-4 to generate tests automatically.
Results: 95% coverage in one week. Found 12 bugs in the process. Here’s how.
Table of Contents
The Problem
Before:
- Test coverage: 60%
- Time to write tests: 40% of development time
- Developer satisfaction: Low (boring work)
- Bugs found by tests: Minimal
Goal: Increase coverage without burning out the team.
Solution: AI-Generated Tests
Use GPT-4 to:
- Generate test cases
- Identify edge cases
- Create test data
- Write assertions
Setup
from openai import OpenAI
# No api_key argument: the client falls back to the OPENAI_API_KEY environment
# variable, so the secret never has to be written into source control.
client = OpenAI()
def generate_tests(code, language="python", model="gpt-4", temperature=0.3):
    """Ask the chat model to write unit tests for a piece of source code.

    Args:
        code: Source code to generate tests for; embedded verbatim in the prompt.
        language: Language name, used in the prompt text and as the fence tag.
        model: Chat model to query.
        temperature: Sampling temperature; kept low for more consistent output.

    Returns:
        The message content of the first completion choice, unmodified.
    """
    # Built from a variable so this file never contains a literal triple
    # backtick, which breaks rendering when the snippet is itself embedded
    # in Markdown.
    fence = "`" * 3
    prompt = (
        f"\nGenerate comprehensive unit tests for this {language} code.\n"
        "Code:\n"
        f"{fence}{language}\n"
        f"{code}\n"
        # Close the fence so the requirements below are not read as part of
        # the code under test.
        f"{fence}\n"
        "Requirements:\n"
        "- Test all functions\n"
        "- Cover edge cases\n"
        "- Include error cases\n"
        "- Use appropriate assertions\n"
        "- Add descriptive test names\n"
        "- Include docstrings\n"
        "Output format: Complete test file with imports "
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content
Example 1: Simple Function
Original Code:
# utils/calculator.py
def calculate_discount(price, discount_percent, user_tier='regular'):
    """Calculate the discounted price for a given user tier.

    The base discount is ``price * discount_percent / 100``; it is then scaled
    by the tier multiplier (regular 1.0, premium 1.2, vip 1.5). Unknown tiers
    use the regular multiplier.

    Args:
        price: Original price; must not be negative.
        discount_percent: Base discount, between 0 and 100 inclusive.
        user_tier: Tier name selecting the multiplier.

    Returns:
        The price after the tier-adjusted discount, never below 0.

    Raises:
        ValueError: If ``price`` is negative or ``discount_percent`` is
            outside the range 0-100.
    """
    if price < 0:
        raise ValueError("Price cannot be negative")
    if discount_percent < 0 or discount_percent > 100:
        raise ValueError("Discount must be between 0 and 100")

    # Tier multipliers
    multipliers = {
        'regular': 1.0,
        'premium': 1.2,
        'vip': 1.5,
    }

    base_discount = price * (discount_percent / 100)
    final_discount = base_discount * multipliers.get(user_tier, 1.0)

    # A multiplier above 1 can push the effective discount past 100%
    # (e.g. 80% for vip -> 120%); clamp so the result is never negative.
    return max(price - final_discount, 0.0)
AI-Generated Tests:
# tests/test_calculator.py
import pytest
from decimal import Decimal
from utils.calculator import calculate_discount
class TestCalculateDiscount:
    """Test suite for calculate_discount.

    Numeric results are compared with ``pytest.approx`` because the function
    works in floating point; exact ``==`` would make the tests depend on
    rounding details of the arithmetic.
    """

    def test_regular_user_basic_discount(self):
        """Regular tier applies the base discount unchanged."""
        result = calculate_discount(100, 10, 'regular')
        assert result == pytest.approx(90.0)

    def test_premium_user_enhanced_discount(self):
        """Premium tier scales the discount by 1.2."""
        result = calculate_discount(100, 10, 'premium')
        assert result == pytest.approx(88.0)  # 10% * 1.2 = 12%

    def test_vip_user_maximum_discount(self):
        """VIP tier scales the discount by 1.5."""
        result = calculate_discount(100, 10, 'vip')
        assert result == pytest.approx(85.0)  # 10% * 1.5 = 15%

    def test_zero_discount(self):
        """A 0% discount leaves the price unchanged."""
        result = calculate_discount(100, 0, 'regular')
        assert result == pytest.approx(100.0)

    def test_maximum_discount(self):
        """A 100% discount for a regular user brings the price to zero."""
        result = calculate_discount(100, 100, 'regular')
        assert result == pytest.approx(0.0)

    def test_negative_price_raises_error(self):
        """A negative price raises ValueError."""
        with pytest.raises(ValueError, match="Price cannot be negative"):
            calculate_discount(-10, 10, 'regular')

    def test_negative_discount_raises_error(self):
        """A negative discount raises ValueError."""
        with pytest.raises(ValueError, match="Discount must be between 0 and 100"):
            calculate_discount(100, -5, 'regular')

    def test_discount_over_100_raises_error(self):
        """A discount above 100 raises ValueError."""
        with pytest.raises(ValueError, match="Discount must be between 0 and 100"):
            calculate_discount(100, 150, 'regular')

    def test_unknown_tier_defaults_to_regular(self):
        """An unknown tier falls back to the regular multiplier."""
        result = calculate_discount(100, 10, 'unknown')
        assert result == pytest.approx(90.0)

    def test_decimal_price(self):
        """A fractional price is discounted proportionally."""
        result = calculate_discount(99.99, 10, 'regular')
        assert result == pytest.approx(89.991)

    def test_decimal_discount(self):
        """A fractional discount percentage is supported."""
        result = calculate_discount(100, 15.5, 'regular')
        assert result == pytest.approx(84.5)

    def test_zero_price(self):
        """A zero price stays zero."""
        result = calculate_discount(0, 10, 'regular')
        assert result == pytest.approx(0.0)
Coverage: 100% (vs 0% before)
Example 2: Complex Class
Original Code:
# services/order_service.py
class OrderService:
    """Order workflow: stores new orders and settles their payments.

    Both collaborators are injected. ``db`` is used through its ``users`` and
    ``orders`` collections (``find_one`` / ``insert_one`` / ``update_one``);
    ``payment_gateway`` is used through ``charge``.
    """

    def __init__(self, db, payment_gateway):
        self.payment_gateway = payment_gateway
        self.db = db

    def create_order(self, user_id, items, shipping_address):
        """Store a pending order for an existing user and return its id.

        Raises:
            ValueError: If no user with ``user_id`` exists.
        """
        # Guard: the user must exist before anything is written.
        if not self.db.users.find_one({"_id": user_id}):
            raise ValueError("User not found")

        # Order total is the sum of price * quantity over all line items.
        order_total = 0
        for line in items:
            order_total = order_total + line['price'] * line['quantity']

        document = {
            "user_id": user_id,
            "items": items,
            "total": order_total,
            "shipping_address": shipping_address,
            "status": "pending",
        }
        insert_result = self.db.orders.insert_one(document)
        return insert_result.inserted_id

    def process_payment(self, order_id, payment_method):
        """Charge a pending order and record the outcome on the order.

        Returns:
            True when the gateway reports success (order marked "paid" with
            the gateway's payment id), False otherwise (order marked "failed").

        Raises:
            ValueError: If the order does not exist or is no longer pending.
        """
        order = self.db.orders.find_one({"_id": order_id})
        if not order:
            raise ValueError("Order not found")
        if order['status'] != 'pending':
            raise ValueError("Order already processed")

        charge = self.payment_gateway.charge(
            amount=order['total'],
            method=payment_method,
        )

        # Both outcomes end in exactly one status update; only the fields differ.
        paid = bool(charge['success'])
        if paid:
            changes = {"status": "paid", "payment_id": charge['id']}
        else:
            changes = {"status": "failed"}
        self.db.orders.update_one({"_id": order_id}, {"$set": changes})
        return paid
AI-Generated Tests (excerpt):
# tests/test_order_service.py
import pytest
from unittest.mock import Mock, MagicMock
from services.order_service import OrderService
@pytest.fixture
def mock_db():
    """Provide a stand-in database exposing mocked ``users`` and ``orders``."""
    # Keyword arguments to Mock are set as attributes on the new mock.
    return Mock(users=Mock(), orders=Mock())
@pytest.fixture
def mock_payment_gateway():
    """Provide a stand-in payment gateway; tests script its ``charge`` result."""
    gateway = Mock()
    return gateway
@pytest.fixture
def order_service(mock_db, mock_payment_gateway):
    """Build the OrderService under test, wired to the mocked collaborators."""
    service = OrderService(db=mock_db, payment_gateway=mock_payment_gateway)
    return service
class TestCreateOrder:
    """Tests for OrderService.create_order."""

    def test_create_order_success(self, order_service, mock_db):
        """A valid user and items produce one stored pending order."""
        # Setup
        mock_db.users.find_one.return_value = {"_id": "user123"}
        mock_db.orders.insert_one.return_value = Mock(inserted_id="order123")
        items = [
            {"price": 10.0, "quantity": 2},
            {"price": 5.0, "quantity": 1},
        ]

        # Execute
        order_id = order_service.create_order("user123", items, "123 Main St")

        # Assert
        assert order_id == "order123"
        mock_db.users.find_one.assert_called_once_with({"_id": "user123"})
        mock_db.orders.insert_one.assert_called_once()

        # Verify the full stored document, not just the computed fields.
        stored = mock_db.orders.insert_one.call_args[0][0]
        assert stored['user_id'] == "user123"
        assert stored['items'] == items
        assert stored['shipping_address'] == "123 Main St"
        assert stored['total'] == 25.0
        assert stored['status'] == 'pending'

    def test_create_order_user_not_found(self, order_service, mock_db):
        """An unknown user raises ValueError and nothing is stored."""
        mock_db.users.find_one.return_value = None

        with pytest.raises(ValueError, match="User not found"):
            order_service.create_order("invalid", [], "address")

        # The validation must fail before any write happens.
        mock_db.orders.insert_one.assert_not_called()

    def test_create_order_empty_items(self, order_service, mock_db):
        """An empty item list still creates an order, with a total of zero."""
        mock_db.users.find_one.return_value = {"_id": "user123"}
        mock_db.orders.insert_one.return_value = Mock(inserted_id="order123")

        order_id = order_service.create_order("user123", [], "address")

        assert order_id == "order123"
        stored = mock_db.orders.insert_one.call_args[0][0]
        assert stored['total'] == 0
        assert stored['items'] == []
class TestProcessPayment:
    """Tests for OrderService.process_payment."""

    def test_process_payment_success(self, order_service, mock_db, mock_payment_gateway):
        """A successful charge marks the order paid and stores the payment id."""
        # Setup
        mock_db.orders.find_one.return_value = {
            "_id": "order123",
            "total": 100.0,
            "status": "pending",
        }
        mock_payment_gateway.charge.return_value = {
            "success": True,
            "id": "payment123",
        }

        # Execute
        result = order_service.process_payment("order123", "credit_card")

        # Assert
        assert result is True
        # The gateway must be charged exactly once, for the order total.
        mock_payment_gateway.charge.assert_called_once_with(
            amount=100.0, method="credit_card"
        )
        mock_db.orders.update_one.assert_called_once_with(
            {"_id": "order123"},
            {"$set": {"status": "paid", "payment_id": "payment123"}},
        )

    def test_process_payment_failure(self, order_service, mock_db, mock_payment_gateway):
        """A declined charge marks the order failed and returns False."""
        mock_db.orders.find_one.return_value = {
            "_id": "order123",
            "total": 100.0,
            "status": "pending",
        }
        mock_payment_gateway.charge.return_value = {"success": False}

        result = order_service.process_payment("order123", "credit_card")

        assert result is False
        mock_payment_gateway.charge.assert_called_once_with(
            amount=100.0, method="credit_card"
        )
        mock_db.orders.update_one.assert_called_once_with(
            {"_id": "order123"},
            {"$set": {"status": "failed"}},
        )

    def test_process_payment_order_not_found(
        self, order_service, mock_db, mock_payment_gateway
    ):
        """An unknown order raises ValueError without charging anything."""
        mock_db.orders.find_one.return_value = None

        with pytest.raises(ValueError, match="Order not found"):
            order_service.process_payment("invalid", "credit_card")

        mock_payment_gateway.charge.assert_not_called()
        mock_db.orders.update_one.assert_not_called()

    def test_process_payment_already_processed(
        self, order_service, mock_db, mock_payment_gateway
    ):
        """A non-pending order raises ValueError and is never charged again."""
        mock_db.orders.find_one.return_value = {
            "_id": "order123",
            "status": "paid",
        }

        with pytest.raises(ValueError, match="Order already processed"):
            order_service.process_payment("order123", "credit_card")

        # Guards against double-charging an order that was already handled.
        mock_payment_gateway.charge.assert_not_called()
        mock_db.orders.update_one.assert_not_called()
Coverage: 100% (vs 30% before)
Automation Script
# scripts/generate_tests.py
import os
import ast
from openai import OpenAI
# Shared API client used by generate_tests_for_file below.
# NOTE(review): no api_key is passed here; presumably the client picks up
# credentials from the environment (e.g. OPENAI_API_KEY) - confirm.
client = OpenAI()
def extract_functions(file_path):
    """Return the names of all functions defined in a Python source file.

    Covers plain and ``async`` definitions at any nesting level (module-level
    functions, methods, nested functions). Names appear in ``ast.walk``
    traversal order.

    Args:
        file_path: Path of the Python file to parse.

    Returns:
        A list of function names; empty if the file defines none.

    Raises:
        SyntaxError: If the file is not valid Python.
    """
    # Python source defaults to UTF-8, so do not rely on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        tree = ast.parse(f.read(), filename=file_path)

    return [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
def _strip_code_fence(text):
    """Drop a Markdown code fence wrapping the whole reply, if there is one."""
    fence = "`" * 3
    lines = text.strip().splitlines()
    if not lines or not lines[0].startswith(fence):
        # No leading fence: leave the reply untouched.
        return text
    body = lines[1:]  # drop the opening fence line (with optional language tag)
    if body and body[-1].strip() == fence:
        body = body[:-1]
    return "\n".join(body) + "\n"


def generate_tests_for_file(file_path):
    """Ask the chat model to write pytest tests for an entire Python file.

    Args:
        file_path: Path of the Python source file to generate tests for.

    Returns:
        The generated test code as a string. A Markdown code fence wrapping
        the whole reply is removed so the text can be written directly to a
        ``.py`` file; an empty string is returned if the reply has no content.
    """
    # Python source defaults to UTF-8, so do not rely on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        code = f.read()

    prompt = (
        "\nGenerate comprehensive pytest tests for this Python file.\n"
        f"{code}\n"
        "Include:\n"
        "- All functions and methods\n"
        "- Edge cases\n"
        "- Error cases\n"
        "- Mocking for external dependencies\n"
        "- Fixtures\n"
        "- Descriptive test names\n"
    )
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    # The message content is nullable in the API response; callers write the
    # result to a file, so always hand back a string.
    content = response.choices[0].message.content or ""
    return _strip_code_fence(content)
def process_directory(src_dir, test_dir):
    """Generate a test file for every non-test Python file under a directory.

    The source tree is mirrored under ``test_dir``: ``src/pkg/mod.py`` becomes
    ``tests/pkg/test_mod.py``. Files whose name already starts with ``test_``
    are skipped. Existing test files at the target path are overwritten.

    Args:
        src_dir: Root directory to scan recursively for ``.py`` files.
        test_dir: Root directory that receives the generated test files.
    """
    for root, _dirs, files in os.walk(src_dir):
        for file in files:
            if not file.endswith('.py') or file.startswith('test_'):
                continue

            src_path = os.path.join(root, file)

            # Generate tests
            print(f"Generating tests for {src_path}...")
            tests = generate_tests_for_file(src_path)

            # Save tests. The "test_" prefix belongs on the file name only;
            # prefixing the whole relative path would turn "pkg/mod.py" into
            # "test_pkg/mod.py".
            rel_dir = os.path.relpath(root, src_dir)
            test_path = os.path.normpath(
                os.path.join(test_dir, rel_dir, f"test_{file}")
            )
            parent = os.path.dirname(test_path)
            if parent:  # empty when the target is the current directory
                os.makedirs(parent, exist_ok=True)
            with open(test_path, 'w', encoding='utf-8') as f:
                f.write(tests)
            print(f"✓ Tests saved to {test_path}")
# Run only when executed as a script, so that importing this module never
# triggers API calls or file writes.
if __name__ == "__main__":
    process_directory('src/', 'tests/')
Results
Week 1 Progress:
| Day | Files Processed | Tests Generated | Coverage |
|---|---|---|---|
| 1 | 10 | 150 | 65% |
| 2 | 15 | 220 | 72% |
| 3 | 20 | 310 | 80% |
| 4 | 12 | 180 | 87% |
| 5 | 8 | 120 | 92% |
| 6-7 | Manual fixes | 50 | 95% |
Bugs Found: 12 (discovered by AI-generated tests!)
Quality Analysis
AI-Generated Tests:
- ✅ Comprehensive coverage
- ✅ Good edge case detection
- ✅ Proper mocking
- ⚠️ Sometimes over-complicated
- ⚠️ Occasional incorrect assertions
Manual Review Required: ~20% of tests needed tweaking
Cost Analysis
AI Costs:
- 500 API calls to GPT-4
- Average: 2000 tokens/call
- Total: ~1M tokens
- Cost: ~$30
Time Saved:
- Manual: 80 hours
- AI + Review: 20 hours
- Saved: 60 hours
ROI: At $100/hour, the 60 hours saved are worth $6,000, for about $30 in API spend.
Lessons Learned
- AI excels at boilerplate - Standard test patterns
- Review is essential - Don’t trust blindly
- Edge cases are good - AI finds cases humans miss
- Mocking needs attention - Sometimes incorrect
- Huge time saver - 75% less time spent (80 hours down to 20)
Conclusion
AI-powered testing is a game-changer. 95% coverage in one week, found 12 bugs, saved 60 hours.
Key takeaways:
- GPT-4 generates comprehensive tests
- 75% time savings
- Better edge case coverage
- Requires manual review (~20%)
- Massive ROI ($30 → $6000 value)
Use AI for test generation. Your future self will thank you.