Complete Solution Architect's Handbook for Database Performance Issues
-- MySQL: Find and kill long-running connections
SELECT id, user, host, db, command, time, state, info
FROM information_schema.processlist
WHERE time > 300 AND command != 'Sleep';
-- Kill specific connection
KILL CONNECTION [connection_id];
-- PostgreSQL: Find and terminate connections
SELECT pid, usename, application_name, client_addr, state, query_start, query
FROM pg_stat_activity
WHERE state = 'active' AND query_start < now() - interval '5 minutes';
-- Terminate specific connection
SELECT pg_terminate_backend([pid]);
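When many sessions are stuck at once, terminating them one pid at a time is slow. Here is a hedged sketch (psycopg2, placeholder credentials) that terminates every backend matching the five-minute filter above in a single statement:

# Terminate every active backend running longer than 5 minutes.
# A sketch assuming psycopg2 and placeholder connection details; adjust
# the interval and filters for your workload before running in production.
import psycopg2

conn = psycopg2.connect(host="your-rds-endpoint", dbname="database",
                        user="username", password="password")
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute("""
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE state = 'active'
          AND pid != pg_backend_pid()
          AND query_start < now() - interval '5 minutes'
    """)
    print(f"Terminated {cur.rowcount} backends")
conn.close()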
If the connection limit itself is the bottleneck, raise the max_connections parameter via the RDS parameter group to allow more clients to connect.
# Modify RDS parameter group
aws rds modify-db-parameter-group \
--db-parameter-group-name your-param-group \
--parameters "ParameterName=max_connections,ParameterValue=500,ApplyMethod=immediate"
# Apply to RDS instance
aws rds modify-db-instance \
--db-instance-identifier your-db-instance \
--db-parameter-group-name your-param-group \
--apply-immediately
# MySQL Parameter Group Settings
aws rds modify-db-parameter-group \
--db-parameter-group-name your-param-group \
--parameters "ParameterName=slow_query_log,ParameterValue=1,ApplyMethod=immediate" \
"ParameterName=long_query_time,ParameterValue=0.2,ApplyMethod=immediate" \
"ParameterName=log_queries_not_using_indexes,ParameterValue=1,ApplyMethod=immediate"
# PostgreSQL Settings
aws rds modify-db-parameter-group \
--db-parameter-group-name your-param-group \
--parameters "ParameterName=log_min_duration_statement,ParameterValue=200,ApplyMethod=immediate"
# Scale up RDS instance immediately
aws rds modify-db-instance \
--db-instance-identifier your-db-instance \
--db-instance-class db.r5.2xlarge \
--apply-immediately
# Monitor scaling progress
aws rds describe-db-instances \
--db-instance-identifier your-db-instance \
--query 'DBInstances[0].DBInstanceStatus'
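For scripted runbooks, boto3 ships a built-in waiter that polls until the instance returns to "available", so you don't have to loop over describe-db-instances yourself. A minimal sketch with a placeholder identifier:

# Block until the resized instance is available again.
import boto3

rds = boto3.client("rds")
waiter = rds.get_waiter("db_instance_available")
waiter.wait(
    DBInstanceIdentifier="your-db-instance",
    WaiterConfig={"Delay": 30, "MaxAttempts": 60},  # poll for up to 30 minutes
)
print("Instance is available again")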
# AWS CLI to get RDS metrics
aws cloudwatch get-metric-statistics \
--namespace AWS/RDS \
--metric-name CPUUtilization \
--dimensions Name=DBInstanceIdentifier,Value=your-db-instance \
--start-time 2024-01-01T00:00:00Z \
--end-time 2024-01-01T23:59:59Z \
--period 300 \
--statistics Average,Maximum
# Enable Performance Insights
aws rds modify-db-instance \
--db-instance-identifier your-db-instance \
--enable-performance-insights \
--performance-insights-retention-period 7
# Query Performance Insights data
aws pi describe-dimension-keys \
--service-type RDS \
--identifier your-resource-id \
--metric db.load.avg \
--start-time 2024-01-01T00:00:00Z \
--end-time 2024-01-01T23:59:59Z \
--group-by Group=db.sql
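The same data is easier to consume programmatically through GetResourceMetrics. A sketch assuming boto3 and a placeholder DbiResourceId (note that Performance Insights takes the resource ID, not the instance name):

# Pull DB load broken down by SQL from Performance Insights.
import boto3
from datetime import datetime, timedelta, timezone

pi = boto3.client("pi")
resp = pi.get_resource_metrics(
    ServiceType="RDS",
    Identifier="db-XXXXXXXXXXXXXXXXXXXXXXXXXX",  # DbiResourceId placeholder
    MetricQueries=[{
        "Metric": "db.load.avg",
        "GroupBy": {"Group": "db.sql", "Limit": 10},
    }],
    StartTime=datetime.now(timezone.utc) - timedelta(hours=1),
    EndTime=datetime.now(timezone.utc),
    PeriodInSeconds=300,
)
for m in resp["MetricList"]:
    print(m["Key"], m["DataPoints"][-1] if m["DataPoints"] else "no data")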
-- Check current connections and processes
SHOW PROCESSLIST;
-- Check for locked tables
SHOW OPEN TABLES WHERE In_use > 0;
-- Check innodb status for deadlocks
SHOW ENGINE INNODB STATUS;
-- Check slow query log location
SHOW VARIABLES LIKE 'slow_query_log_file';
-- Analyze table statistics
SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_ROWS,
ROUND(DATA_LENGTH/1024/1024) AS DataMB,
ROUND(INDEX_LENGTH/1024/1024) AS IndexMB
FROM information_schema.TABLES
WHERE TABLE_SCHEMA NOT IN ('information_schema','mysql','performance_schema')
ORDER BY (DATA_LENGTH + INDEX_LENGTH) DESC;
-- Check active connections and queries
SELECT pid, usename, application_name, client_addr, state,
query_start, now() - query_start AS duration, query
FROM pg_stat_activity
WHERE state = 'active'
ORDER BY duration DESC;
-- Check for blocking queries
SELECT blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity
ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks
ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity
ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted
AND blocking_locks.granted;
-- (join conditions abbreviated from the PostgreSQL wiki lock-monitoring query)
-- Check table and index usage
SELECT schemaname, tablename, attname, n_distinct, correlation
FROM pg_stats
WHERE schemaname NOT IN ('information_schema', 'pg_catalog')
ORDER BY n_distinct DESC;
// Java - HikariCP Configuration
HikariConfig config = new HikariConfig();
config.setJdbcUrl("jdbc:mysql://your-rds-endpoint:3306/database");
config.setUsername("username");
config.setPassword("password");
config.setMaximumPoolSize(20); // Max connections in pool
config.setMinimumIdle(5); // Min idle connections
config.setConnectionTimeout(20000); // 20 seconds
config.setIdleTimeout(300000); // 5 minutes
config.setMaxLifetime(1200000); // 20 minutes
config.setLeakDetectionThreshold(60000); // 60 seconds
HikariDataSource dataSource = new HikariDataSource(config);
// Node.js - Database Pool Configuration
const mysql = require('mysql2');
const pool = mysql.createPool({
  host: 'your-rds-endpoint',
  user: 'username',
  password: 'password',
  database: 'database',
  waitForConnections: true,  // queue requests instead of failing immediately
  connectionLimit: 20,       // max connections in the pool
  queueLimit: 0,             // unbounded request queue
  connectTimeout: 20000,     // 20 seconds to establish a connection
  enableKeepAlive: true      // keep idle sockets alive
});
# Python - SQLAlchemy Connection Pool
from sqlalchemy import create_engine

engine = create_engine(
    'mysql+pymysql://username:password@your-rds-endpoint/database',  # PyMySQL driver
    pool_size=20,        # persistent connections kept in the pool
    max_overflow=30,     # extra connections allowed under burst load
    pool_pre_ping=True,  # validate connections before handing them out
    pool_recycle=3600    # recycle hourly to avoid server-side timeouts
)
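A quick sanity check that the pool behaves as configured: SQLAlchemy's QueuePool exposes a status string, and one round-trip query verifies connectivity. A minimal sketch against the engine above:

# Inspect live pool state (checked-out vs. idle connections).
print(engine.pool.status())

# Exercise the pool once to verify connectivity end to end.
from sqlalchemy import text
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())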
# Gradually reduce slow query threshold
aws rds modify-db-parameter-group \
--db-parameter-group-name your-param-group \
--parameters "ParameterName=long_query_time,ParameterValue=0.05,ApplyMethod=immediate"
# Analyze the slow query log table (requires log_output=TABLE, the RDS MySQL default)
mysql -h your-rds-endpoint -u username -p -e "
SELECT query_time, lock_time, rows_sent, rows_examined, sql_text
FROM mysql.slow_log
WHERE query_time > 0.05
ORDER BY query_time DESC
LIMIT 10;"
Create composite indexes on columns that are frequently queried together (for example, status and created_at), and expression indexes (for example, LOWER(name)) if queries apply those transformations.
-- MySQL: Create indexes based on slow query analysis
-- Composite index for frequently used WHERE clauses
CREATE INDEX idx_user_status_created ON users(status, created_at);
-- Covering index to avoid table lookups
CREATE INDEX idx_order_customer_covering ON orders(customer_id, status, order_date, total_amount);
-- PostgreSQL: Partial indexes for selective conditions
CREATE INDEX CONCURRENTLY idx_active_users ON users(email)
WHERE status = 'active';
-- Expression indexes for computed values
CREATE INDEX CONCURRENTLY idx_user_full_name ON users(LOWER(first_name || ' ' || last_name));
-- Check index usage
SELECT schemaname, relname, indexrelname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;
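The inverse check is just as useful: indexes that are never scanned still slow down writes and consume storage. A hedged sketch (psycopg2, placeholder credentials) listing removal candidates; treat idx_scan = 0 with care, since a recent stats reset or rarely-run reports can both produce false positives:

# List indexes never scanned since the last statistics reset.
import psycopg2

conn = psycopg2.connect(host="your-rds-endpoint", dbname="database",
                        user="username", password="password")
with conn.cursor() as cur:
    cur.execute("""
        SELECT schemaname, relname, indexrelname,
               pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
        FROM pg_stat_user_indexes
        WHERE idx_scan = 0
        ORDER BY pg_relation_size(indexrelid) DESC
    """)
    for row in cur.fetchall():
        print(row)
conn.close()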
# Create read replica
aws rds create-db-instance-read-replica \
--db-instance-identifier mydb-read-replica \
--source-db-instance-identifier mydb-master \
--db-instance-class db.r5.large \
--availability-zone us-west-2b
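Before routing traffic to the new replica, confirm its replication lag is acceptable. A sketch that reads the ReplicaLag CloudWatch metric via boto3; the identifier matches the replica created above:

# Watch ReplicaLag (seconds) on the new read replica.
import boto3
from datetime import datetime, timedelta, timezone

cw = boto3.client("cloudwatch")
resp = cw.get_metric_statistics(
    Namespace="AWS/RDS",
    MetricName="ReplicaLag",
    Dimensions=[{"Name": "DBInstanceIdentifier", "Value": "mydb-read-replica"}],
    StartTime=datetime.now(timezone.utc) - timedelta(minutes=15),
    EndTime=datetime.now(timezone.utc),
    Period=60,
    Statistics=["Average", "Maximum"],
)
for dp in sorted(resp["Datapoints"], key=lambda d: d["Timestamp"]):
    print(dp["Timestamp"], f'{dp["Average"]:.1f}s avg, {dp["Maximum"]:.1f}s max')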
// Java example with read/write splitting
@Service
public class DatabaseService {

    @Autowired
    @Qualifier("masterDataSource")
    private DataSource masterDataSource;

    @Autowired
    @Qualifier("replicaDataSource")
    private DataSource replicaDataSource;

    public void writeData(String data) {
        // Use master for writes
        JdbcTemplate masterTemplate = new JdbcTemplate(masterDataSource);
        masterTemplate.update("INSERT INTO table VALUES (?)", data);
    }

    public List<String> readData() {
        // Use replica for reads
        JdbcTemplate replicaTemplate = new JdbcTemplate(replicaDataSource);
        return replicaTemplate.queryForList("SELECT * FROM table", String.class);
    }
}
# Automated scaling based on metrics
aws application-autoscaling register-scalable-target \
--service-namespace rds \
--scalable-dimension rds:cluster:ReadReplicaCount \
--resource-id cluster:your-aurora-cluster \
--min-capacity 1 \
--max-capacity 15
# Create scaling policy
aws application-autoscaling put-scaling-policy \
--policy-name cpu-scaling-policy \
--service-namespace rds \
--scalable-dimension rds:cluster:ReadReplicaCount \
--resource-id cluster:your-aurora-cluster \
--policy-type TargetTrackingScaling \
--target-tracking-scaling-policy-configuration '{
"TargetValue": 70.0,
"PredefinedMetricSpecification": {
"PredefinedMetricType": "RDSReaderAverageCPUUtilization"
}
}'
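To confirm the policy actually fires, recent scaling activity can be inspected through the Application Auto Scaling API. A minimal boto3 sketch with the cluster resource ID from above:

# Review recent Aurora replica scaling activity.
import boto3

aas = boto3.client("application-autoscaling")
resp = aas.describe_scaling_activities(
    ServiceNamespace="rds",
    ResourceId="cluster:your-aurora-cluster",
)
for activity in resp["ScalingActivities"][:5]:
    print(activity["StartTime"], activity["StatusCode"], activity["Description"])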
# High CPU Utilization Alert
aws cloudwatch put-metric-alarm \
--alarm-name "RDS-High-CPU" \
--alarm-description "RDS CPU utilization is high" \
--metric-name CPUUtilization \
--namespace AWS/RDS \
--statistic Average \
--period 300 \
--threshold 80 \
--comparison-operator GreaterThanThreshold \
--dimensions Name=DBInstanceIdentifier,Value=your-db-instance \
--evaluation-periods 2 \
--alarm-actions arn:aws:sns:us-west-2:123456789012:rds-alerts
# Connection Count Alert
aws cloudwatch put-metric-alarm \
--alarm-name "RDS-High-Connections" \
--alarm-description "RDS connection count is approaching limit" \
--metric-name DatabaseConnections \
--namespace AWS/RDS \
--statistic Average \
--period 300 \
--threshold 400 \
--comparison-operator GreaterThanThreshold \
--dimensions Name=DBInstanceIdentifier,Value=your-db-instance \
--evaluation-periods 1 \
--alarm-actions arn:aws:sns:us-west-2:123456789012:rds-alerts
# Read/Write Latency Alerts
aws cloudwatch put-metric-alarm \
--alarm-name "RDS-High-Read-Latency" \
--alarm-description "RDS read latency is high" \
--metric-name ReadLatency \
--namespace AWS/RDS \
--statistic Average \
--period 300 \
--threshold 0.2 \
--comparison-operator GreaterThanThreshold \
--dimensions Name=DBInstanceIdentifier,Value=your-db-instance \
--evaluation-periods 2 \
--alarm-actions arn:aws:sns:us-west-2:123456789012:rds-alerts
# Create comprehensive RDS dashboard
aws cloudwatch put-dashboard \
--dashboard-name "RDS-Performance-Dashboard" \
--dashboard-body '{
"widgets": [
{
"type": "metric",
"properties": {
"metrics": [
["AWS/RDS", "CPUUtilization", "DBInstanceIdentifier", "your-db-instance"],
[".", "DatabaseConnections", ".", "."],
[".", "ReadLatency", ".", "."],
[".", "WriteLatency", ".", "."]
],
"period": 300,
"stat": "Average",
"region": "us-west-2",
"title": "RDS Key Metrics"
}
},
{
"type": "metric",
"properties": {
"metrics": [
["AWS/RDS", "FreeableMemory", "DBInstanceIdentifier", "your-db-instance"],
[".", "SwapUsage", ".", "."],
[".", "ReadIOPS", ".", "."],
[".", "WriteIOPS", ".", "."]
],
"period": 300,
"stat": "Average",
"region": "us-west-2",
"title": "RDS Resource Utilization"
}
}
]
}'
// Java - Database Connection Pool Monitoring
@Slf4j  // Lombok; or declare an SLF4J Logger explicitly
@Component
public class DatabasePoolMonitor {

    @Autowired
    private HikariDataSource dataSource;

    // Reuse one client rather than creating one per scheduled run
    private final CloudWatchClient cloudWatch = CloudWatchClient.create();

    @Scheduled(fixedRate = 30000) // Every 30 seconds
    public void monitorConnectionPool() {
        HikariPoolMXBean poolMXBean = dataSource.getHikariPoolMXBean();

        // Log key metrics
        log.info("Active Connections: {}", poolMXBean.getActiveConnections());
        log.info("Idle Connections: {}", poolMXBean.getIdleConnections());
        log.info("Total Connections: {}", poolMXBean.getTotalConnections());
        log.info("Threads Awaiting Connection: {}", poolMXBean.getThreadsAwaitingConnection());

        // Send to CloudWatch custom metrics
        PutMetricDataRequest request = PutMetricDataRequest.builder()
            .namespace("Custom/Database")
            .metricData(
                MetricDatum.builder()
                    .metricName("ActiveConnections")
                    .value((double) poolMXBean.getActiveConnections())
                    .timestamp(Instant.now())
                    .build(),
                MetricDatum.builder()
                    .metricName("ThreadsAwaitingConnection")
                    .value((double) poolMXBean.getThreadsAwaitingConnection())
                    .timestamp(Instant.now())
                    .build()
            )
            .build();
        cloudWatch.putMetricData(request);
    }
}
// Node.js - Connection Pool Monitoring
const CloudWatch = require('aws-sdk/clients/cloudwatch');
const cloudwatch = new CloudWatch();

setInterval(() => {
  // mysql2 has no official pool-stats API; these internal fields are
  // undocumented and may change between versions.
  const active = pool._allConnections.length - pool._freeConnections.length;
  const queued = pool._connectionQueue.length;
  const params = {
    Namespace: 'Custom/Database',
    MetricData: [
      {
        MetricName: 'ActiveConnections',
        Value: active,
        Timestamp: new Date()
      },
      {
        MetricName: 'QueuedRequests',
        Value: queued,
        Timestamp: new Date()
      }
    ]
  };
  cloudwatch.putMetricData(params, (err) => {
    if (err) console.error('CloudWatch Error:', err);
  });
}, 30000);
# Lambda function for automated response to RDS alerts
import boto3
import json
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def lambda_handler(event, context):
    rds_client = boto3.client('rds')

    # Parse the CloudWatch alarm out of the SNS message
    message = json.loads(event['Records'][0]['Sns']['Message'])
    alarm_name = message['AlarmName']
    instance_id = message['Trigger']['Dimensions'][0]['value']

    logger.info(f"Processing alarm: {alarm_name} for instance: {instance_id}")

    if 'High-CPU' in alarm_name:
        # Get current instance class
        response = rds_client.describe_db_instances(
            DBInstanceIdentifier=instance_id
        )
        current_class = response['DBInstances'][0]['DBInstanceClass']

        # Scale-up mapping
        scale_up_map = {
            'db.t3.micro': 'db.t3.small',
            'db.t3.small': 'db.t3.medium',
            'db.t3.medium': 'db.t3.large',
            'db.r5.large': 'db.r5.xlarge',
            'db.r5.xlarge': 'db.r5.2xlarge'
        }

        if current_class in scale_up_map:
            new_class = scale_up_map[current_class]
            # Scale up the instance
            rds_client.modify_db_instance(
                DBInstanceIdentifier=instance_id,
                DBInstanceClass=new_class,
                ApplyImmediately=True
            )
            logger.info(f"Scaled up {instance_id} from {current_class} to {new_class}")

    elif 'High-Connections' in alarm_name:
        # Killing long-running connections requires engine-specific logic;
        # notify the team instead
        sns = boto3.client('sns')
        sns.publish(
            TopicArn='arn:aws:sns:us-west-2:123456789012:dba-alerts',
            Message=f'High connection count detected on {instance_id}. Manual intervention may be required.',
            Subject='RDS Connection Alert'
        )

    return {
        'statusCode': 200,
        'body': json.dumps('Alert processed successfully')
    }
#!/bin/bash
# Load testing script using the mysql CLI
DB_ENDPOINT="your-rds-endpoint"
DB_USER="username"
DB_PASSWORD="password"   # prefer sourcing this from a secrets store
TEST_DURATION=300 # 5 minutes
CONCURRENT_USERS=50

echo "Starting RDS Performance Test..."
echo "Endpoint: $DB_ENDPOINT"
echo "Duration: $TEST_DURATION seconds"
echo "Concurrent Users: $CONCURRENT_USERS"

# Test 1: Connection stress test
mysql -h "$DB_ENDPOINT" -u "$DB_USER" -p"$DB_PASSWORD" -e "
CREATE TEMPORARY TABLE load_test AS
SELECT
    CONNECTION_ID() as conn_id,
    NOW() as start_time,
    'connection_test' as test_type;"

# Test 2: Query performance test (100 parallel clients)
for i in {1..100}; do
    mysql -h "$DB_ENDPOINT" -u "$DB_USER" -p"$DB_PASSWORD" -e "
    SELECT COUNT(*) FROM your_large_table
    WHERE indexed_column = 'test_value_$i';" &
done
wait

echo "Performance test completed. Check CloudWatch metrics."
# Python script for comprehensive load testing
import statistics
import threading
import time

from mysql.connector import pooling

class DatabaseLoadTester:
    def __init__(self, host, user, password, database, pool_size=20):
        self.config = {
            'host': host,
            'user': user,
            'password': password,
            'database': database
        }
        self.connection_pool = pooling.MySQLConnectionPool(
            pool_name="test_pool",
            pool_size=pool_size,
            **self.config
        )
        self.results = []

    def execute_test_query(self):
        start_time = time.time()
        try:
            connection = self.connection_pool.get_connection()
            cursor = connection.cursor()
            # Sample test query
            cursor.execute("SELECT COUNT(*) FROM information_schema.tables")
            cursor.fetchone()
            cursor.close()
            connection.close()  # returns the connection to the pool
            query_time = (time.time() - start_time) * 1000  # convert to ms
            self.results.append(query_time)
        except Exception as e:
            print(f"Query failed: {e}")

    def run_load_test(self, num_threads=10, duration_seconds=60):
        threads = []
        start_time = time.time()

        def worker():
            while time.time() - start_time < duration_seconds:
                self.execute_test_query()
                time.sleep(0.1)  # small delay between queries

        # Start threads
        for _ in range(num_threads):
            thread = threading.Thread(target=worker)
            thread.start()
            threads.append(thread)

        # Wait for all threads to complete
        for thread in threads:
            thread.join()

        # Calculate statistics
        if self.results:
            avg_time = statistics.mean(self.results)
            median_time = statistics.median(self.results)
            p95_time = sorted(self.results)[int(0.95 * len(self.results))]
            print("Test Results:")
            print(f"Total Queries: {len(self.results)}")
            print(f"Average Response Time: {avg_time:.2f}ms")
            print(f"Median Response Time: {median_time:.2f}ms")
            print(f"95th Percentile: {p95_time:.2f}ms")

# Usage
tester = DatabaseLoadTester('your-rds-endpoint', 'username', 'password', 'database')
tester.run_load_test(num_threads=20, duration_seconds=300)