Production Best Practices
Deploy and monitor AI agents in production safely and effectively, with enterprise-grade reliability and performance.
Overview
Deploying AI agents to production requires careful planning, monitoring, and operational excellence. This guide covers:
- Pre-Production Checklist - Ensure readiness for production
- Production Configuration - Optimize for production environments
- Monitoring in Production - Real-time production monitoring
- Incident Response - Handle production incidents effectively
- Scaling Strategies - Scale your AI operations safely
Pre-Production Checklist
1. Infrastructure Readiness
class ProductionReadinessCheck:
    """Runs a fixed set of infrastructure probes before a production rollout."""

    def __init__(self):
        # The API key is read from the environment rather than hard-coded.
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def check_infrastructure_readiness(self):
        """Run every probe and return a list of ``(label, status)`` tuples.

        A probe passes when its client call returns without raising; the
        value it returns is not inspected. Any exception is recorded as a
        failure together with its message, and the remaining probes still run.
        """
        # Lambdas defer the attribute lookups so that a missing client
        # attribute is caught by the same handler as a failing call.
        probes = (
            ("API Connectivity", lambda: self.ants.health.check()),
            ("Monitoring Setup", lambda: self.ants.metrics.get_system_metrics()),
            ("Alerting Configuration", lambda: self.ants.alerts.list()),
            ("Backup Systems", lambda: self.ants.backup.check_status()),
        )
        outcomes = []
        for label, probe in probes:
            try:
                probe()
            except Exception as exc:
                outcomes.append((label, f"❌ FAIL: {exc}"))
            else:
                outcomes.append((label, "✅ PASS"))
        return outcomes

    def print_readiness_report(self):
        """Print one line per probe followed by an overall verdict."""
        print("🚀 Production Readiness Report")
        print("=" * 50)
        failures = 0
        for label, status in self.check_infrastructure_readiness():
            print(f"{label}: {status}")
            # A failed probe is recognised by the cross mark in its status.
            if "❌" in status:
                failures += 1
        if not failures:
            print("\n✅ All checks passed. Ready for production deployment!")
        else:
            print(f"\n❌ {failures} checks failed. Fix before deploying to production.")

# 2. Security and Compliance
// Outcome of a single security probe; `error` is only present on failure.
interface SecurityCheckResult {
  check: string
  status: 'PASS' | 'FAIL'
  error?: string
}

/**
 * Runs the security and compliance probes that gate a production rollout.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised before using the class.
 */
class ProductionSecurityCheck {
  private ants: AgenticAnts

  /**
   * Runs each probe in order and records PASS when the call resolves and
   * FAIL (with the error text) when it rejects. A failing probe does not
   * stop the remaining probes.
   *
   * @returns one result per probe, in the order the probes were run
   */
  async checkSecurityReadiness() {
    const probes: Array<[string, () => Promise<unknown>]> = [
      ['API Key Security', () => this.ants.security.validateApiKey()],
      ['Data Encryption', () => this.ants.security.checkEncryption()],
      ['Access Controls', () => this.ants.security.checkAccessControls()],
      ['Compliance Requirements', () => this.ants.compliance.checkStatus()]
    ]

    const securityChecks: SecurityCheckResult[] = []
    for (const [check, probe] of probes) {
      try {
        await probe()
        securityChecks.push({ check, status: 'PASS' })
      } catch (error) {
        // A caught value is `unknown` under strict settings and need not be
        // an Error, so narrow before reading `.message`.
        const message = error instanceof Error ? error.message : String(error)
        securityChecks.push({ check, status: 'FAIL', error: message })
      }
    }
    return securityChecks
  }

  /** Prints one line per probe, plus the error text for each failed probe. */
  async printSecurityReport() {
    console.log('🔒 Production Security Report')
    console.log('='.repeat(50))
    const checks = await this.checkSecurityReadiness()
    for (const { check, status, error } of checks) {
      const icon = status === 'PASS' ? '✅' : '❌'
      console.log(`${check}: ${icon} ${status}`)
      if (error) {
        console.log(` Error: ${error}`)
      }
    }
  }
}

// 3. Performance Baseline
class ProductionPerformanceBaseline:
    """Measures latency, throughput and error-rate baselines before go-live."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    @staticmethod
    def _percentile(ordered, pct):
        """Return the nearest-rank percentile of an ascending, non-empty list.

        Nearest-rank picks the value at 1-based rank ``ceil(pct / 100 * n)``,
        so the p95 of 100 samples is the 95th smallest value, not the 96th.
        """
        rank = int(-(-pct * len(ordered) // 100))  # ceil(pct * n / 100)
        # Clamp so pct <= 0 or pct > 100 still maps to a valid element.
        rank = min(max(rank, 1), len(ordered))
        return ordered[rank - 1]

    def establish_performance_baseline(self, samples=100):
        """Time repeated agent calls and collect baseline metrics.

        Args:
            samples: Number of timed calls to make (default 100, as before).

        Returns:
            A dict with ``avg_response_time``, ``p95_response_time`` and
            ``p99_response_time`` (all in seconds), ``max_throughput`` and
            ``baseline_error_rate``.

        Raises:
            ValueError: If ``samples`` is less than 1.

        NOTE(review): ``simulate_agent_call`` is not defined in this snippet;
        it must be supplied on the class or instance before this runs.
        """
        if samples < 1:
            raise ValueError("samples must be at least 1")

        response_times = []
        for _ in range(samples):
            # perf_counter is monotonic, so a system clock adjustment cannot
            # produce a negative or inflated interval.
            started = time.perf_counter()
            self.simulate_agent_call()
            response_times.append(time.perf_counter() - started)

        ordered = sorted(response_times)  # sort once, reuse for both percentiles
        return {
            "avg_response_time": sum(response_times) / len(response_times),
            "p95_response_time": self._percentile(ordered, 95),
            "p99_response_time": self._percentile(ordered, 99),
            "max_throughput": self.test_throughput(),
            "baseline_error_rate": self.test_error_rate(),
        }

    def test_throughput(self):
        """Return the maximum throughput in requests per second (placeholder)."""
        # Implement throughput testing
        return 100  # requests per second

    def test_error_rate(self):
        """Return the baseline error rate as a fraction (placeholder)."""
        # Implement error rate testing
        return 0.01  # 1% error rate

# Production Configuration
1. Environment Configuration
class ProductionConfiguration:
    """Applies production settings and alert rules through the AgenticAnts client."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def configure_production_environment(self):
        """Switch the client to production mode and apply the standard settings.

        Order of operations: production mode, sampling rate, data retention,
        alerting, backup. Prints a confirmation line when all calls return.
        """
        client = self.ants
        settings = client.config

        settings.set_production_mode(True)
        settings.set_sampling_rate(1.0)  # 100% sampling in production

        retention_policy = {
            "traces": "90_days",
            "metrics": "1_year",
            "logs": "7_years"
        }
        settings.set_data_retention(retention_policy)

        client.alerts.configure_production_alerts()
        client.backup.configure_production_backup()

        print("✅ Production environment configured successfully")

    def configure_production_alerts(self):
        """Create the error-rate, latency and cost alerts, in that order."""
        alert_rules = (
            # High error rate
            {
                "name": "High Error Rate",
                "condition": "error_rate > 5%",
                "window": "5m",
                "channels": ["email", "slack", "pagerduty"],
                "severity": "critical"
            },
            # High latency
            {
                "name": "High Latency",
                "condition": "p95_latency > 5000ms",
                "window": "10m",
                "channels": ["email", "slack"],
                "severity": "warning"
            },
            # Cost spike
            {
                "name": "Cost Spike",
                "condition": "daily_cost > 1000",
                "window": "1d",
                "channels": ["email", "slack"],
                "severity": "warning"
            },
        )
        for rule in alert_rules:
            self.ants.alerts.create(rule)

        print("✅ Production alerts configured successfully")

# 2. Load Balancing and Scaling
/**
 * Configures load balancing, auto-scaling and circuit breakers.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised.
 */
class ProductionScaling {
  private ants: AgenticAnts

  /** Sends the load-balancing settings for multiple instances. */
  async configureLoadBalancing() {
    const { config } = this.ants
    await config.setLoadBalancing({
      strategy: 'round_robin',
      healthCheckInterval: 30,
      maxRetries: 3,
      timeout: 5000
    })
    console.log('✅ Load balancing configured')
  }

  /** Sends the metric-driven auto-scaling limits and cooldowns. */
  async configureAutoScaling() {
    const autoScaling = {
      minInstances: 2,
      maxInstances: 10,
      scaleUpThreshold: 80, // CPU usage
      scaleDownThreshold: 30,
      scaleUpCooldown: 300, // 5 minutes
      scaleDownCooldown: 600 // 10 minutes
    }
    await this.ants.scaling.configureAutoScaling(autoScaling)
    console.log('✅ Auto-scaling configured')
  }

  /** Sends the circuit-breaker thresholds used for resilience. */
  async configureCircuitBreakers() {
    const breakers = {
      failureThreshold: 5,
      recoveryTimeout: 30000,
      halfOpenMaxCalls: 3
    }
    await this.ants.resilience.configureCircuitBreakers(breakers)
    console.log('✅ Circuit breakers configured')
  }
}

// 3. Data Management
class ProductionDataManagement:
    """Configures encryption, retention, backup and export for production data."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def configure_data_management(self):
        """Apply the four data-management settings, then print a confirmation.

        Calls are made in this order: encryption, retention, backup, export.
        """
        encryption = {
            "at_rest": "AES-256",
            "in_transit": "TLS-1.3",
            "key_rotation": "90_days"
        }
        retention = {
            "traces": "90_days",
            "metrics": "1_year",
            "logs": "7_years",
            "backups": "1_year"
        }
        backup = {
            "frequency": "daily",
            "retention": "30_days",
            "encryption": True
        }
        export = {
            "format": "json",
            "compression": True,
            "encryption": True
        }

        self.ants.security.configure_encryption(encryption)
        self.ants.data.configure_retention(retention)
        self.ants.backup.configure_backup(backup)
        self.ants.data.configure_export(export)

        print("✅ Data management configured successfully")

# Monitoring in Production
1. Real-Time Monitoring
class ProductionMonitoring:
    """Sets up dashboards, health checks, metrics and cost monitoring."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_production_monitoring(self):
        """Invoke the four production setup calls on the client, then confirm.

        Order: dashboards, health checks, performance metrics, cost monitoring.
        """
        client = self.ants
        client.dashboards.create_production_dashboard()
        client.health.setup_production_health_checks()
        client.metrics.setup_production_metrics()
        client.finops.setup_production_cost_monitoring()

        print("✅ Production monitoring setup complete")

    def create_production_dashboard(self):
        """Create the "Production Overview" dashboard and return the client's result.

        The dashboard has four widgets: request rate, p95 response time,
        an error-rate gauge, and daily cost.
        """

        def timeseries(title, metric, time_range):
            # Key order matches the widget layout used throughout this guide.
            return {
                "type": "timeseries",
                "title": title,
                "metric": metric,
                "time_range": time_range,
            }

        error_gauge = {
            "type": "gauge",
            "title": "Error Rate",
            "metric": "error_rate",
            "thresholds": [5, 10]
        }
        widgets = [
            timeseries("Request Rate", "requests_per_second", "1h"),
            timeseries("Response Time", "p95_latency", "1h"),
            error_gauge,
            timeseries("Cost", "daily_cost", "7d"),
        ]
        return self.ants.dashboards.create({
            "name": "Production Overview",
            "widgets": widgets
        })

# 2. Health Checks
/**
 * Registers health-check endpoints and reports their current status.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised.
 */
class ProductionHealthChecks {
  private ants: AgenticAnts

  /** Registers the API, database and vector-database health endpoints. */
  async setupHealthChecks() {
    // Every endpoint expects HTTP 200; only name, URL and timings vary.
    const endpoint = (name: string, url: string, interval: number, timeout: number) => ({
      name,
      url,
      interval,
      timeout,
      expectedStatus: 200
    })

    await this.ants.health.configureHealthChecks({
      endpoints: [
        endpoint('api-health', '/api/health', 30, 5),
        endpoint('database-health', '/api/db/health', 60, 10),
        endpoint('vector-db-health', '/api/vector/health', 60, 10)
      ]
    })

    console.log('✅ Health checks configured')
  }

  /**
   * Fetches the health status, prints one line per component (plus the
   * error for unhealthy ones) and returns the status object unchanged.
   */
  async getHealthStatus() {
    const healthStatus = await this.ants.health.getStatus()

    console.log('🏥 Production Health Status')
    console.log('='.repeat(50))

    for (const [component, status] of Object.entries(healthStatus)) {
      const icon = status.healthy ? '✅' : '❌'
      console.log(`${component}: ${icon} ${status.status}`)
      if (!status.healthy) {
        console.log(` Error: ${status.error}`)
      }
    }

    return healthStatus
  }
}

// 3. Performance Monitoring
class ProductionPerformanceMonitoring:
    """Prints current production metrics and 24-hour performance trends."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def monitor_production_performance(self):
        """Fetch the production metrics, print a report, and return them.

        After the report, a warning line is printed for each metric above its
        limit: error rate > 5, p95 latency > 5000, active connections > 1000.
        """
        metrics = self.ants.metrics.get_production_metrics()

        print("📊 Production Performance Metrics")
        print("=" * 50)

        # (line template, metric key) pairs, printed one at a time.
        report_lines = (
            ("Request Rate: {} req/s", "request_rate"),
            ("Response Time (P95): {}ms", "p95_latency"),
            ("Error Rate: {}%", "error_rate"),
            ("Success Rate: {}%", "success_rate"),
            ("Active Connections: {}", "active_connections"),
        )
        for template, key in report_lines:
            print(template.format(metrics[key]))

        # (metric key, limit, warning) triples, checked in this order.
        limits = (
            ("error_rate", 5, "⚠️ High error rate detected!"),
            ("p95_latency", 5000, "⚠️ High latency detected!"),
            ("active_connections", 1000, "⚠️ High connection count detected!"),
        )
        for key, limit, warning in limits:
            if metrics[key] > limit:
                print(warning)

        return metrics

    def get_performance_trends(self):
        """Fetch hourly trends for the last 24 hours, print them, and return them."""
        trends = self.ants.metrics.get_performance_trends(
            period="last_24h",
            granularity="1h",
        )

        print("📈 Performance Trends (Last 24h)")
        print("=" * 50)

        for point in trends:
            print(f"{point['timestamp']}: {point['request_rate']} req/s, {point['p95_latency']}ms")

        return trends

# Incident Response
1. Incident Detection
class ProductionIncidentResponse:
    """Registers incident detection rules and runs response workflows."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_incident_detection(self):
        """Register the detection rules, then the response workflows."""
        incidents = self.ants.incidents

        detection_rules = {
            "high_error_rate": {
                "condition": "error_rate > 10%",
                "window": "5m",
                "severity": "critical"
            },
            "high_latency": {
                "condition": "p95_latency > 10000ms",
                "window": "10m",
                "severity": "high"
            },
            "cost_spike": {
                "condition": "hourly_cost > 100",
                "window": "1h",
                "severity": "medium"
            }
        }
        incidents.configure_detection_rules(detection_rules)
        incidents.configure_response_workflows()

        print("✅ Incident detection and response configured")

    def handle_incident(self, incident_id: str):
        """Print a summary of the incident, run its workflow, return the response.

        Args:
            incident_id: Identifier passed to the client's incident lookup.

        Returns:
            Whatever the client's ``execute_response_workflow`` call returns.
        """
        details = self.ants.incidents.get(incident_id)

        print(f"🚨 Incident {incident_id} detected")
        print(f"Severity: {details['severity']}")
        print(f"Description: {details['description']}")
        print(f"Affected systems: {details['affected_systems']}")

        outcome = self.ants.incidents.execute_response_workflow(incident_id)
        print(f"Response actions: {outcome['actions']}")

        return outcome

# 2. Incident Response Workflows
/**
 * Registers per-severity response workflows and executes them for incidents.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised.
 */
class IncidentResponseWorkflows {
  private ants: AgenticAnts

  /** Registers the automated workflows for critical, high and medium severity. */
  async configureResponseWorkflows() {
    const { incidents } = this.ants

    await incidents.configureWorkflows({
      // Critical: page, add capacity, trip the breaker, then confirm.
      critical: {
        steps: [
          { action: 'notify', channels: ['pagerduty', 'slack'] },
          { action: 'scale', instances: 2 },
          { action: 'enable', feature: 'circuit_breaker' },
          { action: 'notify', channels: ['slack'], message: 'Mitigation applied' }
        ]
      },
      // High: notify, start an automated investigation, then confirm.
      high: {
        steps: [
          { action: 'notify', channels: ['slack'] },
          { action: 'investigate', automated: true },
          { action: 'notify', channels: ['slack'], message: 'Investigation started' }
        ]
      },
      // Medium: email and log only.
      medium: {
        steps: [
          { action: 'notify', channels: ['email'] },
          { action: 'log', level: 'warning' }
        ]
      }
    })

    console.log('✅ Incident response workflows configured')
  }

  /**
   * Looks up the incident, fetches the workflow for its severity and runs
   * the workflow's steps one after another.
   *
   * NOTE(review): only critical/high/medium workflows are registered above;
   * confirm what `getWorkflow` returns for any other severity.
   */
  async executeIncidentResponse(incidentId: string) {
    const { severity } = await this.ants.incidents.get(incidentId)

    console.log(`🚨 Executing incident response for ${incidentId}`)
    console.log(`Severity: ${severity}`)

    const { steps } = await this.ants.incidents.getWorkflow(severity)

    // Steps are awaited sequentially so each finishes before the next starts.
    for (const step of steps) {
      console.log(`Executing: ${step.action}`)
      await this.ants.incidents.executeStep(step)
    }

    console.log('✅ Incident response workflow completed')
  }
}

// 3. Post-Incident Analysis
class PostIncidentAnalysis:
    """Prints a post-incident review and produces the final report."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def conduct_post_incident_analysis(self, incident_id: str):
        """Print timeline, root cause and recommendations; return the report.

        Args:
            incident_id: Identifier passed to each incident call on the client.

        Returns:
            Whatever the client's ``generate_post_incident_report`` call returns.
        """
        incidents = self.ants.incidents

        # The lookup result is not used below; the call is kept as in the
        # original flow.
        incidents.get(incident_id)

        print(f"📋 Post-Incident Analysis for {incident_id}")
        print("=" * 50)

        events = incidents.get_timeline(incident_id)
        print("Timeline:")
        for entry in events:
            print(f" {entry['timestamp']}: {entry['description']}")

        cause = incidents.analyze_root_cause(incident_id)
        print(f"\nRoot Cause: {cause['description']}")

        suggestions = incidents.generate_recommendations(incident_id)
        print("\nRecommendations:")
        for suggestion in suggestions:
            print(f" - {suggestion['description']}")

        return incidents.generate_post_incident_report(incident_id)

# Scaling Strategies
1. Horizontal Scaling
class ProductionScaling:
    """Configures horizontal scaling and reports the current scaling metrics."""

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def setup_horizontal_scaling(self):
        """Send the auto-scaling settings, then the load-balancing settings."""
        scaling = self.ants.scaling

        auto_scaling = {
            "min_instances": 2,
            "max_instances": 10,
            "scale_up_threshold": 80,  # CPU usage
            "scale_down_threshold": 30,
            "scale_up_cooldown": 300,  # 5 minutes
            "scale_down_cooldown": 600  # 10 minutes
        }
        scaling.configure_auto_scaling(auto_scaling)

        load_balancing = {
            "strategy": "round_robin",
            "health_check_interval": 30,
            "max_retries": 3,
            "timeout": 5000
        }
        scaling.configure_load_balancing(load_balancing)

        print("✅ Horizontal scaling configured")

    def monitor_scaling_metrics(self):
        """Fetch the scaling metrics, print them, and return them.

        Also prints a notice when CPU usage is above 80 or below 30, the same
        values sent as scale-up/scale-down thresholds in
        ``setup_horizontal_scaling``.
        """
        metrics = self.ants.scaling.get_scaling_metrics()

        print("📊 Scaling Metrics")
        print("=" * 50)

        # (line template, metric key) pairs, printed one at a time.
        report_lines = (
            ("Current Instances: {}", "current_instances"),
            ("CPU Usage: {}%", "cpu_usage"),
            ("Memory Usage: {}%", "memory_usage"),
            ("Request Rate: {} req/s", "request_rate"),
        )
        for template, key in report_lines:
            print(template.format(metrics[key]))

        if metrics['cpu_usage'] > 80:
            print("⚠️ High CPU usage - scaling up may be triggered")
        if metrics['cpu_usage'] < 30:
            print("ℹ️ Low CPU usage - scaling down may be triggered")

        return metrics

# 2. Vertical Scaling
/**
 * Configures vertical scaling and reports resource usage.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised.
 */
class VerticalScaling {
  private ants: AgenticAnts

  /** Sends the resource-usage thresholds and scale factors. */
  async setupVerticalScaling() {
    const verticalScaling = {
      cpuThreshold: 80,
      memoryThreshold: 85,
      scaleUpFactor: 1.5,
      scaleDownFactor: 0.8,
      cooldownPeriod: 300
    }
    await this.ants.scaling.configureVerticalScaling(verticalScaling)

    console.log('✅ Vertical scaling configured')
  }

  /**
   * Fetches the resource metrics, prints them, warns when CPU or memory
   * usage is above 90, and returns the metrics object unchanged.
   */
  async monitorResourceUsage() {
    const resourceMetrics = await this.ants.scaling.getResourceMetrics()
    const { cpuUsage, memoryUsage, diskUsage, networkUsage } = resourceMetrics

    console.log('📊 Resource Usage Metrics')
    console.log('='.repeat(50))
    console.log(`CPU Usage: ${cpuUsage}%`)
    console.log(`Memory Usage: ${memoryUsage}%`)
    console.log(`Disk Usage: ${diskUsage}%`)
    console.log(`Network Usage: ${networkUsage}%`)

    if (cpuUsage > 90) {
      console.log('⚠️ High CPU usage detected')
    }
    if (memoryUsage > 90) {
      console.log('⚠️ High memory usage detected')
    }

    return resourceMetrics
  }
}

// Best Practices
1. Production-Ready Code
class ProductionReadyAgent:
    """Request handler that traces every call and maps failures to HTTP errors."""

    # Longest accepted ``request["input"]``, measured with ``len()``.
    # Kept as a class attribute so deployments can tune it in one place.
    MAX_INPUT_LENGTH = 10000

    def __init__(self):
        self.ants = AgenticAnts(api_key=os.getenv('AGENTICANTS_API_KEY'))

    def process_request(self, request: dict):
        """Validate and process a request inside a monitoring trace.

        Args:
            request: Incoming payload; ``validate_request`` requires a
                non-empty ``"input"`` entry.

        Returns:
            The value returned by ``process_core_logic``.

        Raises:
            HTTPException: 400 with the validation message when validation
                fails; 500 with a generic message for processing or
                unexpected errors, so internal details are not exposed.
        """
        # Create trace for monitoring
        trace = self.ants.trace.create(
            name="production-agent",
            input=request,
            metadata={
                "environment": "production",
                "version": "1.0.0"
            }
        )
        try:
            self.validate_request(request)
            result = self.process_core_logic(request)
            trace.complete(
                output=result,
                metadata={
                    "success": True,
                    "processing_time": trace.duration
                }
            )
            return result
        # Each handler records the failure on the trace, then re-raises as an
        # HTTP error. ``from e`` keeps the original exception as the explicit
        # cause, so server-side tracebacks show what actually failed.
        except ValidationError as e:
            trace.error(error=str(e), metadata={"error_type": "validation"})
            raise HTTPException(status_code=400, detail=str(e)) from e
        except ProcessingError as e:
            trace.error(error=str(e), metadata={"error_type": "processing"})
            raise HTTPException(status_code=500, detail="Internal processing error") from e
        except Exception as e:
            trace.error(error=str(e), metadata={"error_type": "unexpected"})
            raise HTTPException(status_code=500, detail="Internal server error") from e

    def validate_request(self, request: dict):
        """Reject requests with a missing, empty or over-long ``"input"``.

        Raises:
            ValidationError: If ``"input"`` is absent or falsy, or if its
                length exceeds ``MAX_INPUT_LENGTH``.
        """
        if not request.get("input"):
            raise ValidationError("Input is required")
        if len(request["input"]) > self.MAX_INPUT_LENGTH:
            raise ValidationError("Input too long")

    def process_core_logic(self, request: dict):
        """Core business logic (placeholder that returns a fixed result)."""
        # Implement your agent logic here
        return {"result": "processed"}

# 2. Monitoring and Alerting
/**
 * Configures monitoring and creates the production dashboards.
 *
 * NOTE(review): no constructor in this snippet assigns `ants`; confirm where
 * the client is initialised.
 */
class ProductionMonitoring {
  private ants: AgenticAnts

  /** Enables all metric groups, alerting on three channels, and both dashboard modes. */
  async setupProductionMonitoring() {
    const { monitoring } = this.ants

    await monitoring.configure({
      metrics: {
        performance: true,
        errors: true,
        costs: true,
        security: true
      },
      alerting: {
        enabled: true,
        channels: ['email', 'slack', 'pagerduty']
      },
      dashboards: {
        realTime: true,
        historical: true
      }
    })

    console.log('✅ Production monitoring configured')
  }

  /** Creates the "Production Overview" and "Cost Monitoring" dashboards, in that order. */
  async createProductionDashboards() {
    const specs = [
      {
        name: 'Production Overview',
        widgets: [
          { type: 'timeseries', metric: 'request_rate' },
          { type: 'gauge', metric: 'error_rate' },
          { type: 'timeseries', metric: 'response_time' }
        ]
      },
      {
        name: 'Cost Monitoring',
        widgets: [
          { type: 'timeseries', metric: 'daily_cost' },
          { type: 'counter', metric: 'total_cost' }
        ]
      }
    ]

    // Awaited one at a time so the dashboards are created in list order.
    for (const spec of specs) {
      await this.ants.dashboards.create(spec)
    }

    console.log('✅ Production dashboards created')
  }
}

// Troubleshooting
Common Production Issues
Issue: High error rates in production
def debug_production_errors():
    """Print every production trace from the last 24 hours whose status is error.

    Uses the module-level ``ants`` client, which is not defined in this snippet.
    """
    filters = {
        "status": "error",
        "environment": "production",
        "period": "last_24h"
    }
    failed_traces = ants.traces.query(filters)

    print("Production Error Analysis:")
    for trace in failed_traces:
        print(f"Error: {trace.error}")
        print(f"Time: {trace.timestamp}")
        print(f"Agent: {trace.metadata.get('agent')}")
        print(f"Request: {trace.input}")
        print("---")

# Issue: Performance degradation
/**
 * Prints agent, latency and timestamp for each performance issue reported
 * over the last 24 hours with a 5000 ms threshold.
 *
 * Uses the module-level `ants` client, which is not defined in this snippet.
 */
async function debugPerformanceDegradation() {
  const query = {
    period: 'last_24h',
    threshold: 5000 // 5 seconds
  }
  const performanceIssues = await ants.metrics.getPerformanceIssues(query)

  console.log('Performance Issues:')
  for (const { agent, latency, timestamp } of performanceIssues) {
    console.log(`Agent: ${agent}`)
    console.log(`Latency: ${latency}ms`)
    console.log(`Time: ${timestamp}`)
  }
}

// Next Steps
- Cost Optimization - Reduce costs with our optimization guide
- Debugging - Troubleshoot issues with our debugging guide
- Security - Learn about security best practices
Example Projects
- Production Customer Support Bot - GitHub Repository
- Scalable Content Generation - GitHub Repository
- Enterprise AI Platform - GitHub Repository
Congratulations! 🎉 You now have a production-ready AI system with comprehensive monitoring, alerting, and incident response capabilities.