API Design for AI-First Applications: Patterns That Scale
Your traditional REST API isn't going to cut it anymore.
AI-first applications—chatbots, code assistants, document analyzers—behave fundamentally differently than CRUD apps. They need streaming responses. They demand massive context payloads. They produce unpredictable load patterns. And they fail in ways your users have never seen before.
I've watched teams bolt AI features onto existing REST APIs and regret it within weeks. The latency is brutal. The error handling is a mess. And every new AI feature requires architectural gymnastics because the API wasn't designed for this.
Here's what actually works.
The Problem With Standard REST for AI
Traditional REST assumes request-response cycles measured in milliseconds. You POST data, you GET a response, done. Clean and predictable.
AI endpoints don't work like this. A single LLM call might take 5-30 seconds. Token generation happens progressively—waiting for the entire response before returning anything creates a terrible user experience. Your users sit staring at spinners wondering if anything is actually happening.
Worse, AI applications need context. Lots of it. A code assistant needs to see multiple files, their relationships, the project structure, recent changes. A document analyzer needs the full document plus metadata plus related documents. These context payloads are massive—often hundreds of KB to several MB.
Trying to shove this through traditional REST endpoints creates problems:
Timeouts everywhere. Your API gateway gives up after 30 seconds. Your load balancer kills connections. Your frontend assumes something broke.
Memory pressure. Buffering entire responses before sending them means every in-flight request holds massive amounts of memory. Scale this to 100 concurrent users and your servers fall over.
Poor observability. When an LLM call fails halfway through token generation, how do you even log that? Your existing error handling treats it as a generic 500, losing all context about what actually broke.
Stream Everything (Seriously)
Server-Sent Events (SSE) or WebSockets aren't optional for AI-first APIs—they're the baseline architecture.
Streaming solves the UX problem immediately. Users see tokens appearing in real-time. They know the system is working. They can even abort requests early if they realize the AI misunderstood.
But streaming introduces new failure modes. What happens if the connection drops mid-stream? With REST, you retry the request. With streaming, you've already sent partial data. The client needs to track how much it received and potentially request a continuation.
Pattern: include a sequence_id in each chunk. If the stream breaks, the client can resume with POST /api/ai/chat/stream/resume including the last sequence_id it received.
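Here's a minimal sketch of the streaming side with FastAPI (assuming that's your stack). generate_tokens is a hypothetical async wrapper around your LLM client, and the resume endpoint would replay buffered chunks keyed by the last sequence_id the client reports:

import json

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/api/ai/chat/stream")
async def chat_stream(request: dict):
    async def event_stream():
        sequence_id = 0
        # generate_tokens() is a hypothetical async wrapper around your LLM client
        async for token in generate_tokens(request["message"]):
            sequence_id += 1
            chunk = {"sequence_id": sequence_id, "token": token}
            # SSE frames are "data: <payload>" followed by a blank line
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")

To make resume work, buffer recent chunks server-side (keyed by conversation and sequence_id) so the resume endpoint can replay everything after the last chunk the client received.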
Context Management at Scale
AI applications are context-hungry. Every endpoint needs to handle large context payloads efficiently without grinding to a halt.
The naive approach: accept massive JSON bodies in POST requests. This works until you hit payload size limits (many API gateways default to 1-10MB), and it's inefficient—you're re-sending the same context repeatedly.
Better pattern: context references with server-side caching.
# First, upload context and get a reference
POST /api/ai/context
{
  "files": [...],
  "symbols": [...],
  "metadata": {...}
}

Response:
{
  "context_id": "ctx_abc123",
  "expires_at": "2024-01-20T10:30:00Z",
  "size_bytes": 524288
}

# Then reference it in subsequent requests
POST /api/ai/chat
{
  "context_id": "ctx_abc123",
  "message": "Explain this function"
}
The server maintains context in a fast cache (Redis, Memcached) with reasonable TTLs. Clients can reuse context across multiple queries without re-uploading. You can even implement smart diffing—clients send only what changed since last time.
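A sketch of the context endpoint, assuming FastAPI, pydantic v2, and redis-py's asyncio client; the ContextPayload model, the ctx_ id format, and the one-hour TTL are illustrative choices, not a fixed API:

import secrets
from datetime import datetime, timedelta, timezone

import redis.asyncio as redis_asyncio
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
cache = redis_asyncio.Redis()
CONTEXT_TTL_SECONDS = 3600  # assumed one-hour cache lifetime

class ContextPayload(BaseModel):
    files: list = []
    symbols: list = []
    metadata: dict = {}

@app.post("/api/ai/context")
async def create_context(payload: ContextPayload):
    body = payload.model_dump_json()
    context_id = f"ctx_{secrets.token_hex(6)}"
    # Store the serialized context under its reference with a TTL
    await cache.setex(f"context:{context_id}", CONTEXT_TTL_SECONDS, body)
    expires_at = datetime.now(timezone.utc) + timedelta(seconds=CONTEXT_TTL_SECONDS)
    return {
        "context_id": context_id,
        "expires_at": expires_at.isoformat(),
        "size_bytes": len(body),
    }

The chat endpoint then looks up context:{context_id}, returns a clear error if it has expired, and passes the cached context to the model.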
This is where something like Glue becomes valuable. When you're building AI features that need code context, Glue already indexes your entire codebase—files, symbols, API routes, relationships. Instead of implementing your own context extraction and caching layer, you query Glue's index. It knows which files are related, what changed recently, who owns what code. Your AI endpoints get better context without reinventing the wheel.
Rate Limiting That Accounts for Cost
Traditional rate limiting counts requests per minute. For AI APIs, this is meaningless.
One request might consume 100 tokens. Another might consume 100,000 tokens. The cost and resource consumption differ by 1000x, but your rate limiter treats them identically.
You need token-aware rate limiting:
import redis.asyncio as redis_asyncio
from fastapi import FastAPI, HTTPException

app = FastAPI()
redis = redis_asyncio.Redis()
LIMIT = 500_000  # hourly token budget per user; tune per pricing tier

class TokenBucketLimiter:
    async def check_limit(
        self,
        user_id: str,
        estimated_tokens: int
    ) -> bool:
        key = f"tokens:{user_id}"
        current = await redis.get(key)
        if int(current or 0) + estimated_tokens > LIMIT:
            return False
        new_total = await redis.incrby(key, estimated_tokens)
        if new_total == estimated_tokens:
            # First usage in this window: start the one-hour TTL
            await redis.expire(key, 3600)
        return True

limiter = TokenBucketLimiter()

# AnalyzeRequest and estimate_tokens() come from your application code
@app.post("/api/ai/analyze")
async def analyze(request: AnalyzeRequest):
    # Estimate tokens before making the LLM call
    estimated = estimate_tokens(
        request.context_size + request.query_length
    )
    if not await limiter.check_limit(
        request.user_id,
        estimated
    ):
        raise HTTPException(429, "Token limit exceeded")
    # Proceed with actual LLM call
Track actual token consumption and adjust limits dynamically. If a user consistently underestimates, tighten their estimates. If they're conservative, let them burst more.
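One way to do that reconciliation, reusing the Redis bucket above; provider usage field names vary, so treat this as a sketch:

async def record_actual_usage(
    user_id: str,
    estimated_tokens: int,
    actual_tokens: int
) -> None:
    # Correct the bucket by the difference between what we reserved
    # and what the provider actually billed for this request
    await redis.incrby(f"tokens:{user_id}", actual_tokens - estimated_tokens)

Call it after each completion with the usage numbers the provider returns, and log the estimation error so you can tune estimate_tokens over time.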
Idempotency Isn't Optional
AI calls are expensive and slow. Users will spam retry when they get impatient. Without idempotency, you'll waste money and resources re-running identical requests.
Standard idempotency keys work, but AI adds a twist—you need to handle partial completion.
@app.post("/api/ai/generate")
async def generate(
request: GenerateRequest,
idempotency_key: str = Header(...)
):
# Check if we've seen this request
cached = await redis.get(f"idem:{idempotency_key}")
if cached:
result = json.loads(cached)
if result['status'] == 'complete':
return result['data']
elif result['status'] == 'in_progress':
# Return 409 with progress indicator
raise HTTPException(
409,
detail={
'status': 'in_progress',
'progress': result['progress']
}
)
# Mark as in-progress
await redis.setex(
f"idem:{idempotency_key}",
3600,
json.dumps({
'status': 'in_progress',
'progress': 0
})
)
# Do the actual work...
When the client retries a request that's still processing, return 409 with progress info instead of starting over. Saves money, reduces server load, and gives users feedback.
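The missing piece in the snippet above is the completion write: once the LLM call finishes, store the result under the same key so retries get the cached answer instead of a 409. A sketch, reusing the redis client from above; the 24-hour retention is an arbitrary choice:

async def mark_complete(idempotency_key: str, result: dict) -> None:
    # Persist the finished result so retries with this key return it directly
    await redis.setex(
        f"idem:{idempotency_key}",
        86400,  # keep completed results longer than in-progress markers
        json.dumps({"status": "complete", "data": result}),
    )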
Error Messages That Don't Suck
LLM errors are often cryptic. "Context length exceeded." "Invalid request." "Rate limited." None of this helps developers fix the problem.
AI-first APIs need error responses that explain why something failed and how to fix it:
{
  "error": {
    "code": "context_too_large",
    "message": "Context size (156,000 tokens) exceeds model limit (128,000)",
    "details": {
      "tokens_requested": 156000,
      "tokens_limit": 128000,
      "tokens_over": 28000
    },
    "suggestions": [
      "Reduce context by removing older messages",
      "Use a model with larger context window",
      "Split request into multiple chunks"
    ]
  }
}
Include enough detail that developers can debug without digging through logs. Make errors actionable.
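In FastAPI terms, that can be as simple as a helper that builds the structured detail; the 413 status code and the suggestion text here are illustrative:

from fastapi import HTTPException

def context_too_large(tokens_requested: int, tokens_limit: int) -> HTTPException:
    return HTTPException(
        status_code=413,
        detail={
            "code": "context_too_large",
            "message": (
                f"Context size ({tokens_requested:,} tokens) "
                f"exceeds model limit ({tokens_limit:,})"
            ),
            "details": {
                "tokens_requested": tokens_requested,
                "tokens_limit": tokens_limit,
                "tokens_over": tokens_requested - tokens_limit,
            },
            "suggestions": [
                "Reduce context by removing older messages",
                "Use a model with a larger context window",
                "Split the request into multiple chunks",
            ],
        },
    )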
Versioning for Rapid Model Changes
AI models change frequently. GPT-4 gets updated. Claude releases new versions. Your fine-tuned models improve. Each change potentially breaks existing behavior.
Traditional API versioning (v1, v2) doesn't work well here because you need multiple dimensions:
API schema version (what fields exist)
Model version (which LLM)
Feature version (which capabilities are enabled)
Pattern: explicit model selection with version pinning:
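A request shape that supports this might look like the following; the field names and model identifiers are illustrative, not a spec:

POST /api/ai/chat
{
  "context_id": "ctx_abc123",
  "message": "Explain this function",
  "model": {
    "preferred": "claude-3-5-sonnet-20241022",
    "fallbacks": ["claude-3-5-sonnet-20240620"],
    "pin": true
  },
  "features": {
    "tool_use": true,
    "extended_context": false
  }
}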
Let clients specify exact model versions when they need stability. Provide fallback chains for when that version isn't available. Feature flags let you gradually roll out capabilities.
This is another area where Glue helps teams avoid chaos. When you're adding new AI endpoints across your codebase, Glue maps them to features automatically and shows you the full API surface. You can see which endpoints use which models, identify inconsistencies, and spot gaps in your versioning strategy before they become production fires.
Health Checks That Actually Check Health
Standard health checks ping your database and return 200. For AI APIs, this is useless.
Your database might be fine, but if your LLM provider is down or throttling you, the API is effectively dead. Check the things that matter:
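A deeper health check might look like this, assuming an OpenAI-compatible async client and the redis client from earlier; the 2-second latency threshold is an arbitrary example:

import time

from fastapi import FastAPI, Response
from openai import AsyncOpenAI

app = FastAPI()
llm_client = AsyncOpenAI()

@app.get("/healthz")
async def health(response: Response):
    checks = {}

    # The traditional part: can we reach our own cache?
    try:
        await redis.ping()
        checks["cache"] = "ok"
    except Exception:
        checks["cache"] = "down"

    # The part that matters: is the LLM provider reachable and responsive?
    start = time.monotonic()
    try:
        await llm_client.models.list()  # cheap metadata call, no tokens spent
        latency_ms = (time.monotonic() - start) * 1000
        checks["llm_provider"] = "ok" if latency_ms < 2000 else "degraded"
    except Exception:
        checks["llm_provider"] = "down"

    if "down" in checks.values():
        response.status_code = 503
    return checks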
While you're instrumenting, track cost per request too. You'll quickly see which features burn money and which endpoints need optimization.
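A lightweight version of that cost tracking; the per-token prices are placeholders that change frequently, so load them from config:

import logging

logger = logging.getLogger("ai_cost")

PRICE_PER_1K_INPUT = 0.0025   # placeholder USD price per 1K input tokens
PRICE_PER_1K_OUTPUT = 0.0100  # placeholder USD price per 1K output tokens

def log_request_cost(endpoint: str, input_tokens: int, output_tokens: int) -> None:
    cost = (
        (input_tokens / 1000) * PRICE_PER_1K_INPUT
        + (output_tokens / 1000) * PRICE_PER_1K_OUTPUT
    )
    # Emit one structured record per request; aggregate by endpoint in your metrics stack
    logger.info("endpoint=%s tokens_in=%d tokens_out=%d cost_usd=%.6f",
                endpoint, input_tokens, output_tokens, cost)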
The Architecture Matters
AI-first APIs aren't traditional REST with extra features bolted on. They're fundamentally different beasts that need streaming, context management, token-aware rate limiting, and proper error handling from the start.
Build these patterns in early. Retrofitting them after you've shipped is painful—I know because I've done it. Twice.
And when you're building AI features that interact with your codebase, don't reinvent context extraction. Use tools like Glue that already index and understand your code. Your AI endpoints will be smarter, your architecture will be cleaner, and you'll ship faster.