Large Language Models (LLMs) have revolutionized AI development, enabling applications from chatbots to code generation, content analysis, and intelligent search. Python is the de facto language for LLM integration, with a rich ecosystem of frameworks like LangChain, client libraries for API access, and tooling for production deployment.
This guide covers the complete journey: making simple calls to the OpenAI and Anthropic APIs, building sophisticated applications with LangChain, implementing Retrieval Augmented Generation (RAG) over custom data, streaming responses, running local models, and deploying production-grade LLM systems.
What You’ll Learn
- Making API calls to OpenAI GPT, Anthropic Claude, and other providers
- Building chains and agents with LangChain
- Effective prompt engineering techniques
- Implementing RAG pipelines for custom knowledge
- Streaming responses for better UX
- Running local LLMs with Ollama
- Production deployment patterns and best practices
Making LLM API Calls
OpenAI API Integration
pip install openai python-dotenv
import os
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Simple text completion
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum computing in simple terms."}
],
max_tokens=1000,
temperature=0.7
)
print(response.choices[0].message.content)
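The response object also reports token usage, which feeds directly into the cost-control practices covered later; for example, you can log the counts after each call:
# Token usage is attached to every chat completion
print(response.usage.prompt_tokens, response.usage.completion_tokens, response.usage.total_tokens)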
Anthropic Claude API
pip install anthropic
import os
from anthropic import Anthropic
from dotenv import load_dotenv
load_dotenv()
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# Claude API call
response = client.messages.create(
model="claude-3-5-sonnet-20241022",
max_tokens=1024,
messages=[
{"role": "user", "content": "Write a Python function to calculate factorial."}
]
)
print(response.content[0].text)
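One difference from the OpenAI API worth knowing: Claude takes the system prompt as a top-level system parameter rather than as a message. A minimal variant of the call above:
# System prompts go in the top-level `system` parameter, not the messages list
response = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    system="You are a concise Python tutor.",
    messages=[
        {"role": "user", "content": "Write a Python function to calculate factorial."}
    ]
)
print(response.content[0].text)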
Error Handling and Retries
import time
from openai import OpenAI, RateLimitError, APIError
client = OpenAI()
def call_llm_with_retry(prompt, max_retries=3, backoff_factor=2):
"""Call LLM with exponential backoff retry logic"""
for attempt in range(max_retries):
try:
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
max_tokens=500
)
return response.choices[0].message.content
except RateLimitError as e:
if attempt == max_retries - 1:
raise
wait_time = backoff_factor ** attempt
print(f"Rate limited. Waiting {wait_time}s before retry...")
time.sleep(wait_time)
except APIError as e:
if attempt == max_retries - 1:
raise
print(f"API error: {e}. Retrying...")
time.sleep(2 ** attempt)
raise Exception("Max retries exceeded")
Building with LangChain
LangChain Setup
pip install langchain langchain-openai langchain-anthropic
Simple LLM Chain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
# Initialize LLM
llm = ChatOpenAI(model="gpt-4o", temperature=0.7)
# Create prompt template
prompt = ChatPromptTemplate.from_template(
"Explain {topic} in {language} in simple terms."
)
# Create chain
chain = prompt | llm | StrOutputParser()
# Run chain
result = chain.invoke({
"topic": "machine learning",
"language": "Spanish"
})
print(result)
Memory and Conversation
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0.7)
# Create memory
memory = ConversationBufferMemory()  # the default string buffer matches ConversationChain's built-in prompt
# Create conversation chain
conversation = ConversationChain(
llm=llm,
memory=memory,
verbose=True
)
# Multi-turn conversation
response1 = conversation.run(input="My name is Alice")
response2 = conversation.run(input="What's my name?")
response3 = conversation.run(input="Tell me a joke")
Agents with Tools
from langchain_openai import ChatOpenAI
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain.tools import tool
from langchain_core.prompts import ChatPromptTemplate
@tool
def calculate(expression: str) -> str:
"""Calculate mathematical expressions"""
try:
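        # NOTE: eval() executes arbitrary Python; restrict or replace it with a safe expression parser before accepting untrusted input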
result = eval(expression)
return str(result)
except Exception as e:
return f"Error: {e}"
@tool
def get_weather(city: str) -> str:
"""Get current weather for a city"""
# Simulated weather data
weather_data = {
"New York": "22°C, Sunny",
"London": "15°C, Rainy",
"Tokyo": "18°C, Cloudy"
}
return weather_data.get(city, "City not found")
# Create agent
llm = ChatOpenAI(model="gpt-4o", temperature=0)
tools = [calculate, get_weather]
prompt = ChatPromptTemplate.from_messages([
("system", "You are a helpful assistant with access to tools."),
("user", "{input}"),
("placeholder", "{agent_scratchpad}")
])
agent = create_tool_calling_agent(llm, tools, prompt)
executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
# Use agent
result = executor.invoke({
"input": "What's 42 * 7 and the weather in Tokyo?"
})
Prompt Engineering Techniques
Few-Shot Prompting
from langchain.prompts import FewShotChatMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o")
# Define examples
examples = [
{
"input": "Happy",
"output": "Positive"
},
{
"input": "Sad",
"output": "Negative"
},
{
"input": "Excited",
"output": "Positive"
}
]
# Create few-shot prompt
prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=ChatPromptTemplate.from_messages([
        ("human", "{input}"),
        ("ai", "{output}")
    ])
)
# Use in chain
full_prompt = ChatPromptTemplate.from_messages([
("system", "Classify the sentiment of the following word."),
prompt,
("user", "Classify: {word}")
])
chain = full_prompt | llm
result = chain.invoke({"word": "Confused"})
Chain-of-Thought Prompting
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
llm = ChatOpenAI(model="gpt-4o")
prompt = ChatPromptTemplate.from_template("""
Solve the following problem step by step:
Problem: {problem}
Think through it carefully, showing your work:
1. First, identify what we know
2. Then determine what we need to find
3. Finally, solve step by step
Solution:
""")
chain = prompt | llm
result = chain.invoke({
"problem": "If a train travels 200km in 4 hours, what's its average speed?"
})
print(result.content)
- Be specific and clear about what you want
- Provide context and examples (few-shot prompting)
- Ask for step-by-step reasoning (chain-of-thought)
- Use role-playing (“You are a…”)
- Specify output format clearly (see the sketch after this list)
- Test and iterate on prompts
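To illustrate role-playing, clear instructions, and an explicit output format together, here is a minimal sketch; the JSON keys are arbitrary choices, not a required schema:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

llm = ChatOpenAI(model="gpt-4o", temperature=0)
# Role-playing in the system message, explicit output format in the user message
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a senior product analyst."),
    ("user",
     "Summarize the following review and respond ONLY with JSON "
     'using the keys "sentiment", "topics", and "summary".\n\n'
     "Review: {review}")
])
chain = prompt | llm | StrOutputParser()
print(chain.invoke({"review": "The battery life is great, but the screen scratches easily."}))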
Retrieval Augmented Generation (RAG)
RAG Pipeline Setup
pip install langchain-community langchain-openai chromadb pypdf
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
# Load documents
loader = PyPDFLoader("document.pdf")
documents = loader.load()
# Split into chunks
splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=100
)
chunks = splitter.split_documents(documents)
# Create embeddings
embeddings = OpenAIEmbeddings()
# Create vector store
vector_store = Chroma.from_documents(chunks, embeddings)
# Create RAG chain
llm = ChatOpenAI(model="gpt-4o")
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=vector_store.as_retriever(),
return_source_documents=True
)
# Query
result = qa_chain({"query": "What is the main topic?"})
print(result["result"])
print("Sources:", result["source_documents"])
Advanced RAG with Conversational Context
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
# Create conversational RAG
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")  # output_key is required because the chain also returns source documents
qa_chain = ConversationalRetrievalChain.from_llm(
llm=ChatOpenAI(model="gpt-4o"),
retriever=vector_store.as_retriever(),
memory=memory,
return_source_documents=True
)
# Multi-turn conversation with retrieval
response1 = qa_chain({"question": "What's the main topic?"})
response2 = qa_chain({"question": "Tell me more about that"})
response3 = qa_chain({"question": "How does it relate to X?"})
# LLM will maintain context across questions
Streaming Responses
Streaming with OpenAI
from openai import OpenAI
client = OpenAI()
# Stream response
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a poem about Python"}],
    stream=True
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta is not None:
        print(delta, end="", flush=True)
Streaming with LangChain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
llm = ChatOpenAI(model="gpt-4o")
prompt = ChatPromptTemplate.from_template("Tell me about {topic}")
chain = prompt | llm
# Stream output
for chunk in chain.stream({"topic": "quantum computing"}):
print(chunk.content, end="", flush=True)
Streaming in FastAPI
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import json

app = FastAPI()
llm = ChatOpenAI(model="gpt-4o")

@app.post("/stream")
async def stream_response(prompt: str):
    """Stream LLM response"""
    prompt_template = ChatPromptTemplate.from_template(prompt)
    chain = prompt_template | llm

    def generate():
        for chunk in chain.stream({}):
            yield json.dumps({"text": chunk.content}) + "\n"

    return StreamingResponse(generate(), media_type="application/x-ndjson")
Running Local LLMs
Ollama Setup for Local Models
# Install Ollama from https://ollama.com
ollama pull llama2
ollama pull mistral
ollama pull neural-chat

# Run Ollama (default: localhost:11434)
ollama serve
Ollama with LangChain
pip install langchain-community ollama
from langchain_community.llms import Ollama
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Initialize local Ollama model
llm = Ollama(model="mistral", base_url="http://localhost:11434")
# Create chain
prompt = ChatPromptTemplate.from_template(
"Explain {topic} in simple terms"
)
chain = prompt | llm | StrOutputParser()
# Run
result = chain.invoke({"topic": "machine learning"})
print(result)
Ollama with FastAPI
from fastapi import FastAPI
from pydantic import BaseModel
import requests
app = FastAPI()
class GenerateRequest(BaseModel):
prompt: str
model: str = "mistral"
@app.post("/generate")
async def generate_text(request: GenerateRequest):
"""Generate text using local Ollama LLM"""
response = requests.post(
"http://localhost:11434/api/generate",
json={
"model": request.model,
"prompt": request.prompt,
"stream": False
}
)
return {"result": response.json()["response"]}
Local LLM Advantages
- No API costs or rate limits
- Privacy: data stays local
- No network latency: inference runs on your own hardware
- Full control over model and parameters
- No internet dependency
Production Deployment Patterns
Docker Deployment
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY app/ .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Production Best Practices
Production LLM Deployment
- Rate Limiting: Prevent abuse and control costs
- Caching: Cache identical requests to reduce API calls and latency
- Monitoring: Track TTFT (time to first token), latency, token costs
- Error Handling: Graceful degradation and fallback models
- Concurrency: Use async/await for multiple concurrent requests (combined with fallback models in the sketch after this list)
- Cost Control: Monitor token usage, implement quotas
- Security: Validate inputs, sanitize outputs, protect API keys
- Scaling: Use load balancers, auto-scaling based on traffic
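The error-handling and concurrency points above can be combined in a few lines. The sketch below assumes the OpenAI SDK's AsyncOpenAI client and an illustrative gpt-4o → gpt-4o-mini fallback order; it fans requests out with asyncio.gather and degrades gracefully when both models fail:
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI()

async def complete_with_fallback(prompt: str) -> str:
    """Try the primary model first, then degrade to a cheaper fallback."""
    for model in ("gpt-4o", "gpt-4o-mini"):  # example fallback order
        try:
            response = await async_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        except Exception:
            continue  # try the next model
    return "Service temporarily unavailable"  # graceful degradation

async def main():
    prompts = ["Summarize RAG in one sentence.", "Name three uses of embeddings."]
    # Fan the requests out concurrently instead of awaiting them one at a time
    results = await asyncio.gather(*(complete_with_fallback(p) for p in prompts))
    for result in results:
        print(result)

asyncio.run(main())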
Caching LLM Responses
import hashlib
from redis import Redis
redis_client = Redis(host='localhost', port=6379, db=0)
def cache_llm_response(prompt: str, model: str = "gpt-4o") -> str:
"""Cache LLM responses to reduce API calls"""
# Create cache key
cache_key = f"llm:{model}:{hashlib.md5(prompt.encode()).hexdigest()}"
# Check cache
cached = redis_client.get(cache_key)
if cached:
return cached.decode()
# Call LLM (call_llm is a placeholder for your LLM wrapper, e.g., call_llm_with_retry defined earlier)
response = call_llm(prompt, model)
# Cache for 24 hours
redis_client.setex(cache_key, 86400, response)
return response
Monitoring LLM Performance
import time
from prometheus_client import Counter, Histogram
# Define metrics
llm_requests = Counter('llm_requests_total', 'Total LLM requests', ['model', 'status'])
llm_latency = Histogram('llm_latency_seconds', 'LLM request latency', ['model'])
llm_tokens = Counter('llm_tokens_total', 'Total tokens used', ['model', 'type'])
def call_llm_with_monitoring(prompt: str, model: str):
"""Call LLM with performance monitoring"""
start_time = time.time()
try:
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}]
)
# Record metrics
llm_requests.labels(model=model, status='success').inc()
llm_latency.labels(model=model).observe(time.time() - start_time)
llm_tokens.labels(model=model, type='prompt').inc(response.usage.prompt_tokens)
llm_tokens.labels(model=model, type='completion').inc(response.usage.completion_tokens)
return response.choices[0].message.content
except Exception as e:
llm_requests.labels(model=model, status='error').inc()
raise
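The best-practices list also mentions TTFT (time to first token), which the latency histogram above does not capture. One way to measure it, assuming the same OpenAI client and a streamed call, is sketched here:
llm_ttft = Histogram('llm_ttft_seconds', 'Time to first token', ['model'])

def measure_ttft(prompt: str, model: str) -> str:
    """Stream a completion and record the time until the first content chunk arrives."""
    start = time.time()
    first_token_seen = False
    parts = []
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True
    )
    for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta:
            if not first_token_seen:
                first_token_seen = True
                llm_ttft.labels(model=model).observe(time.time() - start)
            parts.append(delta)
    return "".join(parts)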
FastAPI LLM Service with Rate Limiting
from fastapi import FastAPI, Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
app = FastAPI()
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
class CompletionRequest(BaseModel):
prompt: str
model: str = "gpt-4o"
llm = ChatOpenAI(model="gpt-4o")
@app.post("/completion")
@limiter.limit("5/minute") # 5 requests per minute per IP
async def get_completion(request: CompletionRequest):
"""Get LLM completion with rate limiting"""
try:
response = llm.invoke(request.prompt)
return {
"status": "success",
"result": response.content
}
except Exception as e:
return {
"status": "error",
"error": str(e)
}
Start simple with direct API calls, evolve to LangChain for complex applications, implement RAG when you need custom data, and ensure production-readiness with proper error handling, monitoring, and cost control.