# Generative AI: Building with Large Language Models

## Introduction

Generative AI powered by large language models (LLMs) enables content creation, code generation, conversational agents, and intelligent automation. This guide covers the Azure OpenAI Service, common application patterns, and production concerns such as token management, cost, and safety.
## Azure OpenAI Service

### Setup and Configuration

```python
from openai import AzureOpenAI

# Authenticate against an Azure OpenAI resource
client = AzureOpenAI(
    api_key="<your-key>",
    api_version="2024-02-15-preview",
    azure_endpoint="https://<resource>.openai.azure.com/",
)

# For Azure OpenAI, "model" is the name of your deployment
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain quantum computing in simple terms."},
    ],
    temperature=0.7,
    max_tokens=500,
)
print(response.choices[0].message.content)
```
## Application Patterns

### Chat Completions

```python
conversation_history = [
    {"role": "system", "content": "You are a Python coding expert."}
]

def chat(user_message):
    conversation_history.append({"role": "user", "content": user_message})
    response = client.chat.completions.create(
        model="gpt-4",
        messages=conversation_history,
        temperature=0.3,
    )
    assistant_message = response.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": assistant_message})
    return assistant_message

# Multi-turn conversation
print(chat("Write a function to reverse a string"))
print(chat("Now add type hints and docstring"))
```
### Streaming Responses

```python
def stream_chat(user_message):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_message}],
        stream=True,
    )
    for chunk in response:
        # Some chunks (e.g., the first or final one) carry no content delta
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
```
## Retrieval-Augmented Generation (RAG)

### Vector Database Integration

```python
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

search_client = SearchClient(
    endpoint="https://<search-service>.search.windows.net",
    index_name="documents",
    credential=AzureKeyCredential("<key>"),
)

def rag_query(question):
    # Retrieve the most relevant documents from the index
    results = search_client.search(
        search_text=question,
        select="content",
        top=3,
    )
    context = "\n\n".join([doc["content"] for doc in results])
    # Generate an answer grounded in the retrieved context
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "Answer based on the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return response.choices[0].message.content
```
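A quick usage check; the question is a placeholder and assumes the `documents` index has a populated `content` field:

```python
print(rag_query("What are the key points in the onboarding guide?"))
```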
## Semantic Kernel Framework

### Basic Setup

```python
import asyncio

import semantic_kernel as sk
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion

# NOTE: this example targets the pre-1.0 semantic-kernel Python API;
# SK 1.x renames these calls (add_service, KernelArguments, and so on).
kernel = sk.Kernel()
kernel.add_chat_service(
    "chat",
    AzureChatCompletion(
        deployment_name="gpt-4",
        endpoint="<endpoint>",
        api_key="<key>",
    ),
)

# Define a semantic function from a prompt template
prompt = """
Summarize the following text in {{$maxWords}} words or less:
{{$input}}
"""
summarize = kernel.create_semantic_function(prompt, max_tokens=500)

async def main():
    # Template variables are passed via ContextVariables, not keyword arguments
    variables = sk.ContextVariables(content="Very long text here...")
    variables["maxWords"] = "50"
    result = await kernel.run_async(summarize, input_vars=variables)
    print(result)

asyncio.run(main())
```
### Plugin Architecture

```python
from semantic_kernel.skill_definition import sk_function, sk_function_context_parameter

class MathPlugin:
    @sk_function(
        description="Add two numbers",
        name="add",
    )
    @sk_function_context_parameter(name="num1", description="First number")
    @sk_function_context_parameter(name="num2", description="Second number")
    def add(self, context) -> str:
        num1 = float(context["num1"])
        num2 = float(context["num2"])
        return str(num1 + num2)

# Register the plugin; import_skill returns a dict of the plugin's functions
math_functions = kernel.import_skill(MathPlugin(), "Math")

# Invoke through the kernel (inside an async function, as in the previous example)
variables = sk.ContextVariables()
variables["num1"] = "15"
variables["num2"] = "27"
result = await kernel.run_async(math_functions["add"], input_vars=variables)
```
## Function Calling

```python
import json

# JSON schema describing a function the model may call; get_weather itself
# is your own implementation (not shown here)
functions = [
    {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City name"
                },
                "unit": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"]
                }
            },
            "required": ["location"]
        }
    }
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "What's the weather in Seattle?"}],
    functions=functions,
    function_call="auto",
)

if response.choices[0].message.function_call:
    function_name = response.choices[0].message.function_call.name
    arguments = json.loads(response.choices[0].message.function_call.arguments)
    # Execute the function locally
    weather_data = get_weather(**arguments)
    # Send the result back to the model for a natural-language answer
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": "What's the weather in Seattle?"},
            response.choices[0].message,
            {"role": "function", "name": function_name, "content": json.dumps(weather_data)},
        ],
    )
```

Note that `functions` and `function_call` are the legacy interface; newer API versions expose the same flow through `tools` and `tool_calls`.
## Content Generation

### Code Generation

```python
def generate_code(specification, language="python"):
    prompt = f"""
Generate {language} code for the following specification:
{specification}

Requirements:
- Include type hints
- Add comprehensive docstrings
- Follow PEP 8 style guide
- Include error handling
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    return response.choices[0].message.content
```
### Data Transformation

```python
def transform_data(data, target_format):
    prompt = f"""
Transform the following data to {target_format} format:
{data}

Ensure valid syntax and proper formatting.
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content
```
## Token Management

```python
import tiktoken

def count_tokens(text, model="gpt-4"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def truncate_to_token_limit(text, max_tokens=4000, model="gpt-4"):
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= max_tokens:
        return text
    truncated_tokens = tokens[:max_tokens]
    return encoding.decode(truncated_tokens)
```
## Cost Optimization

```python
class CostTracker:
    # Illustrative per-1K-token prices; check current Azure OpenAI pricing
    PRICING = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-35-turbo": {"input": 0.0015, "output": 0.002},
    }

    def __init__(self):
        self.total_cost = 0

    def calculate_cost(self, model, input_tokens, output_tokens):
        # Fall back to gpt-35-turbo pricing for unrecognized model names
        pricing = self.PRICING.get(model, self.PRICING["gpt-35-turbo"])
        cost = (input_tokens / 1000 * pricing["input"]) + (output_tokens / 1000 * pricing["output"])
        self.total_cost += cost
        return cost

    def log_request(self, response):
        usage = response.usage
        cost = self.calculate_cost(
            response.model,
            usage.prompt_tokens,
            usage.completion_tokens,
        )
        print(f"Request cost: ${cost:.4f} | Total: ${self.total_cost:.4f}")
```
## Safety and Content Filtering

```python
from azure.ai.contentsafety import ContentSafetyClient
from azure.ai.contentsafety.models import AnalyzeTextOptions, TextCategory
from azure.core.credentials import AzureKeyCredential

content_safety = ContentSafetyClient(
    endpoint="<endpoint>",
    credential=AzureKeyCredential("<key>"),
)

def safe_generation(prompt):
    # Generate content
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    generated_text = response.choices[0].message.content
    # Check the output for harmful content
    safety_result = content_safety.analyze_text(
        AnalyzeTextOptions(
            text=generated_text,
            categories=[
                TextCategory.HATE,
                TextCategory.SEXUAL,
                TextCategory.VIOLENCE,
                TextCategory.SELF_HARM,
            ],
        )
    )
    # Filter high-severity results (severity is reported on a 0-7 scale)
    for category in safety_result.categories_analysis:
        if category.severity >= 4:
            return "Content filtered due to safety policies."
    return generated_text
```
## Best Practices

- Use system messages to set consistent behavior
- Implement retry logic with exponential backoff (see the sketch after this list)
- Cache responses for identical prompts (also covered in the sketch below)
- Monitor token usage and costs
- Set an appropriate temperature (0-0.3 for factual tasks, 0.7-1.0 for creative ones)
- Implement content safety checks
- Use streaming for better UX
- Version prompts alongside code
- Test with diverse inputs
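A minimal sketch of the retry and caching bullets, assuming the `client` from the setup section; the in-memory `_cache` dict and helper names are illustrative, not a library API:

```python
import hashlib
import json
import time

from openai import RateLimitError

_cache = {}  # simple in-memory cache; use Redis or similar in production

def completion_with_retry(messages, model="gpt-4", max_retries=5):
    """Call the chat API, backing off exponentially on rate-limit errors."""
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except RateLimitError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...

def cached_completion(messages, model="gpt-4"):
    """Reuse the response for an identical (model, messages) pair."""
    key = hashlib.sha256(json.dumps([model, messages], sort_keys=True).encode()).hexdigest()
    if key not in _cache:
        _cache[key] = completion_with_retry(messages, model=model)
    return _cache[key]
```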
## Troubleshooting

| Issue | Cause | Resolution |
|---|---|---|
| Rate limit errors | Too many requests | Implement retry with backoff |
| High costs | Inefficient prompts | Optimize token usage; use cheaper models |
| Inconsistent outputs | High temperature | Lower temperature; use structured output |
| Context overflow | Long conversations | Summarize the conversation (see the sketch below) |
| Hallucinations | Lack of grounding | Use RAG; add verification steps |
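For the context-overflow row, here is a minimal summarization sketch. It reuses the `client` and `count_tokens` helpers defined earlier; the 4-message tail it keeps verbatim and the 3000-token threshold are arbitrary choices:

```python
import json

def compact_history(history, max_tokens=3000):
    """Replace older turns with a summary once the history grows too large."""
    total = sum(count_tokens(m["content"]) for m in history)
    if total <= max_tokens or len(history) <= 5:
        return history
    # Keep the system message and the last four messages verbatim
    system, older, recent = history[0], history[1:-4], history[-4:]
    summary = client.chat.completions.create(
        model="gpt-4",
        messages=[{
            "role": "user",
            "content": "Summarize this conversation in one short paragraph:\n"
                       + json.dumps(older),
        }],
        temperature=0,
    ).choices[0].message.content
    return [
        system,
        {"role": "system", "content": f"Summary of earlier turns: {summary}"},
    ] + recent
```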
## Key Takeaways

Building with generative AI requires prompt engineering, cost management, safety controls, and thoughtful integration patterns for reliable production applications.