Advanced AI Topics: Multi-Modal Models and Emerging Capabilities
Introduction
Advanced AI systems combine vision, language, and audio for sophisticated applications. This guide covers multi-modal models, image generation, speech processing, embeddings, and emerging capabilities.
GPT-4 Vision (GPT-4V)
Image Understanding
```python
from openai import AzureOpenAI
import base64

client = AzureOpenAI(
    api_key="<key>",
    api_version="2024-02-15-preview",
    azure_endpoint="<endpoint>"
)

def encode_image(image_path):
    """Read an image file and return its contents base64-encoded."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def analyze_image(image_path, question):
    """Ask a vision-capable deployment a question about an image."""
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4-vision",  # your Azure deployment name for a vision-capable model
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=500
    )
    return response.choices[0].message.content

# Example usage
result = analyze_image("chart.png", "Explain the trends shown in this chart")
print(result)
```
Document Analysis
```python
import json

def extract_table_data(image_path):
    """Extract structured data from table images."""
    prompt = """
    Analyze this table image and extract the data in JSON format.
    Structure: {"columns": [...], "rows": [[...], [...], ...]}
    """
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4-vision",  # your vision-capable deployment name
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ]
    )
    return json.loads(response.choices[0].message.content)
```
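Models sometimes wrap JSON answers in markdown code fences, which would make the `json.loads` call above raise an error. A small defensive parser is worth having; this is a sketch, and the helper name `parse_json_response` is ours rather than part of any SDK:

```python
import json
import re

def parse_json_response(raw: str):
    """Strip optional markdown code fences before parsing model output."""
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL)
    payload = match.group(1) if match else raw
    return json.loads(payload)
```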
DALL-E Image Generation
```python
def generate_image(prompt, size="1024x1024", quality="standard", n=1):
    """Generate images from text descriptions."""
    response = client.images.generate(
        model="dall-e-3",  # your DALL-E 3 deployment name
        prompt=prompt,
        size=size,
        quality=quality,
        n=n  # note: dall-e-3 currently accepts only n=1
    )
    image_url = response.data[0].url
    revised_prompt = response.data[0].revised_prompt
    return {
        "url": image_url,
        "revised_prompt": revised_prompt
    }

# Example
result = generate_image(
    "A futuristic cityscape with flying cars and holographic billboards, cyberpunk style"
)
```
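The URL returned for a generated image is short-lived, so applications usually download the file immediately. A minimal sketch using the third-party `requests` package (an assumed extra dependency):

```python
import requests

def save_generated_image(url: str, path: str = "generated.png") -> str:
    """Download a generated image from its short-lived URL to disk."""
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    with open(path, "wb") as f:
        f.write(resp.content)
    return path

# Example: save_generated_image(result["url"], "cityscape.png")
```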
Whisper Speech Recognition
```python
import azure.cognitiveservices.speech as speechsdk

def transcribe_audio(audio_file_path):
    """Convert speech to text with the Azure Speech SDK."""
    speech_config = speechsdk.SpeechConfig(
        subscription="<key>",
        region="<region>"
    )
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    result = recognizer.recognize_once()
    return result.text

# Or use Azure OpenAI Whisper
def transcribe_with_whisper(audio_file_path):
    """Convert speech to text with an Azure OpenAI Whisper deployment."""
    with open(audio_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper",  # your Whisper deployment name
            file=audio_file
        )
    return transcript.text
```
Text-to-Speech Generation
```python
def generate_speech(text, output_file="output.wav", voice="en-US-JennyNeural"):
    """Convert text to natural speech."""
    speech_config = speechsdk.SpeechConfig(subscription="<key>", region="<region>")
    speech_config.speech_synthesis_voice_name = voice
    # Synthesis writes audio out, so it needs an output config, not an input one
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async(text).get()
    return result.audio_duration  # duration of the synthesized audio (recent SDK versions)
```
Embeddings for Semantic Search
```python
from sklearn.metrics.pairwise import cosine_similarity

def get_embedding(text, model="text-embedding-ada-002"):
    """Generate a vector embedding for text."""
    response = client.embeddings.create(
        input=text,
        model=model  # your embedding deployment name
    )
    return response.data[0].embedding

def semantic_search(query, documents):
    """Find the most relevant documents using embeddings."""
    query_embedding = get_embedding(query)
    doc_embeddings = [get_embedding(doc) for doc in documents]
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
    results = sorted(
        zip(documents, similarities),
        key=lambda x: x[1],
        reverse=True
    )
    return results

# Example
docs = [
    "Azure Machine Learning provides MLOps capabilities",
    "Python is a popular programming language",
    "Cloud computing enables scalable infrastructure"
]
results = semantic_search("How to deploy ML models?", docs)
for doc, score in results:
    print(f"{score:.3f}: {doc}")
```
Fine-Tuning Custom Models
```python
import json

def prepare_training_data(examples):
    """Format training data for fine-tuning."""
    training_data = []
    for example in examples:
        training_data.append({
            "messages": [
                {"role": "system", "content": "You are a customer support assistant."},
                {"role": "user", "content": example["input"]},
                {"role": "assistant", "content": example["output"]}
            ]
        })
    with open("training_data.jsonl", "w") as f:
        for item in training_data:
            f.write(json.dumps(item) + "\n")

def create_fine_tune_job(training_file_id):
    """Submit a fine-tuning job."""
    response = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        model="gpt-35-turbo",  # base model; fine-tuning availability varies by region
        hyperparameters={
            "n_epochs": 3,
            "batch_size": 1,
            "learning_rate_multiplier": 0.1
        }
    )
    return response.id

# Upload training file
with open("training_data.jsonl", "rb") as f:
    file_response = client.files.create(file=f, purpose="fine-tune")

# Create fine-tune job
job_id = create_fine_tune_job(file_response.id)
```
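Fine-tuning jobs run asynchronously, so in practice you poll the job until it reaches a terminal status before deploying the resulting model. A minimal polling loop (the 30-second interval is an arbitrary choice):

```python
import time

def wait_for_fine_tune(job_id, poll_seconds=30):
    """Poll a fine-tuning job until it succeeds, fails, or is cancelled."""
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        if job.status in ("succeeded", "failed", "cancelled"):
            return job
        time.sleep(poll_seconds)

# job = wait_for_fine_tune(job_id)
# print(job.status, job.fine_tuned_model)
```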
Multi-Agent Orchestration
```python
class AgentOrchestrator:
    """Coordinate multiple specialized agents."""

    def __init__(self):
        self.agents = {
            "researcher": self._create_agent("Research specialist"),
            "coder": self._create_agent("Expert programmer"),
            "writer": self._create_agent("Technical writer")
        }

    def _create_agent(self, role):
        return lambda prompt: client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"You are a {role}."},
                {"role": "user", "content": prompt}
            ]
        ).choices[0].message.content

    def solve_complex_task(self, task):
        """Break down a task and delegate to specialized agents."""
        # Research phase
        research = self.agents["researcher"](
            f"Research this topic: {task}"
        )
        # Code implementation
        code = self.agents["coder"](
            f"Based on this research, implement a solution:\n{research}"
        )
        # Documentation
        docs = self.agents["writer"](
            f"Document this code:\n{code}"
        )
        return {
            "research": research,
            "implementation": code,
            "documentation": docs
        }
```
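Usage is then a single call, with each stage's output feeding the next (the task string here is illustrative):

```python
orchestrator = AgentOrchestrator()
outcome = orchestrator.solve_complex_task("Design a rate limiter for a REST API")
print(outcome["documentation"])
```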
Emerging Capabilities
Chain-of-Thought Reasoning
```python
def chain_of_thought_reasoning(problem):
    """Use step-by-step reasoning for complex problems."""
    prompt = f"""
    Solve this problem step by step. Show your reasoning at each step.

    Problem: {problem}

    Step 1: Understand the problem
    Step 2: Identify relevant information
    Step 3: Break down into sub-problems
    Step 4: Solve each sub-problem
    Step 5: Combine solutions

    Final Answer:
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1  # low temperature keeps the reasoning focused
    )
    return response.choices[0].message.content
```
Tree-of-Thought
```python
def tree_of_thought(problem, num_branches=3):
    """Explore multiple reasoning paths, then pick the strongest one."""
    branches = []
    for i in range(num_branches):
        prompt = f"""
        Generate reasoning path #{i+1} for this problem:
        {problem}
        Think creatively and explore different approaches.
        """
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8  # higher temperature encourages diverse paths
        )
        branches.append(response.choices[0].message.content)

    # Evaluate branches
    evaluation_prompt = f"""
    Problem: {problem}

    Reasoning paths:
    {chr(10).join([f"{i+1}. {branch}" for i, branch in enumerate(branches)])}

    Which reasoning path is most sound? Explain and provide the final answer.
    """
    final_response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": evaluation_prompt}]
    )
    return final_response.choices[0].message.content
```
Best Practices
- Combine modalities for richer applications
- Validate generated content before use
- Implement fallback mechanisms
- Monitor usage and costs across services
- Cache embeddings for repeated queries (a minimal cache sketch follows this list)
- Use appropriate models for each task
- Test extensively with edge cases
- Implement safety filters for all outputs
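As an example of the caching advice above, here is a minimal in-memory embedding cache. This is a sketch only (production systems typically persist embeddings to a vector store), and `cached_embedding` is our own helper name:

```python
_embedding_cache: dict = {}

def cached_embedding(text, model="text-embedding-ada-002"):
    """Return a cached embedding if present; otherwise call the API once."""
    key = (model, text)
    if key not in _embedding_cache:
        _embedding_cache[key] = get_embedding(text, model=model)
    return _embedding_cache[key]
```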
Future Trends
- Multimodal Foundation Models: Unified models for all modalities
- Autonomous Agents: Self-directed task completion
- Continuous Learning: Models that adapt without retraining
- Smaller, Efficient Models: Edge deployment capabilities
- Enhanced Reasoning: Improved logical and mathematical capabilities
Troubleshooting
| Issue | Cause | Resolution |
|---|---|---|
| Vision API errors | Unsupported image format | Convert to JPEG/PNG; check size limits |
| Poor image quality | Low resolution input | Use higher resolution; enhance preprocessing |
| Embedding drift | Model version change | Re-embed all documents with same model |
| High latency | Large multi-modal inputs | Compress images; optimize requests |
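For the first two rows of the table, a small preprocessing step often resolves the issue. A sketch using the Pillow library (an assumed dependency; exact size limits depend on the service and tier):

```python
from PIL import Image

def prepare_image(path, out_path="prepared.jpg", max_side=2048):
    """Convert an image to JPEG and cap its longest side before upload."""
    img = Image.open(path).convert("RGB")  # drop alpha channel for JPEG
    img.thumbnail((max_side, max_side))    # resize in place, preserving aspect ratio
    img.save(out_path, "JPEG", quality=90)
    return out_path
```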
Key Takeaways
Advanced AI combines multiple modalities, specialized models, and sophisticated orchestration patterns to solve complex, real-world problems.