Advanced AI Topics: Multi-Modal Models and Emerging Capabilities

Introduction

Advanced AI systems combine vision, language, and audio for sophisticated applications. This guide covers multi-modal models, image generation, speech processing, embeddings, and emerging capabilities.

GPT-4 Vision (GPT-4V)

Image Understanding

import base64
import json

from openai import AzureOpenAI

# Shared Azure OpenAI client used by every snippet below.
# NOTE(review): "<key>"/"<endpoint>" are placeholders — substitute real
# values (ideally from environment variables) before running.
client = AzureOpenAI(
    api_key="<key>",
    api_version="2024-02-15-preview",
    azure_endpoint="<endpoint>"
)

def encode_image(image_path):
    """Read an image file and return its contents as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def analyze_image(image_path, question):
    """Ask the vision model a free-form question about a local image.

    The image is inlined as a base64 data URL; returns the model's
    text answer.
    """
    encoded = encode_image(image_path)
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": question},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded}"
                },
            },
        ],
    }

    completion = client.chat.completions.create(
        model="gpt-4-vision",
        messages=[user_message],
        max_tokens=500,
    )
    return completion.choices[0].message.content

# Example usage
# NOTE(review): performs a network call; requires a configured client,
# a "gpt-4-vision" deployment, and a local "chart.png".
result = analyze_image("chart.png", "Explain the trends shown in this chart")
print(result)

Document Analysis

def extract_table_data(image_path):
    """Extract structured data from table images.

    Sends the image to the vision model and parses the reply as JSON.

    Args:
        image_path: path to an image file containing a table.

    Returns:
        The parsed JSON object (expected shape: {"columns": [...],
        "rows": [[...], ...]}, per the prompt — the model is not
        guaranteed to comply).

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON even after
            stripping a Markdown code fence.
    """
    prompt = """
Analyze this table image and extract the data in JSON format.
Structure: {"columns": [...], "rows": [[...], [...], ...]}
"""

    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4-vision",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ]
    )

    raw = response.choices[0].message.content.strip()
    # Chat models frequently wrap JSON answers in a ```json ... ``` fence;
    # json.loads would choke on that, so strip the fence first.
    if raw.startswith("```"):
        raw = raw.split("\n", 1)[1] if "\n" in raw else ""
        raw = raw.rsplit("```", 1)[0]
    return json.loads(raw)

DALL-E Image Generation

def generate_image(prompt, size="1024x1024", quality="standard", n=1):
    """Generate images from a text description with DALL-E 3.

    Returns a dict holding the first image's URL and the revised prompt
    DALL-E actually used for generation.
    """
    response = client.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        quality=quality,
        n=n,
    )
    first = response.data[0]
    return {
        "url": first.url,
        "revised_prompt": first.revised_prompt,
    }

# Example
# NOTE(review): performs a network call; requires a configured client
# and a "dall-e-3" deployment.
result = generate_image(
    "A futuristic cityscape with flying cars and holographic billboards, cyberpunk style"
)

Whisper Speech Recognition

from azure.cognitiveservices.speech import SpeechConfig, AudioConfig, SpeechRecognizer

def transcribe_audio(audio_file_path):
    """Convert speech in an audio file to text with the Azure Speech SDK.

    Uses a single-shot recognition (`recognize_once`), so only one
    utterance is captured per call.
    """
    config = SpeechConfig(
        subscription="<key>",
        region="<region>"
    )
    audio = AudioConfig(filename=audio_file_path)
    recognizer = SpeechRecognizer(speech_config=config, audio_config=audio)
    return recognizer.recognize_once().text

# Or use Azure OpenAI Whisper
def transcribe_with_whisper(audio_file_path):
    """Transcribe an audio file via the Azure OpenAI Whisper deployment."""
    with open(audio_file_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            model="whisper",
            file=audio_file,
        )
    return result.text

Text-to-Speech Generation

from azure.cognitiveservices.speech import SpeechSynthesizer

def generate_speech(text, output_file="output.wav", voice="en-US-JennyNeural"):
    """Synthesize *text* to an audio file and return the audio duration."""
    config = SpeechConfig(subscription="<key>", region="<region>")
    config.speech_synthesis_voice_name = voice

    out_audio = AudioConfig(filename=output_file)
    synthesizer = SpeechSynthesizer(speech_config=config, audio_config=out_audio)

    # speak_text_async returns a future; .get() blocks until synthesis ends.
    outcome = synthesizer.speak_text_async(text).get()
    return outcome.audio_duration

Embeddings for Semantic Search

from openai import AzureOpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for *text* from the embeddings API."""
    result = client.embeddings.create(model=model, input=text)
    return result.data[0].embedding

def semantic_search(query, documents):
    """Rank *documents* by cosine similarity to *query*.

    Returns (document, score) pairs sorted most-similar first.
    NOTE: makes one embedding API call per document plus one for the
    query on every invocation — cache embeddings for repeated searches.
    """
    query_vec = get_embedding(query)
    doc_vecs = [get_embedding(text) for text in documents]

    scores = cosine_similarity([query_vec], doc_vecs)[0]

    ranked = sorted(
        zip(documents, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

# Example
# NOTE(review): embeds the query and every document — network calls
# against the embeddings deployment when executed.
docs = [
    "Azure Machine Learning provides MLOps capabilities",
    "Python is a popular programming language",
    "Cloud computing enables scalable infrastructure"
]

results = semantic_search("How to deploy ML models?", docs)
for doc, score in results:
    print(f"{score:.3f}: {doc}")

Fine-Tuning Custom Models

from openai import AzureOpenAI

def prepare_training_data(examples, output_path="training_data.jsonl",
                          system_prompt="You are a customer support assistant."):
    """Write chat-format fine-tuning data to a JSONL file.

    Args:
        examples: iterable of dicts with "input" and "output" keys.
        output_path: destination JSONL file. Default keeps the original
            hard-coded name so the upload step below still works.
        system_prompt: system message prepended to every example.

    Returns:
        The list of chat records that were written.
    """
    training_data = [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["input"]},
                {"role": "assistant", "content": example["output"]},
            ]
        }
        for example in examples
    ]

    # JSONL format: exactly one JSON object per line.
    with open(output_path, "w") as f:
        for record in training_data:
            f.write(json.dumps(record) + "\n")

    return training_data

def create_fine_tune_job(training_file_id):
    """Submit a fine-tuning job for an uploaded training file.

    Returns the job id (can be polled later for status).
    """
    hyperparams = {
        "n_epochs": 3,
        "batch_size": 1,
        "learning_rate_multiplier": 0.1,
    }
    job = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        model="gpt-35-turbo",
        hyperparameters=hyperparams,
    )
    return job.id

# Upload training file
# NOTE(review): expects "training_data.jsonl" produced by
# prepare_training_data(); both steps hit the API when executed.
with open("training_data.jsonl", "rb") as f:
    file_response = client.files.create(file=f, purpose="fine-tune")

# Create fine-tune job
job_id = create_fine_tune_job(file_response.id)

Multi-Agent Orchestration

class AgentOrchestrator:
    """Coordinate multiple specialized agents."""

    def __init__(self):
        # Each agent is a callable: prompt -> model answer in its persona.
        roles = {
            "researcher": "Research specialist",
            "coder": "Expert programmer",
            "writer": "Technical writer",
        }
        self.agents = {
            name: self._create_agent(role) for name, role in roles.items()
        }

    def _create_agent(self, role):
        """Build a prompt->answer callable bound to the given persona."""
        def ask(prompt):
            completion = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": f"You are a {role}."},
                    {"role": "user", "content": prompt},
                ],
            )
            return completion.choices[0].message.content

        return ask

    def solve_complex_task(self, task):
        """Break down task and delegate to specialized agents.

        Pipeline: research the topic, implement from the research,
        then document the implementation.
        """
        research = self.agents["researcher"](f"Research this topic: {task}")

        code = self.agents["coder"](
            f"Based on this research, implement a solution:\n{research}"
        )

        docs = self.agents["writer"](f"Document this code:\n{code}")

        return {
            "research": research,
            "implementation": code,
            "documentation": docs,
        }

Emerging Capabilities

Chain-of-Thought Reasoning

def chain_of_thought_reasoning(problem):
    """Solve a problem by prompting for explicit step-by-step reasoning.

    A low temperature keeps the reasoning focused and repeatable.
    """
    prompt = f"""
Solve this problem step by step. Show your reasoning at each step.

Problem: {problem}

Step 1: Understand the problem
Step 2: Identify relevant information
Step 3: Break down into sub-problems
Step 4: Solve each sub-problem
Step 5: Combine solutions
Final Answer:
"""
    completion = client.chat.completions.create(
        model="gpt-4",
        temperature=0.1,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content

Tree-of-Thought

def tree_of_thought(problem, num_branches=3):
    """Explore several independent reasoning paths, then pick the best.

    A high temperature diversifies the candidate branches; a final call
    judges them and produces the answer.
    """
    branches = []
    for i in range(num_branches):
        prompt = f"""
Generate reasoning path #{i+1} for this problem:
{problem}

Think creatively and explore different approaches.
"""
        completion = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
        )
        branches.append(completion.choices[0].message.content)

    # Present all branches, numbered, and ask the model to adjudicate.
    numbered = "\n".join(f"{idx+1}. {text}" for idx, text in enumerate(branches))
    evaluation_prompt = f"""
Problem: {problem}

Reasoning paths:
{numbered}

Which reasoning path is most sound? Explain and provide final answer.
"""
    verdict = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": evaluation_prompt}],
    )
    return verdict.choices[0].message.content

Best Practices

  • Combine modalities for richer applications
  • Validate generated content before use
  • Implement fallback mechanisms
  • Monitor usage and costs across services
  • Cache embeddings for repeated queries
  • Use appropriate models for each task
  • Test extensively with edge cases
  • Implement safety filters for all outputs

Future Trends

  • Multimodal Foundation Models: Unified models for all modalities
  • Autonomous Agents: Self-directed task completion
  • Continuous Learning: Models that adapt without retraining
  • Smaller, Efficient Models: Edge deployment capabilities
  • Enhanced Reasoning: Improved logical and mathematical capabilities

Troubleshooting

| Issue | Cause | Resolution |
|---|---|---|
| Vision API errors | Unsupported image format | Convert to JPEG/PNG; check size limits |
| Poor image quality | Low resolution input | Use higher resolution; enhance preprocessing |
| Embedding drift | Model version change | Re-embed all documents with same model |
| High latency | Large multi-modal inputs | Compress images; optimize requests |

Key Takeaways

Advanced AI combines multiple modalities, specialized models, and sophisticated orchestration patterns to solve complex, real-world problems.

References