ExLLM

A unified Elixir client for Large Language Models with integrated cost tracking, providing a consistent interface across multiple LLM providers.

⚠️ Alpha Quality Software: This library is in early development. APIs may change without notice until version 1.0.0 is released. Use in production at your own risk.

What's New in v0.4.1

Features

Supported Providers

Installation

Add ex_llm to your list of dependencies in mix.exs:

def deps do
[
{:ex_llm, "~> 0.4.1"},
# Included dependencies (no need to add these manually):
# - {:instructor, "~> 0.1.0"} - For structured outputs
# - {:bumblebee, "~> 0.5"} - For local model support
# - {:nx, "~> 0.7"} - For numerical computing
# Optional hardware acceleration backends (choose one):
{:exla, "~> 0.7", optional: true},
# Optional: For Apple Silicon Metal acceleration
# (not included in Hex package, add manually if needed)
{:emlx, github: "elixir-nx/emlx", branch: "main", optional: true}
]
end

Quick Start

📚 Quick Start Guide - Get up and running in 5 minutes
📖 User Guide - Comprehensive documentation of all features

Configuration

Configure your LLM providers in config/config.exs:

config :ex_llm,
anthropic: [
api_key: System.get_env("ANTHROPIC_API_KEY"),
base_url: "https://api.anthropic.com"
],
openai: [
api_key: System.get_env("OPENAI_API_KEY"),
base_url: "https://api.openai.com"
],
xai: [
api_key: System.get_env("XAI_API_KEY"),
base_url: "https://api.x.ai"
],
groq: [
api_key: System.get_env("GROQ_API_KEY"),
base_url: "https://api.groq.com"
],
mistral: [
api_key: System.get_env("MISTRAL_API_KEY"),
base_url: "https://api.mistral.ai"
],
perplexity: [
api_key: System.get_env("PERPLEXITY_API_KEY"),
base_url: "https://api.perplexity.ai"
],
ollama: [
base_url: "http://localhost:11434"
],
lmstudio: [
base_url: "http://localhost:1234"
],
bedrock: [
# AWS credentials (optional - uses credential chain by default)
access_key_id: System.get_env("AWS_ACCESS_KEY_ID"),
secret_access_key: System.get_env("AWS_SECRET_ACCESS_KEY"),
region: System.get_env("AWS_REGION") || "us-east-1",
model: "nova-lite" # Default model (cost-effective)
],
gemini: [
api_key: System.get_env("GEMINI_API_KEY"),
base_url: "https://generativelanguage.googleapis.com"
],
openrouter: [
api_key: System.get_env("OPENROUTER_API_KEY"),
base_url: "https://openrouter.ai/api/v1"
]

Basic Usage

# Simple chat completion with automatic cost tracking
messages = [
%{role: "user", content: "Hello, how are you?"}
]
{:ok, response} = ExLLM.chat(:anthropic, messages)
IO.puts(response.content)
IO.puts("Cost: #{ExLLM.format_cost(response.cost.total_cost)}")
# Using Bumblebee for local models (no API costs!)
{:ok, response} = ExLLM.chat(:bumblebee, messages, model: "microsoft/phi-4")
IO.puts(response.content)
# Using LM Studio (local server)
{:ok, response} = ExLLM.chat(:lmstudio, messages)
IO.puts(response.content)
# Using Groq for ultra-fast inference
{:ok, response} = ExLLM.chat(:groq, messages, model: "deepseek-r1-distill-llama-70b")
IO.puts(response.content)
# Using Mistral AI
{:ok, response} = ExLLM.chat(:mistral, messages, model: "mistral-large-latest")
IO.puts(response.content)
# Using Perplexity for search-enhanced responses
{:ok, response} = ExLLM.chat(:perplexity, messages, model: "sonar-reasoning")
IO.puts(response.content)
# Using OpenRouter for access to many models
{:ok, response} = ExLLM.chat(:openrouter, messages, model: "openai/gpt-4o-mini")
IO.puts(response.content)
# Streaming chat with error recovery.
# The options are wrapped in an explicit list: bare keyword arguments are only
# valid as the last argument of a call, and here the callback comes after them.
ExLLM.stream_chat(:anthropic, messages, [stream_recovery: true], fn chunk ->
  IO.write(chunk.content)
end)
# Using mock adapter for testing
{:ok, response} = ExLLM.chat(:mock, messages,
mock_response: "This is a test response"
)
# Estimate tokens before making a request
tokens = ExLLM.estimate_tokens(messages)
IO.puts("Estimated tokens: #{tokens}")
# Calculate cost for specific usage
usage = %{input_tokens: 1000, output_tokens: 500}
cost = ExLLM.calculate_cost(:openai, "gpt-4", usage)
IO.puts("Total cost: #{ExLLM.format_cost(cost.total_cost)}")

Advanced Usage

# With custom options
options = [
model: "claude-3-5-sonnet-20241022",
max_tokens: 1000,
temperature: 0.7,
retry_count: 3, # Automatic retry with exponential backoff
retry_delay: 1000 # Initial retry delay in ms
]
{:ok, response} = ExLLM.chat(:anthropic, messages, options)
# Function calling
functions = [
%{
name: "get_weather",
description: "Get the current weather for a location",
parameters: %{
type: "object",
properties: %{
location: %{type: "string", description: "City, State or Country"},
unit: %{type: "string", enum: ["celsius", "fahrenheit"], description: "Temperature unit"}
},
required: ["location"]
}
}
]
{:ok, response} = ExLLM.chat(:anthropic,
[%{role: "user", content: "What's the weather in Paris, France?"}],
functions: functions
)
# Parse and execute function calls
case ExLLM.parse_function_calls(response) do
  {:ok, [call | _]} ->
    # Execute the function
    result = get_weather(call.arguments.location, call.arguments[:unit] || "celsius")
    # Format the result for the conversation
    function_message = ExLLM.format_function_result(call.name, result)

  :none ->
    # No function calls in response. An explicit value is given because a
    # `->` clause without an expression triggers a compiler warning.
    nil
end
# Model discovery and recommendations
{:ok, models} = ExLLM.list_models(:anthropic)
Enum.each(models, &IO.puts(&1.name))
# Find models with specific capabilities
vision_models = ExLLM.find_models_with_features([:vision])
function_models = ExLLM.find_models_with_features([:function_calling, :streaming])
# Get model recommendations
recommended = ExLLM.recommend_models(%{
provider: :anthropic,
min_context_window: 100_000,
required_features: [:function_calling],
preferred_features: [:vision],
max_cost_per_million_tokens: 15.0
})
# Compare models
comparison = ExLLM.compare_models([
{:anthropic, "claude-3-5-sonnet-20241022"},
{:openai, "gpt-4-turbo"},
{:gemini, "gemini-pro"}
])
# Provider capabilities - find providers by features
{:ok, caps} = ExLLM.get_provider_capabilities(:openai)
IO.puts("Endpoints: #{Enum.join(caps.endpoints, ", ")}")
# => "Endpoints: chat, embeddings, images, audio, completions, fine_tuning, files"
# Find providers with specific features
providers = ExLLM.find_providers_with_features([:embeddings, :streaming])
# => [:openai, :ollama]
# Get provider recommendations
recommendations = ExLLM.recommend_providers(%{
required_features: [:vision, :streaming],
preferred_features: [:audio_input, :function_calling],
prefer_local: false
})
# => [
# %{provider: :openai, score: 0.95, matched_features: [...], missing_features: []},
# %{provider: :anthropic, score: 0.80, matched_features: [...], missing_features: [:audio_input]}
# ]
# Context management - automatically truncate long conversations
long_conversation = [
%{role: "system", content: "You are a helpful assistant."},
# ... many messages ...
%{role: "user", content: "What's the weather?"}
]
# Automatically truncates to fit model's context window
{:ok, response} = ExLLM.chat(:anthropic, long_conversation,
max_tokens: 4000, # Max tokens for context
strategy: :smart # Preserve system messages and recent context
)

Session Management

# Create a new conversation session
session = ExLLM.new_session(:anthropic, name: "Customer Support")
# Chat with automatic session tracking
{:ok, {response, session}} = ExLLM.chat_with_session(session, "Hello!")
IO.puts(response.content)
# Continue the conversation
{:ok, {response, session}} = ExLLM.chat_with_session(session, "What can you help me with?")
# Session automatically tracks:
# - Message history
# - Token usage
# - Conversation context
# Review session details
messages = ExLLM.get_session_messages(session)
total_tokens = ExLLM.session_token_usage(session)
IO.puts("Total tokens used: #{total_tokens}")
# Save session for later
{:ok, json} = ExLLM.save_session(session)
File.write!("session.json", json)
# Load session later
{:ok, session} = ExLLM.load_session(File.read!("session.json"))

API Reference

Core Functions

Session Functions

Function Calling

Model Capabilities

Provider Capabilities

Capability Normalization

ExLLM automatically normalizes different capability names used by various providers. This means you can use provider-specific terminology and ExLLM will understand it:

# These all refer to the same capability (function calling)
ExLLM.provider_supports?(:openai, :function_calling) # => true
ExLLM.provider_supports?(:anthropic, :tool_use) # => true
ExLLM.provider_supports?(:openai, :tools) # => true
# Find providers using any terminology
ExLLM.find_providers_with_features([:tool_use]) # Works!
ExLLM.find_providers_with_features([:function_calling]) # Also works!

Common normalizations:

Error Recovery

Data Structures

LLMResponse

%ExLLM.Types.LLMResponse{
content: "Hello! I'm doing well, thank you for asking.",
usage: %{input_tokens: 12, output_tokens: 15},
model: "claude-3-5-sonnet-20241022",
finish_reason: "end_turn",
cost: %{
total_cost: 0.000261,
input_cost: 0.000036,
output_cost: 0.000225,
currency: "USD"
}
}

StreamChunk

%ExLLM.Types.StreamChunk{
content: "Hello",
delta: true,
finish_reason: nil
}

Model

%ExLLM.Types.Model{
name: "claude-3-5-sonnet-20241022",
provider: :anthropic,
context_length: 200000,
supports_streaming: true
}

Model Configuration

ExLLM uses external YAML configuration files for model metadata, pricing, and capabilities. This allows easy updates without code changes:

External Configuration Structure

# config/models/anthropic.yml
provider: anthropic
default_model: "claude-sonnet-4-20250514"
models:
claude-3-5-sonnet-20241022:
context_window: 200000
pricing:
input: 3.00 # per 1M tokens
output: 15.00
capabilities:
- streaming
- function_calling
- vision

Configuration Management

# Get model pricing
pricing = ExLLM.ModelConfig.get_pricing(:anthropic, "claude-3-5-sonnet-20241022")
# Get context window
context = ExLLM.ModelConfig.get_context_window(:openai, "gpt-4o")
# Get default model for provider
default = ExLLM.ModelConfig.get_default_model(:openrouter)
# Configuration is cached for performance
# Updates require restart or cache refresh

Cost Tracking

ExLLM automatically tracks costs for all API calls using the external pricing configuration:

Automatic Cost Calculation

{:ok, response} = ExLLM.chat(:anthropic, messages)
# Access cost information
if response.cost do
IO.puts("Input tokens: #{response.cost.input_tokens}")
IO.puts("Output tokens: #{response.cost.output_tokens}")
IO.puts("Total cost: #{ExLLM.format_cost(response.cost.total_cost)}")
end

Token Estimation

# Estimate tokens before making a request
messages = [
%{role: "system", content: "You are a helpful assistant."},
%{role: "user", content: "Explain quantum computing in simple terms."}
]
estimated_tokens = ExLLM.estimate_tokens(messages)
# Use this to predict costs before making the actual API call

Cost Comparison

# Compare costs across different providers
usage = %{input_tokens: 1000, output_tokens: 2000}
providers = [
{:openai, "gpt-4"},
{:openai, "gpt-3.5-turbo"},
{:anthropic, "claude-3-5-sonnet-20241022"},
{:anthropic, "claude-3-haiku-20240307"}
]
Enum.each(providers, fn {provider, model} ->
cost = ExLLM.calculate_cost(provider, model, usage)
unless cost[:error] do
IO.puts("#{provider}/#{model}: #{ExLLM.format_cost(cost.total_cost)}")
end
end)

Supported Pricing

ExLLM includes pricing data (as of June 2025) in external YAML files for all supported providers:

Pricing data is stored in config/models/*.yml files and can be updated independently of code changes.

Context Management

ExLLM automatically manages context windows to ensure your messages fit within model limits:

Automatic Context Truncation

# Long conversation that might exceed context window
messages = [
%{role: "system", content: "You are a helpful assistant."},
# ... hundreds of messages ...
%{role: "user", content: "What's my current task?"}
]
# ExLLM automatically truncates to fit the model's context window
{:ok, response} = ExLLM.chat(:anthropic, messages)

Context Window Validation

# Check if messages fit within context window
case ExLLM.validate_context(messages, model: "gpt-3.5-turbo") do
{:ok, token_count} ->
IO.puts("Messages use #{token_count} tokens")
{:error, {:context_too_large, %{tokens: tokens, max_tokens: max}}} ->
IO.puts("Messages too large: #{tokens} tokens (max: #{max})")
end

Context Strategies

# Sliding window (default) - keeps most recent messages
{:ok, response} = ExLLM.chat(:anthropic, messages,
max_tokens: 4000,
strategy: :sliding_window
)
# Smart strategy - preserves system messages and recent context
{:ok, response} = ExLLM.chat(:anthropic, messages,
max_tokens: 4000,
strategy: :smart,
preserve_messages: 10 # Always keep last 10 messages
)

Context Statistics

# Get detailed statistics about your messages
stats = ExLLM.context_stats(messages)
IO.inspect(stats)
# %{
# message_count: 150,
# total_tokens: 45000,
# by_role: %{"system" => 1, "user" => 75, "assistant" => 74},
# avg_tokens_per_message: 300
# }
# Check context window sizes
IO.puts(ExLLM.context_window_size(:anthropic, "claude-3-5-sonnet-20241022"))
# => 200000

Session Management

ExLLM includes built-in session management for maintaining conversation state:

Creating and Using Sessions

# Create a new session
session = ExLLM.new_session(:anthropic, name: "My Chat")
# Chat with automatic session tracking
{:ok, {response, updated_session}} = ExLLM.chat_with_session(session, "Hello!")
# Continue the conversation
{:ok, {response2, session2}} = ExLLM.chat_with_session(updated_session, "What's 2+2?")
# Access session messages
messages = ExLLM.get_session_messages(session2)
# => [%{role: "user", content: "Hello!"}, %{role: "assistant", content: "..."}, ...]

Session Persistence

# Save session to disk
{:ok, path} = ExLLM.save_session(session, "/path/to/sessions")
# Load session from disk
{:ok, loaded_session} = ExLLM.load_session("/path/to/sessions/session_id.json")
# Export session as markdown
{:ok, markdown} = ExLLM.export_session_markdown(session)
File.write!("conversation.md", markdown)

Session Information

# Get session metadata
info = ExLLM.session_info(session)
# => %{
# id: "123...",
# name: "My Chat",
# created_at: ~U[2025-01-24 10:00:00Z],
# message_count: 10,
# total_tokens: 1500
# }
# Get token usage for session
tokens = ExLLM.session_token_usage(session)
# => 1500
# Clear session messages
clean_session = ExLLM.clear_session(session)

Structured Outputs

ExLLM integrates with instructor_ex to provide structured output validation. This allows you to define expected response structures using Ecto schemas and automatically validate LLM responses.

Instructor is included as a dependency of ExLLM, so no additional installation is needed.

Basic Usage

# Define your schema
# Embedded Ecto schema describing the structured reply requested from the LLM:
# a spam / not-spam label, a confidence score and a free-text reason.
defmodule EmailClassification do
use Ecto.Schema
use Instructor.Validator
@llm_doc "Classification of an email as spam or not spam"
# Declare the embedded schema without a primary key field
@primary_key false
embedded_schema do
field :classification, Ecto.Enum, values: [:spam, :not_spam]
field :confidence, :float
field :reason, :string
end
# Changeset validation: all three fields are required and
# confidence must lie within 0.0..1.0 (both bounds inclusive).
@impl true
def validate_changeset(changeset) do
changeset
|> Ecto.Changeset.validate_required([:classification, :confidence, :reason])
|> Ecto.Changeset.validate_number(:confidence,
greater_than_or_equal_to: 0.0,
less_than_or_equal_to: 1.0
)
end
end
# Use with ExLLM
messages = [%{role: "user", content: "Is this spam? 'You won a million dollars!'"}]
{:ok, result} = ExLLM.chat(:anthropic, messages,
response_model: EmailClassification,
max_retries: 3 # Automatically retry on validation errors
)
IO.inspect(result)
# %EmailClassification{
# classification: :spam,
# confidence: 0.95,
# reason: "Classic lottery scam pattern"
# }

With Simple Type Specifications

# Define expected structure without Ecto
response_model = %{
name: :string,
age: :integer,
email: :string,
tags: {:array, :string}
}
messages = [%{role: "user", content: "Extract: John Doe, 30 years old, john@example.com, likes elixir and coding"}]
{:ok, result} = ExLLM.chat(:anthropic, messages,
response_model: response_model
)
IO.inspect(result)
# %{
# name: "John Doe",
# age: 30,
# email: "john@example.com",
# tags: ["elixir", "coding"]
# }

Advanced Example

# Embedded Ecto schema for a user profile extracted from free text,
# including a nested list of interests with a skill level each.
defmodule UserProfile do
  use Ecto.Schema
  use Instructor.Validator

  @llm_doc """
  User profile extraction from text.
  Extract all available information about the user.
  """

  # No primary key, consistent with the EmailClassification example: without
  # this the embedded schema carries an autogenerated `id` field that is not
  # part of the data being extracted.
  @primary_key false
  embedded_schema do
    field :name, :string
    field :email, :string
    field :age, :integer
    field :location, :string

    embeds_many :interests, Interest do
      field :name, :string
      field :level, Ecto.Enum, values: [:beginner, :intermediate, :expert]
    end
  end

  # Changeset validation: name is mandatory, email must contain "@",
  # and age must be strictly between 0 and 150.
  @impl true
  def validate_changeset(changeset) do
    changeset
    |> Ecto.Changeset.validate_required([:name])
    |> Ecto.Changeset.validate_format(:email, ~r/@/)
    |> Ecto.Changeset.validate_number(:age, greater_than: 0, less_than: 150)
  end
end
# Complex extraction with nested structures
text = """
Hi, I'm Jane Smith, a 28-year-old software engineer from Seattle.
You can reach me at jane.smith@tech.com. I'm an expert in Elixir,
intermediate in Python, and just starting to learn Rust.
"""
{:ok, profile} = ExLLM.chat(:anthropic,
[%{role: "user", content: "Extract user profile: #{text}"}],
response_model: UserProfile,
max_retries: 3
)

Using the Instructor Module Directly

# Direct usage of ExLLM.Instructor
{:ok, result} = ExLLM.Instructor.chat(:anthropic, messages,
response_model: EmailClassification,
max_retries: 3,
temperature: 0.1 # Lower temperature for more consistent structure
)
# Parse an existing response
{:ok, response} = ExLLM.chat(:anthropic, messages)
{:ok, structured} = ExLLM.Instructor.parse_response(response, UserProfile)
# Check if instructor is available
if ExLLM.Instructor.available?() do
# Use structured outputs
else
# Fall back to regular parsing
end

Supported Providers

Structured outputs work with providers that have instructor adapters:

Error Handling

case ExLLM.chat(:anthropic, messages, response_model: UserProfile) do
{:ok, profile} ->
# Successfully validated structure
IO.inspect(profile)
{:error, {:validation_failed, errors}} ->
# Validation failed after retries
IO.inspect(errors)
{:error, reason} ->
# Other error
IO.inspect(reason)
end

Configuration

ExLLM supports multiple configuration providers:

Environment Variables (Default)

# Uses ExLLM.ConfigProvider.Default
# Reads from application config and environment variables

Static Configuration

config = %{
anthropic: [
api_key: "your-api-key",
base_url: "https://api.anthropic.com"
]
}
ExLLM.set_config_provider({ExLLM.ConfigProvider.Static, config})

Logging

ExLLM provides a unified logging system with fine-grained control over what gets logged and how sensitive data is handled.

📖 Read the full Logger User Guide for detailed documentation.

# Quick example
alias ExLLM.Logger
Logger.info("Starting chat completion")
Logger.with_context(provider: :openai, operation: :chat) do
Logger.info("Sending request")
# ... make API call ...
Logger.info("Request completed", tokens: 150, duration_ms: 230)
end

Configure logging in your config/config.exs:

config :ex_llm,
log_level: :info,
log_components: %{
requests: true,
responses: true,
streaming: false, # Can be noisy
retries: true,
cache: false,
models: true
},
log_redaction: %{
api_keys: true, # Always recommended
content: false # Set true in production
}

Custom Configuration Provider

# Skeleton of a custom configuration provider: a module that declares the
# ExLLM.ConfigProvider behaviour and defines the two callbacks shown below.
defmodule MyConfigProvider do
@behaviour ExLLM.ConfigProvider
# Receives a provider and a key. The expected return value is defined by the
# behaviour and is not shown here - TODO confirm against ExLLM.ConfigProvider.
@impl true
def get_config(provider, key) do
# Your custom logic here
end
# Predicate callback (name ends in `?`); presumably returns a boolean telling
# whether `provider` has configuration - verify against the behaviour.
@impl true
def has_config?(provider) do
# Your custom logic here
end
end
ExLLM.set_config_provider(MyConfigProvider)

Error Handling

ExLLM uses consistent error patterns:

case ExLLM.chat(:anthropic, messages) do
{:ok, response} ->
# Success
IO.puts(response.content)
{:error, {:config_error, reason}} ->
# Configuration issue
IO.puts("Config error: #{reason}")
{:error, {:api_error, %{status: status, body: body}}} ->
# API error
IO.puts("API error #{status}: #{body}")
{:error, {:network_error, reason}} ->
# Network issue
IO.puts("Network error: #{reason}")
{:error, {:parse_error, reason}} ->
# Response parsing issue
IO.puts("Parse error: #{reason}")
end

Error Recovery and Retries

ExLLM includes automatic error recovery and retry mechanisms:

Automatic Retries

# Configure retry behavior
options = [
retry_count: 3, # Number of retry attempts
retry_delay: 1000, # Initial delay in milliseconds
retry_backoff: :exponential, # Backoff strategy
retry_jitter: true # Add jitter to prevent thundering herd
]
{:ok, response} = ExLLM.chat(:anthropic, messages, options)
# Provider-specific retry policies
ExLLM.Retry.with_retry(fn ->
ExLLM.chat(:anthropic, messages)
end,
max_attempts: 5,
initial_delay: 500,
max_delay: 30_000,
should_retry: fn error ->
# Custom retry logic
case error do
{:api_error, %{status: 429}} -> true # Rate limit
{:api_error, %{status: 503}} -> true # Service unavailable
{:network_error, _} -> true # Network issues
_ -> false
end
end
)

Stream Recovery

# Enable automatic stream recovery.
# The options are passed as an explicit list because the callback follows
# them; bare keyword arguments are only allowed in last position.
{:ok, stream_id} =
  ExLLM.stream_chat(
    :anthropic,
    messages,
    [
      stream_recovery: true,
      # :exact, :paragraph, or :summarize
      recovery_strategy: :paragraph
    ],
    fn chunk -> IO.write(chunk.content) end
  )

# If stream is interrupted, resume from where it left off
case ExLLM.resume_stream(stream_id) do
  {:ok, resumed_stream} ->
    for chunk <- resumed_stream do
      IO.write(chunk.content)
    end

  {:error, :not_found} ->
    # Stream not recoverable. An explicit value is given because a `->`
    # clause without an expression triggers a compiler warning.
    nil
end

# List recoverable streams
recoverable = ExLLM.list_recoverable_streams()

Mock Adapter for Testing

The mock adapter allows you to test your LLM interactions without making real API calls:

Basic Mock Usage

# Configure static mock response
{:ok, response} = ExLLM.chat(:mock, messages,
mock_response: "This is a mock response"
)
# Configure mock with usage data
{:ok, response} = ExLLM.chat(:mock, messages,
mock_response: %{
content: "Mock response with usage",
usage: %{input_tokens: 10, output_tokens: 20},
model: "mock-model"
}
)
# Mock streaming responses.
# The options are wrapped in an explicit list because the callback is passed
# after them; bare keyword arguments are only allowed in last position.
ExLLM.stream_chat(
  :mock,
  messages,
  [
    mock_chunks: ["Hello", " from", " mock", " adapter!"],
    # Delay between chunks in ms
    chunk_delay: 100
  ],
  fn chunk -> IO.write(chunk.content) end
)

Advanced Mock Configuration

# Dynamic mock responses based on input
mock_handler = fn messages ->
last_message = List.last(messages)
cond do
String.contains?(last_message.content, "weather") ->
"It's sunny and 72°F"
String.contains?(last_message.content, "hello") ->
"Hello! How can I help you?"
true ->
"I don't understand"
end
end
{:ok, response} = ExLLM.chat(:mock, messages,
mock_handler: mock_handler
)
# Simulate errors
{:error, {:api_error, %{status: 429, body: "Rate limit exceeded"}}} =
ExLLM.chat(:mock, messages,
mock_error: {:api_error, %{status: 429, body: "Rate limit exceeded"}}
)
# Capture requests for assertions
{:ok, response} = ExLLM.chat(:mock, messages,
capture_requests: true,
mock_response: "Test response"
)
# Access captured requests
captured = ExLLM.Adapters.Mock.get_captured_requests()
assert length(captured) == 1
assert List.first(captured).messages == messages

Testing with Mock Adapter

# Example ExUnit suite that drives ExLLM through the :mock provider only.
defmodule MyApp.LLMClientTest do
  use ExUnit.Case

  alias ExLLM.Adapters.Mock, as: MockAdapter

  # Drop requests captured by earlier tests before each test runs.
  setup do
    MockAdapter.clear_captured_requests()
    :ok
  end

  test "handles weather queries" do
    prompt = [%{role: "user", content: "What's the weather?"}]
    chat_opts = [mock_response: "It's sunny today!", capture_requests: true]

    {:ok, reply} = ExLLM.chat(:mock, prompt, chat_opts)
    assert reply.content == "It's sunny today!"

    # Verify the request: exactly one must have been captured
    [captured] = MockAdapter.get_captured_requests()
    assert captured.provider == :mock
    assert captured.messages == prompt
  end

  test "simulates API errors" do
    prompt = [%{role: "user", content: "Hello"}]
    simulated_failure = [mock_error: {:network_error, :timeout}]

    {:error, reason} = ExLLM.chat(:mock, prompt, simulated_failure)
    assert reason == {:network_error, :timeout}
  end
end

Local Model Support

ExLLM supports running models locally using Bumblebee and EXLA/EMLX backends. This enables on-device inference without API calls or costs.

Setup

  1. ExLLM includes Bumblebee and Nx dependencies. For hardware acceleration, add one of these optional backends to your mix.exs:
def deps do
[
{:ex_llm, "~> 0.4.1"},
# For CUDA/ROCm GPUs:
{:exla, "~> 0.7"}
# OR for Apple Silicon Metal acceleration:
# {:emlx, github: "elixir-nx/emlx", branch: "main"}
]
end
  2. Configure the Nx backend (optional - auto-detected by default):
# For CUDA GPUs
config :nx, :default_backend, {EXLA.Backend, client: :cuda}
# For Apple Silicon
config :nx, :default_backend, EMLX.Backend

Available Models

Usage

# Start the model loader (happens automatically on first use)
{:ok, _} = ExLLM.Local.ModelLoader.start_link()
# Use a local model
messages = [
%{role: "user", content: "Explain quantum computing in simple terms"}
]
{:ok, response} = ExLLM.chat(:bumblebee, messages, model: "microsoft/phi-4")
IO.puts(response.content)
# Stream responses
{:ok, stream} = ExLLM.stream_chat(:bumblebee, messages)
for chunk <- stream do
IO.write(chunk.content)
end
# List available models
{:ok, models} = ExLLM.list_models(:bumblebee)
Enum.each(models, fn model ->
IO.puts("#{model.name} - Context: #{model.context_window} tokens")
end)
# Check acceleration info
info = ExLLM.Local.EXLAConfig.acceleration_info()
IO.puts("Running on: #{info.name}")

Hardware Acceleration

ExLLM automatically detects and uses available hardware acceleration:

Performance Tips

  1. First Load: Models are downloaded from HuggingFace on first use and cached locally
  2. Memory: Ensure you have enough RAM/VRAM for your chosen model
  3. Batch Size: Automatically optimized based on available memory
  4. Mixed Precision: Enabled by default for better performance

Model Loading

# Pre-load a model
{:ok, _} = ExLLM.Local.ModelLoader.load_model("Qwen/Qwen3-0.6B")
# Load from local path
{:ok, _} = ExLLM.Local.ModelLoader.load_model("/path/to/model")
# Unload to free memory
:ok = ExLLM.Local.ModelLoader.unload_model("Qwen/Qwen3-0.6B")
# List loaded models
loaded = ExLLM.Local.ModelLoader.list_loaded_models()

Adding New Providers

To add a new LLM provider, implement the ExLLM.Adapter behaviour:

defmodule ExLLM.Adapters.MyProvider do
  @moduledoc """
  Template for a new provider adapter.

  Declares the `ExLLM.Adapter` behaviour; each callback below is a stub to be
  filled in with the provider-specific implementation.
  """

  @behaviour ExLLM.Adapter

  @impl ExLLM.Adapter
  def chat(messages, options) do
    # Implement chat completion
  end

  @impl ExLLM.Adapter
  def stream_chat(messages, options, callback) do
    # Implement streaming chat
  end

  @impl ExLLM.Adapter
  def configured? do
    # Check if provider is configured
  end

  @impl ExLLM.Adapter
  def list_models do
    # Return available models
  end
end

Then register it in the main ExLLM module.

Requirements

Development

Setup

# Clone the repository
git clone https://github.com/azmaveth/ex_llm.git
cd ex_llm
# Install dependencies
mix deps.get
mix deps.compile
# Run tests
mix test
# Run quality checks
mix format --check-formatted
mix credo
mix dialyzer

Testing

# Run all tests
mix test
# Run specific test files
mix test test/ex_llm_test.exs
# Run only integration tests
mix test test/*_integration_test.exs
# Run tests with coverage
mix test --cover

Documentation

# Generate docs
mix docs
# Open in browser
open doc/index.html

User Guides

Roadmap

Visit the GitHub repository to see the detailed roadmap and progress tracking.

Recently Completed ✅

Near-term Goals

Long-term Vision

Contributing

We welcome contributions! Please see our contributing guidelines:

  1. Fork the repository
  2. Create a feature branch (git checkout -b feature/amazing-feature)
  3. Make your changes
  4. Add tests for new functionality
  5. Ensure all tests pass (mix test)
  6. Format your code (mix format)
  7. Run linter (mix credo)
  8. Commit your changes (git commit -m 'feat: add amazing feature')
  9. Push to the branch (git push origin feature/amazing-feature)
  10. Open a Pull Request

Commit Message Convention

We use Conventional Commits:

Future Provider Support

ExLLM includes pre-configured model data for 49 additional providers, ready for implementation:

Major Cloud Providers: Azure, Vertex AI, Databricks, Sagemaker, Watsonx, Snowflake

AI Companies: Mistral AI, Cohere, Together AI, Replicate, Perplexity, DeepSeek, XAI

Inference Platforms: Fireworks AI, DeepInfra, Anyscale, Cloudflare, NScale, SambaNova

Specialized: AI21, NLP Cloud, Aleph Alpha, Voyage (embeddings), Assembly AI (audio)

All model configurations including pricing, context windows, and capabilities are already available in config/models/.

Acknowledgments

License

MIT License - see LICENSE for details.