Spidra Elixir SDK

The official Elixir SDK for Spidra. It lets you scrape pages, run browser actions, batch-process URLs, and crawl entire sites. All results come back as structured data, ready to feed into your LLM pipelines or store directly.

Installation

Add spidra to your list of dependencies in mix.exs:

def deps do
  [
    {:spidra, "~> 0.1.0"}
  ]
end

Then run mix deps.get in your terminal.

Get your API key at app.spidra.io under Settings > API Keys.
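
Rather than hard-coding the key, you can read it from the environment when you build the config. A minimal sketch, assuming you export a SPIDRA_API_KEY variable (the variable name is your choice):

# Raises at startup if SPIDRA_API_KEY is not set
config = Spidra.Config.new(api_key: System.fetch_env!("SPIDRA_API_KEY"))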

Quick start

# Initialize your configuration
config = Spidra.Config.new(api_key: "spd_YOUR_API_KEY")

# Run a scrape job
{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://news.ycombinator.com"}],
  prompt: "List the top 5 stories with title, points, and comment count",
  output: "json"
})

IO.inspect(job["result"]["content"])

Table of contents

  Installation
  Quick start
  Scraping
  Batch scraping
  Crawling
  Logs
  Usage statistics
  Requirements
  License

Scraping

All scrape jobs run asynchronously on the Spidra platform. The Spidra.Scrape.run/2 function submits a job and polls until it finishes. If you need more control, use submit/2 and get/2 directly.

Up to 3 URLs can be passed per request and they are processed in parallel.

Basic scrape

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://example.com/pricing"}],
  prompt: "Extract all pricing plans with name, price, and included features",
  output: "json"
})

IO.inspect(job["result"]["content"])
# "{ \"plans\": [{ \"name\": \"Starter\", \"price\": \"$9/mo\", \"features\": [...] }, ...] }"

Structured output with JSON schema

When you need a guaranteed shape, pass a schema. The API will enforce the structure and return null for any missing fields rather than hallucinating values.

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://jobs.example.com/senior-engineer"}],
  prompt: "Extract the job listing details",
  output: "json",
  schema: %{
    "type" => "object",
    "required" => ["title", "company", "remote"],
    "properties" => %{
      "title" => %{"type" => "string"},
      "company" => %{"type" => "string"},
      "remote" => %{"type" => ["boolean", "null"]},
      "salary_min" => %{"type" => ["number", "null"]},
      "salary_max" => %{"type" => ["number", "null"]},
      "skills" => %{"type" => "array", "items" => %{"type" => "string"}}
    }
  }
})
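
The content field in the examples above is a JSON-encoded string, so you will usually decode it into a map before use. A short sketch, assuming the Jason library is among your dependencies:

listing = Jason.decode!(job["result"]["content"])
IO.puts("#{listing["title"]} at #{listing["company"]}")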

Geo-targeted scraping

Pass use_proxy: true and a proxy_country code to route the request through a specific country. Useful for geo-restricted content or localized pricing.

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://www.amazon.de/gp/bestsellers"}],
  prompt: "List the top 10 products with name and price",
  use_proxy: true,
  proxy_country: "de"
})

Supported country codes include: us, gb, de, fr, jp, au, ca, br, in, nl, sg, es, it, mx, and 40+ more. Use "global" or "eu" for regional routing.
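
Regional routing uses the same fields; a minimal sketch routing through any EU location:

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://example.com/pricing"}],
  prompt: "Extract the displayed currency and prices",
  use_proxy: true,
  proxy_country: "eu"
})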

Authenticated pages

Pass cookies as a string to scrape pages that require a login session.

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [%{url: "https://app.example.com/dashboard"}],
  prompt: "Extract the monthly revenue and active user count",
  cookies: "session=abc123; auth_token=xyz789"
})

Browser actions

Actions let you interact with the page before the scrape runs. They execute in order, and the scrape happens after all actions complete.

{:ok, job} = Spidra.Scrape.run(config, %{
  urls: [
    %{
      url: "https://example.com/products",
      actions: [
        %{type: "click", selector: "#accept-cookies"},
        %{type: "wait", duration: 1000},
        %{type: "scroll", to: "80%"}
      ]
    }
  ],
  prompt: "Extract all product names and prices"
})

Manual job control

Use submit/2 and get/2 when you want to manage polling yourself, or fire-and-forget and check back later.

# Submit a job and get the job_id immediately
{:ok, %{"jobId" => job_id}} = Spidra.Scrape.submit(config, %{
  urls: [%{url: "https://example.com"}],
  prompt: "Extract the main headline"
})

# Check status at any point
{:ok, status} = Spidra.Scrape.get(config, job_id)

case status["status"] do
  "completed" -> IO.inspect(status["result"]["content"])
  "failed" -> IO.inspect(status["error"])
  _ -> IO.puts("Job is still pending...")
end
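
If you want to poll on your own schedule, a small recursive helper built on get/2 is enough. A sketch (the module name and interval are arbitrary; add timeout handling as needed):

defmodule MyApp.ScrapePoller do
  # Checks a submitted job every `interval_ms` until it completes or fails.
  def await(config, job_id, interval_ms \\ 2_000) do
    {:ok, status} = Spidra.Scrape.get(config, job_id)

    case status["status"] do
      "completed" ->
        {:ok, status["result"]}

      "failed" ->
        {:error, status["error"]}

      _ ->
        Process.sleep(interval_ms)
        await(config, job_id, interval_ms)
    end
  end
end

{:ok, result} = MyApp.ScrapePoller.await(config, job_id)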

Batch scraping

Submit up to 50 URLs in a single request. All URLs are processed in parallel. Each URL is a plain string.

{:ok, batch} = Spidra.Batch.run(config, %{
  urls: [
    "https://shop.example.com/product/1",
    "https://shop.example.com/product/2",
    "https://shop.example.com/product/3"
  ],
  prompt: "Extract product name, price, and availability",
  output: "json",
  use_proxy: true
})

for item <- batch["items"] do
  case item["status"] do
    "completed" -> IO.inspect({item["url"], item["result"]})
    "failed" -> IO.inspect({item["url"], item["error"]})
    _ -> :ok
  end
end

Retry failed items:

{:ok, %{"batchId" => batch_id}} = Spidra.Batch.submit(config, %{
  urls: ["https://example.com/1", "https://example.com/2"],
  prompt: "Extract the page title"
})

# Later, after checking status
{:ok, result} = Spidra.Batch.get(config, batch_id)
if result["failedCount"] > 0 do
  {:ok, retried} = Spidra.Batch.retry(config, batch_id)
  IO.puts("Retried #{retried["retriedCount"]} items")
end

Cancel a running batch:

{:ok, response} = Spidra.Batch.cancel(config, batch_id)
IO.puts("Cancelled #{response["cancelledItems"]} items, refunded #{response["creditsRefunded"]} credits")

List past batches:

{:ok, response} = Spidra.Batch.list(config, page: 1, limit: 20)

for job <- response["jobs"] do
  IO.puts("#{job["uuid"]} #{job["status"]} #{job["completedCount"]}/#{job["totalUrls"]}")
end

Crawling

Given a starting URL, Spidra discovers pages automatically according to your instruction and extracts structured data from each one.

{:ok, job} = Spidra.Crawl.run(config, %{
  base_url: "https://competitor.com/blog",
  crawl_instruction: "Find all blog posts published in 2024",
  transform_instruction: "Extract the title, author, publish date, and a one-sentence summary",
  max_pages: 30,
  use_proxy: true
})

for page <- job["result"] do
  IO.inspect({page["url"], page["data"]})
end

Get signed download URLs for all crawled pages:

Each page includes html_url and markdown_url pointing to S3-signed URLs that expire after 1 hour.

{:ok, response} = Spidra.Crawl.pages(config, job_id)

for page <- response["pages"] do
  IO.puts("#{page["url"]} - #{page["status"]}")
  # Download raw HTML: page["html_url"]
  # Download markdown: page["markdown_url"]
end
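
The signed URLs are plain HTTPS links, so any HTTP client can download them before they expire. A sketch using :httpc from OTP (no extra dependency; the file-naming scheme is just an example and error handling is omitted):

# :httpc needs the inets and ssl applications started
:inets.start()
:ssl.start()

response["pages"]
|> Enum.filter(& &1["markdown_url"])
|> Enum.with_index()
|> Enum.each(fn {page, i} ->
  {:ok, {_status, _headers, body}} =
    :httpc.request(:get, {String.to_charlist(page["markdown_url"]), []}, [], body_format: :binary)

  File.write!("crawled_page_#{i}.md", body)
end)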

Re-extract with a new instruction:

Runs a new AI transformation over an existing completed crawl without re-crawling any pages. Charges credits for the transformation only.

{:ok, queued} = Spidra.Crawl.extract(config, source_job_id, "Extract only the product SKUs and prices as a CSV")

# The new extraction runs as its own job; check it with get/2 until it completes
{:ok, result} = Spidra.Crawl.get(config, queued["jobId"])

Crawl history and stats:

{:ok, response} = Spidra.Crawl.history(config, page: 1, limit: 10)
{:ok, stats} = Spidra.Crawl.stats(config)

IO.puts("Total crawls: #{stats["total"]}")

Logs

Scrape logs are stored for every job that runs through the API.

# List logs with optional filters
{:ok, response} = Spidra.Logs.list(config, %{
  status: "failed",
  search_term: "amazon.com",
  channel: "api",
  date_start: "2024-01-01",
  date_end: "2024-12-31",
  page: 1,
  limit: 20
})

for log <- response["logs"] do
  IO.puts("#{hd(log["urls"])["url"]} #{log["status"]} #{log["credits_used"]}")
end

Get a single log with full extraction result:

{:ok, log} = Spidra.Logs.get(config, "log-uuid")
IO.inspect(log["result_data"]) # the full AI output for that job

Usage statistics

Returns credit and request usage broken down by day or week.

# Range options: "7d" | "30d" | "weekly"
{:ok, rows} = Spidra.Usage.get(config, "30d")

for row <- rows do
  IO.puts("#{row["date"]} Requests: #{row["requests"]} Credits: #{row["credits"]}")
end

Requirements

License

MIT