# tok

Pure Erlang tokenizer for HuggingFace `tokenizer.json` files.
No NIFs, no Python, no native dependencies — drop a tokenizer.json next to your application and encode text directly from Erlang.
## Supported formats

| Type | Models |
|---|---|
| WordPiece | BERT, DistilBERT, multilingual BERT, ... |
| BPE — ByteLevel | GPT-2, RoBERTa, Falcon, Llama 3, Mistral-Nemo, ... |
| BPE — Metaspace | Llama 2, Mistral 7B, Phi-3, ... |
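
If you are not sure which scheme a given `tokenizer.json` uses, you can peek at the model type it declares before loading it. A minimal sketch, assuming OTP 27+ for the built-in `json` module:

```erlang
%% Inspect the model type declared in tokenizer.json (OTP 27+ for the json module).
{ok, Raw} = file:read_file("tokenizer.json"),
#{<<"model">> := #{<<"type">> := Type}} = json:decode(Raw),
%% Type is a binary such as <<"BPE">> or <<"WordPiece">>.
io:format("model type: ~s~n", [Type]).
```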
## Installation

```erlang
%% rebar.config
{deps, [{tok, "0.2.0"}]}.
```

## Quick start

```erlang
{ok, Tok} = tok:load("path/to/tokenizer.json"),
%% Encode — returns {InputIds, AttentionMask, TokenTypeIds} as flat binaries
%% of int32 little-endian values, padded to max_length from the tokenizer config.
{IdsBin, MaskBin, _TypeBin} = tok:encode(Tok, <<"Hello world">>),
%% Decode ids from the binary
Ids = [Id || <<Id:32/signed-little>> <= IdsBin],
%% Decode back to text (strips special tokens)
Text = tok:decode(Tok, Ids),
%% Count tokens without building the output binary
N = tok:count_tokens(Tok, <<"Hello world">>).
```
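
`tok:load/1` parses the full tokenizer file, so in a long-running system you will usually want to load once and reuse the result. A minimal sketch using `persistent_term` (the `{myapp, tokenizer}` key and the `priv/tokenizer.json` path are just placeholders):

```erlang
%% Load once at application start and stash the tokenizer for reuse.
{ok, Tok} = tok:load("priv/tokenizer.json"),
ok = persistent_term:put({myapp, tokenizer}, Tok),
%% ...later, from any process:
Tok1 = persistent_term:get({myapp, tokenizer}),
N = tok:count_tokens(Tok1, <<"Hello world">>).
```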
## Getting a tokenizer.json

Download directly from HuggingFace:

```bash
# Any model page → Files → tokenizer.json
curl -L https://huggingface.co/<org>/<model>/resolve/main/tokenizer.json \
  -o tokenizer.json
```
Or save from the Python transformers library:
```python
from transformers import AutoTokenizer
AutoTokenizer.from_pretrained("bert-base-uncased").save_pretrained(".")
# tokenizer.json is now in the current directory
```

## API

```erlang
%% Load a tokenizer from a tokenizer.json file.
-spec load(file:filename()) -> {ok, tokenizer()} | {error, term()}.
%% Encode text. Returns three binaries of int32 little-endian values,
%% each padded to max_length as configured in the tokenizer file.
-spec encode(tokenizer(), binary()) ->
          {InputIds :: binary(), AttentionMask :: binary(), TokenTypeIds :: binary()}.
%% Encode with options.
%% add_special_tokens => false skips CLS/SEP (WordPiece) or BOS/EOS (BPE).
-spec encode(tokenizer(), binary(), #{add_special_tokens => boolean()}) ->
          {InputIds :: binary(), AttentionMask :: binary(), TokenTypeIds :: binary()}.
%% Encode a list of texts.
-spec encode_batch(tokenizer(), [binary()]) ->
          [{InputIds :: binary(), AttentionMask :: binary(), TokenTypeIds :: binary()}].
-spec encode_batch(tokenizer(), [binary()], #{add_special_tokens => boolean()}) ->
          [{InputIds :: binary(), AttentionMask :: binary(), TokenTypeIds :: binary()}].
%% Decode a list of token IDs back to text. Special tokens are stripped.
-spec decode(tokenizer(), [integer()]) -> binary().
%% Count real tokens (after truncation, including special tokens).
%% Cheaper than encode/2 — does not allocate output binaries.
-spec count_tokens(tokenizer(), binary()) -> non_neg_integer().
%% Return vocabulary size.
-spec vocab_size(tokenizer()) -> integer().
```
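
A quick illustration of the batch and options variants above, assuming `Tok` comes from `tok:load/1` (the example texts are arbitrary):

```erlang
%% Encode several texts in one call; each element is the same triple encode/2 returns.
Results = tok:encode_batch(Tok, [<<"first sentence">>, <<"second sentence">>]),
[{IdsBin1, _MaskBin1, _TypeBin1} | _] = Results,
%% Encode without special tokens (CLS/SEP for WordPiece, BOS/EOS for BPE).
{RawIds, _, _} = tok:encode(Tok, <<"Hello">>, #{add_special_tokens => false}),
%% Round-trip back to text; decode/2 strips any special tokens.
Roundtrip = tok:decode(Tok, [Id || <<Id:32/signed-little>> <= RawIds]).
```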
## Reading the output binary

```erlang
{IdsBin, MaskBin, TypeBin} = tok:encode(Tok, Text),
InputIds = [Id || <<Id:32/signed-little>> <= IdsBin],
AttentionMask = [M || <<M:32/signed-little>> <= MaskBin],
TokenTypeIds = [T || <<T:32/signed-little>> <= TypeBin].
```
The binary format matches what most ONNX runtimes and NIF-based inference libraries expect directly, so you can often pass `IdsBin` through without decoding.
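
If you do need the unpadded values, for example the real token count or the ids without the padding entries, the attention mask marks which positions are real. A small sketch, reusing `Tok` and `Text` from above:

```erlang
{IdsBin, MaskBin, _TypeBin} = tok:encode(Tok, Text),
Ids  = [Id || <<Id:32/signed-little>> <= IdsBin],
Mask = [M  || <<M:32/signed-little>>  <= MaskBin],
%% Keep only the positions the attention mask marks as real tokens (mask = 1).
RealIds = [Id || {Id, 1} <- lists:zip(Ids, Mask)],
RealLen = length(RealIds).
```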
## Notes

- `max_length` is read from the `truncation` section of `tokenizer.json`. If absent, it defaults to 512 (a quick way to check whether a text fits is sketched below).
- `pad_id` is read from the `padding` section. If absent, it defaults to the `[PAD]` token id, or 0.
- BOS/EOS tokens are injected automatically when a `TemplateProcessing` post-processor is present in the tokenizer file.
- `byte_fallback` (Llama 2 / Mistral style) is supported: characters not in the vocabulary are split into `<0xNN>` byte tokens.
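
Since encoding truncates to max_length and `count_tokens/2` counts after truncation, a count equal to max_length means the text hit the limit and may have been cut. A sketch for checking this up front, assuming the default of 512 and bound `Tok` and `LongText` variables:

```erlang
%% MaxLen mirrors the truncation max_length in your tokenizer.json (512 when absent).
MaxLen = 512,
N = tok:count_tokens(Tok, LongText),
case N < MaxLen of
    true  -> io:format("fits in one sequence (~p tokens)~n", [N]);
    false -> io:format("hits the ~p-token limit; consider splitting the input~n", [MaxLen])
end.
```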
## License

Apache 2.0 — see LICENSE.