Implement a Basic Tokenizer
Build a word tokenizer from scratch: whitespace splitting, vocabulary building, encoding and decoding, OOV handling, and comparison with HuggingFace tokenizer output.
What Interviewers Are Testing
Implementing a tokenizer from scratch is a common AI engineering interview problem because it tests whether you understand what actually happens before text enters a model. Many engineers use AutoTokenizer.from_pretrained() without knowing the underlying mechanics. This lesson fixes that.
Level 1: Whitespace Tokenizer
Start simple β split on whitespace.
def whitespace_tokenize(text: str) -> list[str]:
"""
Split text on whitespace. Handles multiple spaces.
Time: O(n) where n = len(text)
Space: O(t) where t = number of tokens
"""
return text.split() # split() without args handles multiple spaces + strip
# Test
assert whitespace_tokenize("Hello world") == ["Hello", "world"]
assert whitespace_tokenize(" Hello world ") == ["Hello", "world"]
assert whitespace_tokenize("") == []
assert whitespace_tokenize("one") == ["one"]Level 2: Punctuation-Aware Tokenizer
Real tokenizers split punctuation from words.
import re
def word_tokenize(text: str) -> list[str]:
"""
Tokenize text by splitting on whitespace and punctuation.
Strategy: use regex to match word characters or individual punctuation.
This is similar to NLTK's word_tokenize for simple cases.
Time: O(n)
Space: O(t)
"""
# Match sequences of word chars (letters, digits, underscore)
# OR single punctuation characters
pattern = re.compile(r"\w+|[^\w\s]")
return pattern.findall(text)
# Tests
assert word_tokenize("Hello, world!") == ["Hello", ",", "world", "!"]
assert word_tokenize("It's a test.") == ["It", "'", "s", "a", "test", "."]
assert word_tokenize("GPT-4 costs $0.01/token.") == ["GPT", "-", "4", "costs", "$", "0", ".", "01", "/", "token", "."]
assert word_tokenize("") == []
print("word_tokenize tests passed")Level 3: Building the Vocabulary
After tokenization, you map tokens to integer IDs.
from collections import Counter
from typing import Optional
class Vocabulary:
"""
Maps tokens to integer IDs and back.
Special tokens:
[PAD] = 0 β padding for batch alignment
[UNK] = 1 β out-of-vocabulary token
[BOS] = 2 β beginning of sequence
[EOS] = 3 β end of sequence
"""
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"
SPECIAL_TOKENS = [PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN]
PAD_ID = 0
UNK_ID = 1
BOS_ID = 2
EOS_ID = 3
def __init__(self):
self.token_to_id: dict[str, int] = {}
self.id_to_token: dict[int, str] = {}
self._next_id: int = 0
# Register special tokens first so they get IDs 0-3
for token in self.SPECIAL_TOKENS:
self._add_token(token)
def _add_token(self, token: str) -> int:
"""Add a token and return its ID."""
if token not in self.token_to_id:
self.token_to_id[token] = self._next_id
self.id_to_token[self._next_id] = token
self._next_id += 1
return self.token_to_id[token]
def build_from_corpus(
self,
corpus: list[str],
tokenizer_fn=None,
max_vocab_size: Optional[int] = None,
min_frequency: int = 1,
) -> "Vocabulary":
"""
Build vocabulary from a corpus.
Steps:
1. Tokenize all documents
2. Count token frequencies
3. Filter by min_frequency
4. Sort by frequency (most common first)
5. Add up to max_vocab_size tokens
Time: O(n * avg_doc_len) for counting
Space: O(V) where V = vocabulary size
"""
if tokenizer_fn is None:
tokenizer_fn = str.split
# Count all token frequencies
counter: Counter[str] = Counter()
for doc in corpus:
tokens = tokenizer_fn(doc)
counter.update(tokens)
# Filter and sort
eligible = [
(token, count) for token, count in counter.items()
if count >= min_frequency
and token not in self.SPECIAL_TOKENS
]
eligible.sort(key=lambda x: (-x[1], x[0])) # sort by freq desc, then alpha
# Apply max_vocab_size limit
if max_vocab_size is not None:
# Reserve 4 slots for special tokens
eligible = eligible[:max_vocab_size - len(self.SPECIAL_TOKENS)]
for token, _ in eligible:
self._add_token(token)
return self
def __len__(self) -> int:
return self._next_id
def __contains__(self, token: str) -> bool:
return token in self.token_to_id
def get_id(self, token: str) -> int:
"""Return token ID, UNK_ID if not in vocabulary."""
return self.token_to_id.get(token, self.UNK_ID)
def get_token(self, token_id: int) -> str:
"""Return token string, UNK_TOKEN if ID not found."""
return self.id_to_token.get(token_id, self.UNK_TOKEN)
# Build vocabulary from a small corpus
corpus = [
"the quick brown fox jumps over the lazy dog",
"the dog sat on the mat with the cat",
"quick brown foxes leap over lazy dogs",
"a cat and a dog are the best of friends",
]
vocab = Vocabulary()
vocab.build_from_corpus(corpus, min_frequency=2)
print(f"Vocabulary size: {len(vocab)}")
print(f"PAD ID: {vocab.get_id('[PAD]')}")
print(f"UNK ID: {vocab.get_id('[UNK]')}")
print(f"'the' ID: {vocab.get_id('the')}")
print(f"'dog' ID: {vocab.get_id('dog')}")
print(f"'zephyr' ID: {vocab.get_id('zephyr')}") # OOV β UNKLevel 4: Encoding and Decoding
class SimpleTokenizer:
"""
A complete word-level tokenizer with encode/decode.
Not suitable for production (word-level has too large a vocabulary
for morphologically rich languages), but perfect for interviews.
"""
def __init__(
self,
max_vocab_size: Optional[int] = None,
min_frequency: int = 1,
add_special_tokens: bool = True,
):
self.vocab = Vocabulary()
self.max_vocab_size = max_vocab_size
self.min_frequency = min_frequency
self.add_special_tokens = add_special_tokens
self._tokenize_fn = re.compile(r"\w+|[^\w\s]").findall
def fit(self, corpus: list[str]) -> "SimpleTokenizer":
"""Build vocabulary from corpus."""
self.vocab.build_from_corpus(
corpus,
tokenizer_fn=self._tokenize_fn,
max_vocab_size=self.max_vocab_size,
min_frequency=self.min_frequency,
)
return self
def tokenize(self, text: str) -> list[str]:
"""Convert text to list of token strings."""
tokens = self._tokenize_fn(text.lower())
if self.add_special_tokens:
tokens = [Vocabulary.BOS_TOKEN] + tokens + [Vocabulary.EOS_TOKEN]
return tokens
def encode(
self,
text: str,
max_length: Optional[int] = None,
padding: bool = False,
) -> list[int]:
"""
Convert text to list of integer token IDs.
OOV tokens become UNK_ID.
If max_length is set, truncate or pad to that length.
"""
tokens = self.tokenize(text)
ids = [self.vocab.get_id(t) for t in tokens]
if max_length is not None:
if len(ids) > max_length:
# Truncate: keep [BOS] ... [EOS]
ids = ids[:max_length - 1] + [Vocabulary.EOS_ID]
elif padding and len(ids) < max_length:
ids = ids + [Vocabulary.PAD_ID] * (max_length - len(ids))
return ids
def decode(self, token_ids: list[int], skip_special: bool = True) -> str:
"""
Convert list of token IDs back to text.
Handles:
- Special token removal (PAD, BOS, EOS, UNK display)
- Simple space joining (not perfect for punctuation)
"""
special_ids = {
Vocabulary.PAD_ID,
Vocabulary.BOS_ID,
Vocabulary.EOS_ID,
}
tokens = []
for tid in token_ids:
if skip_special and tid in special_ids:
continue
tokens.append(self.vocab.get_token(tid))
return " ".join(tokens)
def encode_batch(
self,
texts: list[str],
max_length: int,
) -> list[list[int]]:
"""
Encode multiple texts, padding all to max_length.
Time: O(n * max_length) where n = number of texts
Space: O(n * max_length)
"""
return [
self.encode(text, max_length=max_length, padding=True)
for text in texts
]
@property
def vocab_size(self) -> int:
return len(self.vocab)
# Full test
tokenizer = SimpleTokenizer(max_vocab_size=100, min_frequency=1)
tokenizer.fit(corpus)
test_text = "the quick dog runs"
tokens = tokenizer.tokenize(test_text)
ids = tokenizer.encode(test_text)
decoded = tokenizer.decode(ids)
print(f"\nInput: '{test_text}'")
print(f"Tokens: {tokens}")
print(f"IDs: {ids}")
print(f"Decoded: '{decoded}'")
# Test OOV
oov_text = "the quantum zephyr oscillates"
oov_ids = tokenizer.encode(oov_text)
print(f"\nOOV text: '{oov_text}'")
print(f"OOV IDs: {oov_ids}")
# 'quantum' and 'zephyr' should both map to UNK_ID (1)
oov_tokens = tokenizer.tokenize(oov_text)
for token, tid in zip(oov_tokens, oov_ids):
is_unk = tid == Vocabulary.UNK_ID and token not in tokenizer.vocab
marker = " β OOV" if is_unk else ""
print(f" '{token}' β {tid}{marker}")
# Test batch encoding
batch = ["the cat sat", "a quick fox", "hello"]
batch_ids = tokenizer.encode_batch(batch, max_length=10)
print("\nBatch encoding (max_length=10):")
for text, ids in zip(batch, batch_ids):
print(f" '{text}' β {ids} (len={len(ids)})")Comparison with HuggingFace Tokenizer
# This shows what the same text looks like through a real BPE tokenizer
# Run this if transformers is installed; otherwise read the output comments
def compare_with_hf_tokenizer(texts: list[str]) -> None:
"""
Compare our simple tokenizer output vs HuggingFace GPT-2 tokenizer.
Key differences you'll observe:
1. HF tokenizer handles subwords β "unbelievable" β ["un", "believ", "able"]
2. HF tokenizer preserves case (GPT-2 is case-sensitive)
3. HF tokenizer handles punctuation differently β "it's" β ["it", "'s"]
4. Vocabulary sizes: ours is tiny (100), GPT-2 has 50,257 tokens
"""
try:
from transformers import GPT2Tokenizer
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print("\nComparison: Simple vs GPT-2 tokenizer")
print("=" * 60)
for text in texts:
our_tokens = tokenizer.tokenize(text)
hf_tokens = hf_tokenizer.tokenize(text)
print(f"\nText: '{text}'")
print(f" Ours: {our_tokens}")
print(f" GPT-2: {hf_tokens}")
print(f" Our vocab size: {tokenizer.vocab_size}")
print(f" GPT-2 vocab size: {hf_tokenizer.vocab_size}")
except ImportError:
print("\nHuggingFace transformers not installed.")
print("Expected differences (conceptual):")
print()
examples = {
"Hello, world!": {
"ours": ["[BOS]", "hello", ",", "world", "!", "[EOS]"],
"gpt2": ["Hello", ",", "Δ world", "!"], # Δ = space prefix
},
"unbelievable": {
"ours": ["[BOS]", "unbelievable", "[EOS]"], # one token or OOV
"gpt2": ["un", "belie", "vable"], # subword decomposition
},
"2024-01-15": {
"ours": ["[BOS]", "2024", "-", "01", "-", "15", "[EOS]"],
"gpt2": ["2024", "-", "01", "-", "15"],
},
}
for text, comparison in examples.items():
print(f"Text: '{text}'")
print(f" Ours (word-level): {comparison['ours']}")
print(f" GPT-2 (BPE): {comparison['gpt2']}")
print()
compare_with_hf_tokenizer(["Hello world", "the unbelievable fox"])Interview Talking Points
Why subword over word-level? Word-level vocabularies need millions of entries for morphologically rich languages. Subword methods (BPE, WordPiece) handle unknown words by decomposing them. "unbelievable" β "un" + "believable" means even if the full word is new, the subwords exist in the vocabulary.
Why not character-level? Character-level sequences are very long (5-10x longer than subword), which increases the cost of the O(nΒ²) attention mechanism. "hello" as characters needs 5 attention positions; as a subword it needs 1.
What is [PAD] for? GPU operations require fixed-size tensors. When batching variable-length sequences, you pad shorter ones to match the longest in the batch. The model learns to ignore PAD tokens via an attention mask.
What is [UNK]? Any token not in the vocabulary maps to the UNK (unknown) ID. Word-level tokenizers produce many UNKs for rare words, proper nouns, and typos β another reason subword tokenizers are preferred.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.