Learnixo

Python Essentials for AI Engineers · Lesson 28 of 36

NumPy Slicing and Indexing

Basic Indexing

Python
import numpy as np

arr = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])

# Single element (negative indices count back from the end)
print(arr[0])    # 10 — first element
print(arr[-1])   # 90 — last element
print(arr[-2])   # 80 — second to last

# Slicing: arr[start:stop:step] (any of the three parts may be omitted)
print(arr[2:5])    # [30 40 50] — indices 2, 3, 4 (stop is exclusive)
print(arr[:3])     # [10 20 30] — first 3
print(arr[6:])     # [70 80 90] — from index 6 to end
print(arr[::2])    # [10 30 50 70 90] — every other element
print(arr[::-1])   # [90 80 70 60 50 40 30 20 10] — reversed (step of -1)

2D Indexing

NumPy uses [row, col] syntax for 2D arrays:

Python
matrix = np.array([
    [1,  2,  3,  4],
    [5,  6,  7,  8],
    [9,  10, 11, 12],
])
# Shape: (3, 4) — 3 rows, 4 columns

# Single element
print(matrix[0, 0])   # 1 — row 0, col 0
print(matrix[1, 2])   # 7 — row 1, col 2
print(matrix[-1, -1]) # 12 — last row, last col

# Row slice
print(matrix[0, :])   # [1 2 3 4] — entire first row
print(matrix[1])      # [5 6 7 8] — shorthand for matrix[1, :]

# Column slice
print(matrix[:, 0])   # [1 5 9] — entire first column
print(matrix[:, 2])   # [ 3  7 11] — entire third column

# Sub-matrix
print(matrix[0:2, 1:3])   # Rows 0-1, cols 1-2 (both stops are exclusive)
# [[2 3]
#  [6 7]]

# Every other row
print(matrix[::2, :])   # Rows 0 and 2
# [[ 1  2  3  4]
#  [ 9 10 11 12]]

Slicing in AI: Embedding Batches

Python
# Common AI pattern: batch of embeddings
embeddings = np.random.randn(1000, 1536)   # 1000 documents, 1536-dim embeddings

# Get first 32 embeddings (one batch)
batch = embeddings[:32]
print(batch.shape)   # (32, 1536)

# Get embeddings for specific documents by index
doc_indices = [0, 5, 42, 99]
selected = embeddings[doc_indices]
print(selected.shape)   # (4, 1536)

# Keep only the first 128 dimensions of every embedding
# (the simplest form of dimension reduction: plain truncation)
reduced = embeddings[:, :128]
print(reduced.shape)   # (1000, 128)

# Normalize each embedding (divide by its L2 norm)
# keepdims=True keeps the norms as a column so they broadcast across each row
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)   # Shape: (1000, 1)
normalized = embeddings / norms   # Broadcasting: (1000, 1536) / (1000, 1)
print(np.linalg.norm(normalized[0]))   # ~1.0 — each row has unit norm

Boolean (Mask) Indexing

Boolean arrays select elements where the condition is True:

Python
scores = np.array([0.92, 0.45, 0.78, 0.61, 0.89, 0.33, 0.75])

# Create a boolean mask
mask = scores >= 0.7
print(mask)   # [ True False  True False  True False  True]

# Apply mask — returns the elements where the mask is True
passing = scores[mask]
print(passing)   # [0.92 0.78 0.89 0.75]

# Inline — same result
passing = scores[scores >= 0.7]

# Compound conditions
good_range = scores[(scores >= 0.7) & (scores < 0.95)]
# Use & (not 'and'), | (not 'or'), ~ (not 'not')
# Each comparison is parenthesized because & binds more tightly than >= and <
print(good_range)   # [0.92 0.78 0.89 0.75]

# Boolean indexing with 2D arrays
data = np.array([[1.0, 0.9, 0.8], [0.5, 0.4, 0.3], [0.95, 0.85, 0.75]])
# Get rows where the first column score is >= 0.7
# (the 1D mask built from column 0 selects whole rows)
high_first_col = data[data[:, 0] >= 0.7]
print(high_first_col)
# [[1.   0.9  0.8 ]
#  [0.95 0.85 0.75]]

Fancy Indexing

Use an array of indices to select elements:

Python
arr = np.array([10, 20, 30, 40, 50, 60, 70, 80])

# Integer array indexing
indices = np.array([0, 2, 5, 7])
print(arr[indices])   # [10 30 60 80] — elements at those indices

# Use argsort to get top-k indices
scores = np.array([0.45, 0.92, 0.78, 0.61, 0.89])
# argsort sorts ascending, so reverse it and keep the first 3
top_k = np.argsort(scores)[::-1][:3]   # Top 3 indices by score (descending)
print(top_k)             # [1 4 2] — indices of scores 0.92, 0.89, 0.78
print(scores[top_k])     # [0.92 0.89 0.78]


# 2D fancy indexing — select specific rows
embeddings = np.random.randn(100, 1536)
top_doc_indices = np.array([3, 17, 42, 71])
top_embeddings = embeddings[top_doc_indices]
print(top_embeddings.shape)   # (4, 1536)


# Fancy indexing returns a COPY, not a view
copy = arr[np.array([0, 2])]
copy[0] = 999   # Modifies only the copy
print(arr)   # [10 20 30 ...] — original unchanged

np.where: Conditional Element Selection

Python
scores = np.array([0.92, 0.45, 0.78, 0.61, 0.89])

# np.where(condition, value_if_true, value_if_false)
labels = np.where(scores >= 0.7, "pass", "fail")
print(labels)   # ['pass' 'fail' 'pass' 'fail' 'pass']

# Replace below-threshold values with 0
clamped = np.where(scores >= 0.7, scores, 0.0)
print(clamped)   # [0.92 0.   0.78 0.   0.89]

# Get the indices of the True elements
# With only a condition, np.where returns a tuple of index arrays (one per
# dimension); [0] takes the single array for this 1D input
passing_indices = np.where(scores >= 0.7)[0]
print(passing_indices)   # [0 2 4]

Slicing for ML Data Splitting

Python
# Common ML pattern: split dataset into train/val/test
def split_dataset(X: np.ndarray, y: np.ndarray, train_pct: float = 0.7, val_pct: float = 0.15):
    """Split X and y into contiguous train / validation / test partitions.

    Rows are taken in their existing order (nothing is shuffled here): the
    first ``train_pct`` of the rows form the training set, the next
    ``val_pct`` the validation set, and whatever remains the test set.
    Boundaries are truncated to whole rows with ``int()``.

    Args:
        X: Samples, indexed along the first axis.
        y: Labels aligned with ``X``; must have the same length as ``X``.
        train_pct: Fraction of rows used for training (>= 0).
        val_pct: Fraction of rows used for validation (>= 0).

    Returns:
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``, each
        piece being a basic slice of the corresponding input.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, if a fraction is
            negative, or if ``train_pct + val_pct`` exceeds 1.
    """
    n = len(X)

    # Mismatched lengths would silently pair samples with the wrong labels.
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )

    # Negative fractions turn into negative slice bounds, and a sum above 1
    # leaves no room for the test set; reject both instead of mis-splitting.
    # The small tolerance absorbs float rounding in sums such as 0.7 + 0.3.
    if train_pct < 0 or val_pct < 0 or train_pct + val_pct > 1 + 1e-9:
        raise ValueError(
            "train_pct and val_pct must be non-negative and sum to at most 1, "
            f"got train_pct={train_pct} and val_pct={val_pct}"
        )

    train_end = int(n * train_pct)
    val_end = int(n * (train_pct + val_pct))

    X_train, y_train = X[:train_end], y[:train_end]
    X_val,   y_val   = X[train_end:val_end], y[train_end:val_end]
    X_test,  y_test  = X[val_end:], y[val_end:]

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


# Shuffle before splitting (important!)
# NOTE: X and y are not defined in this snippet — presumably the full
# feature and label arrays loaded earlier; confirm against the caller.
np.random.seed(42)   # Fixed seed so the permutation is reproducible
indices = np.random.permutation(len(X))   # Random permutation of indices
X_shuffled = X[indices]   # Fancy indexing with permuted indices
y_shuffled = y[indices]   # Same indices, so each label stays with its row


# Mini-batch generation for training
def get_batches(X: np.ndarray, y: np.ndarray, batch_size: int = 32):
    """Yield consecutive ``(X_batch, y_batch)`` pairs of up to ``batch_size`` rows.

    Batches are taken in order along the first axis; the final batch is
    shorter when ``len(X)`` is not a multiple of ``batch_size``.
    """
    total = len(X)
    for lo in range(0, total, batch_size):
        # Clamp the upper bound to len(X) so the last window stops at the end.
        window = slice(lo, min(lo + batch_size, total))
        yield X[window], y[window]

# Feed the training data to the model one mini-batch at a time.
# NOTE: X_train, y_train and train_step are not defined in this snippet —
# presumably the training split and a user-supplied update function; confirm.
for X_batch, y_batch in get_batches(X_train, y_train):
    loss = train_step(X_batch, y_batch)

Indexing Quick Reference

| Operation | Syntax | Returns |
|---|---|---|
| Single element | `arr[i]` | scalar |
| Slice (1D) | `arr[start:stop:step]` | view |
| Row (2D) | `mat[i, :]` or `mat[i]` | view |
| Column (2D) | `mat[:, j]` | view |
| Sub-matrix | `mat[r1:r2, c1:c2]` | view |
| Boolean mask | `arr[arr > 0]` | copy |
| Fancy indexing | `arr[np.array([0,2,5])]` | copy |
| Conditional select | `np.where(cond, a, b)` | copy |
| Top-k indices | `np.argsort(arr)[::-1][:k]` | copy |