Python Essentials for AI Engineers · Lesson 32 of 36

String Manipulation Problems

Problem 1: Reverse Words in a String

Problem: Given a string of words separated by spaces, reverse the order of words. Handle multiple spaces and leading/trailing whitespace.

Python

def reverse_words(s: str) -> str:
    """Return the words of ``s`` in reverse order, joined by single spaces.

    ``str.split()`` with no arguments splits on any run of whitespace and
    discards empty fields, so repeated spaces and leading/trailing
    whitespace disappear from the result.
    """
    tokens = s.split()
    tokens.reverse()
    return " ".join(tokens)

# Demo: a plain sentence, irregular spacing, a single word, and the empty string.
print(reverse_words("the quick brown fox"))      # "fox brown quick the"
print(reverse_words("  hello   world  "))        # "world hello"
print(reverse_words("single"))                   # "single"
print(reverse_words(""))                         # ""


# One-liner
def reverse_words_v2(s: str) -> str:
    """Return the words of ``s`` in reverse order, joined by single spaces.

    Defined with ``def`` instead of binding a lambda to a name, so the
    function reports its own name in tracebacks and can carry annotations
    and a docstring. The body is still a single expression.
    """
    return " ".join(s.split()[::-1])

# In AI: useful for checking if an LLM's word order is correct
def check_word_order(original: str, response: str) -> bool:
    """Check if the response uses the same words in the same order.

    The comparison is case-insensitive and ignores differences in
    whitespace. Token lists are compared rather than sets: a set throws
    away both ordering and repeated words, so it would accept any
    permutation of the original.

    Args:
        original: Reference text.
        response: Text to compare against the reference.

    Returns:
        True when both texts contain the same sequence of words.
    """
    return original.lower().split() == response.lower().split()

Problem 2: Check if a String is a Palindrome

Problem: Return True if the string reads the same forward and backward, ignoring case, spaces, and non-alphanumeric characters.

Python

def is_palindrome(s: str) -> bool:
    """Return True if ``s`` reads the same in both directions.

    Only alphanumeric characters take part in the comparison, and they are
    compared case-insensitively.
    """
    normalized = []
    for ch in s:
        if ch.isalnum():
            normalized.append(ch.lower())
    return normalized == list(reversed(normalized))


# Demo: spaces and letter case are ignored, so the sentence inputs still qualify.
print(is_palindrome("racecar"))              # True
print(is_palindrome("A man a plan a canal Panama"))  # True
print(is_palindrome("hello"))               # False
print(is_palindrome("Was it a car or a cat I saw"))  # True

# Two-pointer version (O(1) space)
def is_palindrome_two_pointers(s: str) -> bool:
    """Check whether ``s`` is a palindrome, ignoring case and non-alphanumerics.

    Both pointers walk the original string and step over characters that
    are not alphanumeric, so no cleaned copy is built and the extra space
    really is O(1).

    Args:
        s: Text to test.

    Returns:
        True if the alphanumeric characters of ``s`` read the same in both
        directions. An empty or punctuation-only string counts as a
        palindrome.
    """
    left, right = 0, len(s) - 1
    while left < right:
        if not s[left].isalnum():
            left += 1        # skip punctuation/space on the left
        elif not s[right].isalnum():
            right -= 1       # skip punctuation/space on the right
        elif s[left].lower() != s[right].lower():
            return False
        else:
            left += 1
            right -= 1
    return True

Problem 3: Find All Anagrams

Problem: Given a string s and a pattern p, find all starting indices in s where the substring is an anagram of p.

Python

from collections import Counter

def find_anagrams(s: str, p: str) -> list[int]:
    """Return every start index in ``s`` where an anagram of ``p`` begins.

    A window of ``len(p)`` characters slides across ``s``. Its character
    counts are updated incrementally (one character enters, one leaves)
    and compared with the counts of ``p`` after each step.
    """
    width = len(p)
    if width > len(s):
        return []

    target = Counter(p)
    window = Counter(s[:width])
    starts = [0] if window == target else []

    # Pair each character entering the window with the one leaving it;
    # `start` is the index at which the shifted window begins.
    for start, (incoming, outgoing) in enumerate(zip(s[width:], s), start=1):
        window[incoming] += 1
        window[outgoing] -= 1
        if not window[outgoing]:
            # Drop zero entries so the two Counters compare equal key-for-key.
            del window[outgoing]

        if window == target:
            starts.append(start)

    return starts


# Demo: matching windows may overlap, as in the second call.
print(find_anagrams("cbaebabacd", "abc"))  # [0, 6]
print(find_anagrams("abab", "ab"))         # [0, 1, 2]

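Complexity: O(n) time for a fixed alphabet — the sliding window updates two counts per step instead of rebuilding the Counter. Each Counter comparison costs O(k) for k distinct characters, so the general bound is O(n · k).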

Problem 4: Parse Structured LLM Output

Problem: An LLM returns answers in a specific format. Parse the structured text into a dict.

Python

import re

def parse_drug_report(llm_output: str) -> dict:
    """
    Turn "Key: value" lines of LLM output into a dict, e.g.:
    Drug: warfarin
    Category: anticoagulant
    Dose: 5mg daily
    Monitoring: INR weekly

    Keys are lower-cased with spaces replaced by underscores. Lines
    without a colon are ignored, and only the first colon on a line
    separates key from value.
    """
    fields = {}

    for raw_line in llm_output.strip().split("\n"):
        name, separator, content = raw_line.strip().partition(":")
        if not separator:
            # Blank line or no colon at all: nothing to record.
            continue
        fields[name.strip().lower().replace(" ", "_")] = content.strip()

    return fields


# Sample report in the "Key: value" layout that parse_drug_report reads.
output = """
Drug: warfarin
Category: anticoagulant
Dose: 5mg daily
Monitoring: INR weekly initially, then monthly when stable
"""

parsed = parse_drug_report(output)
print(parsed)
# {"drug": "warfarin", "category": "anticoagulant", "dose": "5mg daily",
#  "monitoring": "INR weekly initially, then monthly when stable"}


# More robust: regex-based parser
def parse_with_regex(text: str) -> dict:
    """Parse key: value pairs using regex — handles inconsistent spacing.

    Each line of the form ``Key: value`` contributes one entry. Keys may
    contain ASCII letters and spaces only; they are lower-cased and their
    spaces become underscores. A line whose value is missing is skipped.

    Args:
        text: Multi-line text containing ``Key: value`` lines.

    Returns:
        Dict mapping normalised keys to stripped values.
    """
    # [ \t]* rather than \s*: \s also matches "\n", which would let a key
    # with an empty value swallow the entire next line as its value.
    pattern = re.compile(r"^([A-Za-z ]+):[ \t]*(.+)$", re.MULTILINE)
    return {
        key.strip().lower().replace(" ", "_"): value.strip()
        for key, value in pattern.findall(text)
    }

Problem 5: Clean LLM Output

Problem: LLM output often has extra formatting, markdown, or repeated text. Clean it for downstream processing.

Python

import re

def clean_llm_output(text: str) -> str:
    """
    Remove common LLM output artifacts:
    - Markdown code fences (```...```), keeping the fenced content
    - Stock preambles ("As an AI language model...", "Certainly!", ...)
    - Repeated consecutive newlines (collapsed to one blank line)
    - Leading/trailing whitespace

    Args:
        text: Raw model output.

    Returns:
        The cleaned text.
    """
    # Drop the fence markers but keep what they enclose. The language tag
    # only counts as a tag when a newline follows it; otherwise a one-line
    # fence such as ```hello``` would lose its content to the "tag" part.
    text = re.sub(r"```(?:\w*\n)?(.*?)```", r"\1", text, flags=re.DOTALL)

    # Remove common AI preambles. The \b anchors stop the phrases from
    # matching inside longer words ("Uncertainly", "Has an AI ...").
    preambles = [
        r"\bAs an AI\b(?: language model\b)?,?\s*",
        r"\bI(?:'m| am) sorry\b,?\s*(?:but\b\s*)?",
        r"\bCertainly\b[!,.]?\s*",
        r"\bOf course\b[!,.]?\s*",
    ]
    for pattern in preambles:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Collapse multiple blank lines into one
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Strip leading/trailing whitespace
    return text.strip()


raw = """
Certainly! As an AI language model, I can help.

```python
def hello():
    return "world"
```

The answer is warfarin 5mg daily.

Please consult your physician.
"""

print(clean_llm_output(raw))
# Prints the cleaned text, i.e. the string:
# 'I can help.\n\ndef hello():\n    return "world"\n\nThe answer is warfarin 5mg daily.\n\nPlease consult your physician.'


Problem 6: Extract Drug Names from Clinical Text

Problem: Given a list of known drug names, extract all mentions from a clinical note (case-insensitive, whole-word matching).

Python

import re
from collections import defaultdict

def extract_drug_mentions(
    text: str,
    drug_names: list[str],
) -> dict[str, list[int]]:
    """
    Extract all drug mentions from text.

    Matching is case-insensitive and whole-word. Names that differ only in
    case are treated as one drug, and empty names are ignored.

    Args:
        text: Clinical note to search.
        drug_names: Known drug names to look for.

    Returns: {drug_name: [list of character positions]}, keyed by the
        lower-cased name. Drugs with no mention are omitted.
    """
    mentions: dict[str, list[int]] = {}
    seen: set[str] = set()

    for drug in drug_names:
        key = drug.lower()
        # An empty pattern would match at every word boundary, and a repeated
        # name (e.g. "Aspirin" and "aspirin") would record each position twice.
        if not drug or key in seen:
            continue
        seen.add(key)

        # Lookarounds act like \b for ordinary names (no match for "aspirin"
        # inside "aspirinate") but also work when the name itself starts or
        # ends with a non-word character, e.g. "(R)-warfarin".
        pattern = re.compile(
            r"(?<!\w)" + re.escape(drug) + r"(?!\w)", re.IGNORECASE
        )
        positions = [match.start() for match in pattern.finditer(text)]
        if positions:
            mentions[key] = positions

    return mentions


# Sample note. The string starts with a newline, so the first line of text
# begins at character index 1.
clinical_note = """
Patient is currently on Warfarin 5mg daily for atrial fibrillation.
She was started on Aspirin 81mg three weeks ago following her MI.
Metformin 500mg BID for Type 2 DM continues unchanged.
No known allergies to warfarin or aspirin.
"""

known_drugs = ["warfarin", "aspirin", "metformin", "lisinopril"]
mentions = extract_drug_mentions(clinical_note, known_drugs)
print(mentions)
# {"warfarin": [25, 212], "aspirin": [88, 224], "metformin": [135]}
# ("lisinopril" never appears in the note, so it has no entry.)


# Count mentions
mention_counts = {drug: len(positions) for drug, positions in mentions.items()}
print(mention_counts)
# {"warfarin": 2, "aspirin": 2, "metformin": 1}

Common String Methods Reference

Python

# Sample string with two leading and two trailing spaces.
s = "  Warfarin 5mg PO Daily  "

# Case
s.lower()         # "  warfarin 5mg po daily  "
s.upper()         # "  WARFARIN 5MG PO DAILY  "
s.title()         # "  Warfarin 5Mg Po Daily  "

# Whitespace
s.strip()         # "Warfarin 5mg PO Daily"
s.lstrip()        # "Warfarin 5mg PO Daily  "
s.rstrip()        # "  Warfarin 5mg PO Daily"

# Search
s.find("5mg")     # 11 — first index (or -1 if not found)
s.count("a")      # 3 — count occurrences (two in "Warfarin", one in "Daily")
s.startswith("W") # False — whitespace at start
s.strip().startswith("W")  # True

# Replace
s.replace("PO", "by mouth")  # "  Warfarin 5mg by mouth Daily  "

# Split / join
parts = s.strip().split()    # ["Warfarin", "5mg", "PO", "Daily"]
" | ".join(parts)             # "Warfarin | 5mg | PO | Daily"

# Check character types
"abc123".isalnum()   # True
"abc".isalpha()      # True
"123".isdigit()      # True
"abc".isspace()      # False

Interview: NumPy Problem Walk-Through

Next Lesson

List and Array Problems