Python Essentials for AI Engineers · Lesson 32 of 36
String Manipulation Problems
Problem 1: Reverse Words in a String
Problem: Given a string of words separated by spaces, reverse the order of words. Handle multiple spaces and leading/trailing whitespace.
def reverse_words(s: str) -> str:
    """Return the words of ``s`` in reverse order, joined by single spaces.

    Runs of whitespace count as one separator and leading/trailing
    whitespace is dropped, so an empty or all-whitespace string gives "".
    """
    # split() with no arguments splits on any whitespace and never yields
    # empty strings, which covers repeated, leading and trailing spaces.
    words = s.split()
    return " ".join(reversed(words))
# Demo calls: the trailing comment on each line shows the returned string.
print(reverse_words("the quick brown fox")) # "fox brown quick the"
print(reverse_words(" hello world ")) # "world hello" (surrounding whitespace dropped)
print(reverse_words("single")) # "single"
print(reverse_words("")) # "" (no words to join)
# One-liner
def reverse_words_v2(s: str) -> str:
    """Return the whitespace-separated words of ``s`` in reverse order.

    Written as a ``def`` rather than a lambda bound to a name, so the
    function has a real ``__name__`` in tracebacks.
    """
    return " ".join(s.split()[::-1])
# In AI: useful for checking if an LLM's word order is correct
def check_word_order(original: str, response: str) -> bool:
    """Check if the response uses the same words in the same order.

    The comparison is case-insensitive and ignores differences in the
    amount of whitespace between words. Word sequences are compared as
    lists, so both the order and the number of repeats must match.
    """
    # Lists, not sets: a set comparison would accept any reordering.
    return original.lower().split() == response.lower().split()


# Problem 2: Check if a String is a Palindrome
Problem: Return True if the string reads the same forward and backward, ignoring case, spaces, and non-alphanumeric characters.
def is_palindrome(s: str) -> bool:
    """Check if string is a palindrome (ignoring case and non-alphanumeric).

    A string with no alphanumeric characters (including "") counts as a
    palindrome, because the cleaned sequence is empty.
    """
    # Clean: lowercase, keep only alphanumeric
    cleaned = [c.lower() for c in s if c.isalnum()]
    return cleaned == cleaned[::-1]
# Demo calls: the trailing comment on each line shows the returned value.
print(is_palindrome("racecar")) # True
print(is_palindrome("A man a plan a canal Panama")) # True (case and spaces ignored)
print(is_palindrome("hello")) # False
print(is_palindrome("Was it a car or a cat I saw")) # True
# Two-pointer version (O(1) space)
def is_palindrome_two_pointers(s: str) -> bool:
    """Check if ``s`` is a palindrome, ignoring case and non-alphanumerics.

    Two indices walk inward over the original string and skip characters
    that are not alphanumeric, so no cleaned copy of the input is built.
    A string with no alphanumeric characters counts as a palindrome.
    """
    left, right = 0, len(s) - 1
    while left < right:
        if not s[left].isalnum():
            left += 1  # skip punctuation/space on the left
        elif not s[right].isalnum():
            right -= 1  # skip punctuation/space on the right
        elif s[left].lower() != s[right].lower():
            return False
        else:
            left += 1
            right -= 1
    return True


# Problem 3: Find All Anagrams
Problem: Given a string s and a pattern p, find all starting indices in s where the substring is an anagram of p.
from collections import Counter
def find_anagrams(s: str, p: str) -> list[int]:
    """Find all start indices of anagrams of p in s.

    Slides a window of ``len(p)`` characters across ``s`` and keeps a
    running character count, so each step updates two counts instead of
    recounting the whole window. Returns the indices in ascending order,
    or an empty list when ``p`` is longer than ``s``.
    """
    width = len(p)
    if width > len(s):
        return []

    p_count = Counter(p)
    window_count = Counter(s[:width])
    result = [0] if window_count == p_count else []

    for i in range(width, len(s)):
        # Add new character to window
        window_count[s[i]] += 1
        # Remove oldest character from window
        old_char = s[i - width]
        window_count[old_char] -= 1
        if window_count[old_char] == 0:
            # Drop zero entries so equality with p_count does not depend
            # on how Counter treats zero counts.
            del window_count[old_char]
        if window_count == p_count:
            result.append(i - width + 1)
    return result
# Demo calls: the trailing comment on each line shows the returned list.
print(find_anagrams("cbaebabacd", "abc")) # [0, 6]
print(find_anagrams("abab", "ab")) # [0, 1, 2]
# Complexity: O(n) time — sliding window avoids recomputing the entire Counter each step.
# (Each step still compares two Counters, so this treats the number of
# distinct characters as a constant.)
Problem 4: Parse Structured LLM Output
Problem: An LLM returns answers in a specific format. Parse the structured text into a dict.
import re
def parse_drug_report(llm_output: str) -> dict:
    """
    Parse "Key: value" lines of LLM output into a dict.

    Expected format, one field per line:

        Drug: warfarin
        Category: anticoagulant
        Dose: 5mg daily
        Monitoring: INR weekly

    Keys are stripped, lowercased, and have spaces replaced by
    underscores; values are stripped. Only the first colon on a line
    separates key from value, so values may contain colons. Lines with no
    colon or with nothing before the colon are skipped. When a key
    repeats, the last occurrence wins.
    """
    result = {}
    for raw_line in llm_output.splitlines():
        key, sep, value = raw_line.partition(":")
        key = key.strip()
        # No colon at all, or nothing before it: not a field line.
        if not sep or not key:
            continue
        result[key.lower().replace(" ", "_")] = value.strip()
    return result
# Demo: parse a four-field report; each "Key: value" line becomes one entry.
output = """
Drug: warfarin
Category: anticoagulant
Dose: 5mg daily
Monitoring: INR weekly initially, then monthly when stable
"""
parsed = parse_drug_report(output)
print(parsed)
# {"drug": "warfarin", "category": "anticoagulant", "dose": "5mg daily",
#  "monitoring": "INR weekly initially, then monthly when stable"}
# More robust: regex-based parser
def parse_with_regex(text: str) -> dict:
    """Parse key: value pairs using regex — handles inconsistent spacing.

    A field is a line made of a key (letters and spaces only), a colon,
    and a non-empty value on the same line. Keys are stripped, lowercased,
    and have spaces replaced by underscores; values are stripped. Lines
    that do not fit this shape are ignored.
    """
    # [ \t]* rather than \s* after the colon: \s also matches "\n", which
    # would let a field with an empty value take the next line as its value.
    # \S forces the value to contain at least one non-blank character.
    pattern = re.compile(r"^([A-Za-z ]+):[ \t]*(\S.*)$", re.MULTILINE)
    return {
        key.strip().lower().replace(" ", "_"): value.strip()
        for key, value in pattern.findall(text)
        if key.strip()  # a key of only spaces is not a field
    }


# Problem 5: Clean LLM Output
Problem: LLM output often has extra formatting, markdown, or repeated text. Clean it for downstream processing.
import re
def clean_llm_output(text: str) -> str:
    """
    Remove common LLM output artifacts:
    - Markdown code fences (```...```), keeping the code between them
    - Leading/trailing whitespace
    - Repeated consecutive newlines
    - "As an AI language model..." style preamble phrases

    A preamble phrase is removed only where it opens a line or directly
    follows sentence-ending punctuation, and only as a whole phrase.
    Ordinary text that merely contains the same letters (for example
    "has an airway" or "this is certainly true") is left untouched.
    """
    # Remove code fences
    text = re.sub(r"```[\w]*\n?(.*?)```", r"\1", text, flags=re.DOTALL)

    # Where a preamble may begin: start of a line, or right after ". ",
    # "! " or "? ". Any indentation in front of it is removed with it.
    sentence_start = r"(?:^|(?<=[.!?]\s))[ \t]*"
    # Remove common AI preambles (\b keeps each phrase from ending mid-word)
    preambles = [
        r"As an AI\b(?: language model\b)?,?\s*",
        r"I(?:'m| am) sorry\b,?\s*(?:but\b\s*)?",
        r"Certainly\b[!,.]?\s*",
        r"Of course\b[!,.]?\s*",
    ]
    for pattern in preambles:
        text = re.sub(
            sentence_start + pattern,
            "",
            text,
            flags=re.IGNORECASE | re.MULTILINE,
        )

    # Collapse multiple blank lines into one
    text = re.sub(r"\n{3,}", "\n\n", text)
    # Strip leading/trailing whitespace
    return text.strip()
# Demo input: a preamble, a fenced code block, and extra blank lines.
# The closing fence and the blank lines are part of what is being cleaned.
raw = """
Certainly! As an AI language model, I can help.

```python
def hello():
    return "world"
```

The answer is warfarin 5mg daily.


Please consult your physician. """
print(clean_llm_output(raw))
# Prints the cleaned text; as a Python string it is:
# 'I can help.\n\ndef hello():\n    return "world"\n\nThe answer is warfarin 5mg daily.\n\nPlease consult your physician.'
---
## Problem 6: Extract Drug Names from Clinical Text
**Problem:** Given a list of known drug names, extract all mentions from a clinical note (case-insensitive, whole-word matching).
```python
import re
from collections import defaultdict
def extract_drug_mentions(
    text: str,
    drug_names: list[str],
) -> dict[str, list[int]]:
    """
    Extract all drug mentions from text.

    Matching is case-insensitive and whole-word. Names are stripped of
    surrounding whitespace; blank names are ignored, and names that differ
    only by case are searched once so positions are not recorded twice.

    Returns: {lowercased drug name: [character positions of each match]}.
    Drugs that never appear in the text have no key in the result.
    """
    mentions: dict[str, list[int]] = {}
    seen: set[str] = set()
    for drug in drug_names:
        name = drug.strip()
        key = name.lower()
        # An empty pattern would match at every word boundary; a repeated
        # name would append the same positions a second time.
        if not name or key in seen:
            continue
        seen.add(key)
        # \b = word boundary — avoids matching "aspirin" inside "aspirinate"
        pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
        positions = [match.start() for match in pattern.finditer(text)]
        if positions:
            mentions[key] = positions
    return mentions
# Demo: find known drug names in a short clinical note.
clinical_note = """
Patient is currently on Warfarin 5mg daily for atrial fibrillation.
She was started on Aspirin 81mg three weeks ago following her MI.
Metformin 500mg BID for Type 2 DM continues unchanged.
No known allergies to warfarin or aspirin.
"""
known_drugs = ["warfarin", "aspirin", "metformin", "lisinopril"]
mentions = extract_drug_mentions(clinical_note, known_drugs)
print(mentions)
# {"warfarin": [25, 212], "aspirin": [88, 224], "metformin": [135]}
# Offsets are character positions in the note exactly as written above
# (index 0 is the newline right after the opening quotes).
# "lisinopril" never appears in the note, so it has no key.
# Count mentions
mention_counts = {drug: len(positions) for drug, positions in mentions.items()}
print(mention_counts)
# {"warfarin": 2, "aspirin": 2, "metformin": 1}

# Common String Methods Reference
# Sample string with one leading and one trailing space.
s = " Warfarin 5mg PO Daily "
# Case
s.lower() # " warfarin 5mg po daily "
s.upper() # " WARFARIN 5MG PO DAILY "
s.title() # " Warfarin 5Mg Po Daily " — note "5Mg": a letter after a digit is capitalized
# Whitespace
s.strip() # "Warfarin 5mg PO Daily"
s.lstrip() # "Warfarin 5mg PO Daily "
s.rstrip() # " Warfarin 5mg PO Daily"
# Search
s.find("5mg") # 10 — first index (or -1 if not found)
s.count("a") # 3 — count occurrences (case-sensitive: two in "Warfarin", one in "Daily")
s.startswith("W") # False — whitespace at start
s.strip().startswith("W") # True
# Replace
s.replace("PO", "by mouth") # " Warfarin 5mg by mouth Daily " — returns a new string
# Split / join
parts = s.strip().split() # ["Warfarin", "5mg", "PO", "Daily"]
" | ".join(parts) # "Warfarin | 5mg | PO | Daily"
# Check character types
"abc123".isalnum() # True
"abc".isalpha() # True
"123".isdigit() # True
"abc".isspace() # False