Systematically find and fix Python performance bottlenecks using timeit, cProfile, pstats, tracemalloc, and algorithmic improvements: memoization, vectorisation with array, generator pipelines, and algorithmic complexity reduction.
Background
Premature optimisation is the root of all evil — but informed optimisation after profiling is engineering. The workflow is always: measure → identify hotspot → optimise → measure again. Python gives you first-class profiling tools in the standard library.
Time
35 minutes
Prerequisites
Lab 03 (Memory Management)
Tools
Docker: zchencow/innozverse-python:latest
Lab Instructions
Step 1: timeit — Micro-benchmarking
💡 Always benchmark with number= large enough to get stable readings. A single run of a fast function can be dominated by OS noise. Use timeit.timeit(..., number=1000) and divide — or use timeit.repeat(..., repeat=5, number=1000) and take the minimum (not mean) to exclude scheduling jitter.
docker run --rm zchencow/innozverse-python:latest python3 -c "
# --- Step 1 demo: timeit micro-benchmarks --------------------------------
# Compares summation strategies, string-building approaches, and
# membership testing across list / dict / set.
import timeit, array

N = 100_000

# Compare summation strategies
def naive_sum(n): return sum(range(n))      # builtin sum iterates at C speed

def loop_sum(n):
    # Pure-Python loop: pays interpreter overhead on every iteration.
    t = 0
    for i in range(n): t += i
    return t

def array_sum(n): return sum(array.array('l', range(n)))  # box/unbox cost dominates

def formula(n): return n * (n - 1) // 2 # O(1) closed form

results = {}
for fn in [naive_sum, loop_sum, array_sum, formula]:
    # Default-arg binding (fn=fn) freezes the loop variable per lambda
    # (late-binding closures would otherwise all call the last fn).
    elapsed = timeit.timeit(lambda fn=fn: fn(N), number=20) / 20
    results[fn.__name__] = elapsed
    print(f' {fn.__name__:15s}: {elapsed*1000:.3f} ms')
fastest = min(results, key=results.get)
slowest = max(results, key=results.get)
print(f'Speedup {fastest} vs {slowest}: {results[slowest]/results[fastest]:.0f}x')

# String building comparison
print()
n = 5_000
# Quadratic += concatenation (the walrus comprehension is only a compact
# way to run the statement repeatedly inside a timeit snippet).
t_plus = timeit.timeit('s=\"\"; [s := s+str(i) for i in range(n)]', globals={'n':n}, number=50) / 50
t_join = timeit.timeit('\"\".join(str(i) for i in range(n))', globals={'n':n}, number=50) / 50
t_list = timeit.timeit('lst=[]; [lst.append(str(i)) for i in range(n)]; \"\".join(lst)', globals={'n':n}, number=50) / 50
print(f'String +: {t_plus*1000:.2f} ms')
print(f'str.join(gen): {t_join*1000:.2f} ms ({t_plus/t_join:.0f}x faster)')
print(f'list + join: {t_list*1000:.2f} ms ({t_plus/t_list:.0f}x faster)')

# Dict lookup vs list search
haystack_list = list(range(10_000))
haystack_dict = {i: True for i in range(10_000)}
haystack_set = set(range(10_000))
target = 9_999 # worst case for the list scan: last element
t_list_search = timeit.timeit('target in haystack_list', globals=locals(), number=10_000) / 10_000
t_dict_lookup = timeit.timeit('target in haystack_dict', globals=locals(), number=10_000) / 10_000
t_set_lookup = timeit.timeit('target in haystack_set', globals=locals(), number=10_000) / 10_000
print()
print(f'list search: {t_list_search*1e6:.2f} µs O(n)')
print(f'dict lookup: {t_dict_lookup*1e6:.2f} µs O(1)')
print(f'set lookup: {t_set_lookup*1e6:.2f} µs O(1)')
print(f'List vs set: {t_list_search/t_set_lookup:.0f}x slowdown')
"
naive_sum : 2.847 ms
loop_sum : 15.823 ms
array_sum : 29.145 ms
formula : 0.000 ms
Speedup formula vs array_sum: 63000x
String +: 360.12 ms
str.join(gen): 1.73 ms (208x faster)
list + join: 1.45 ms (248x faster)
list search: 489.23 µs O(n)
dict lookup: 0.05 µs O(1)
set lookup: 0.04 µs O(1)
List vs set: 12231x slowdown
Step 2: cProfile + pstats — Finding Hotspots
docker run --rm zchencow/innozverse-python:latest python3 -c "
# --- Step 2 demo: cProfile + pstats on a realistic pipeline --------------
import cProfile, pstats, io

# Simulate a realistic workload
def load_products(n: int) -> list[dict]:
    """Build n synthetic product records (every 100th has zero stock)."""
    return [{'id': i, 'name': f'Product-{i}', 'price': i * 0.5 + 9.99, 'stock': i % 100}
            for i in range(n)]

def compute_discount(price: float, tier: str) -> float:
    """Apply the tier's discount; unknown tiers get no discount."""
    tiers = {'gold': 0.2, 'silver': 0.1, 'bronze': 0.05}
    return price * (1 - tiers.get(tier, 0))

def filter_active(products: list) -> list:
    """Keep only products with stock on hand."""
    return [p for p in products if p['stock'] > 0]

def enrich(products: list) -> list:
    """Attach a price tier and the discounted final price to each product."""
    result = []
    for p in products:
        # Tier is decided on the undiscounted price.
        tier = 'gold' if p['price'] > 500 else 'silver' if p['price'] > 100 else 'bronze'
        result.append({**p, 'tier': tier, 'final': compute_discount(p['price'], tier)})
    return result

def run():
    """End-to-end pipeline: load -> filter -> enrich -> aggregate."""
    products = load_products(5_000)
    active = filter_active(products)
    enriched = enrich(active)
    total = sum(p['final'] * p['stock'] for p in enriched)
    return total, len(enriched)

# Profile the whole pipeline.
pr = cProfile.Profile()
pr.enable()
total, count = run()
pr.disable()
# \$ keeps the dollar sign literal inside the enclosing shell double quotes.
print(f'Total value: \${total:,.2f} from {count} products')
print()
print('=== cProfile output (top 8 by cumulative time) ===')
sio = io.StringIO()
ps = pstats.Stats(pr, stream=sio).sort_stats('cumulative')
ps.print_stats(8)
# Skip the pstats banner lines; show only the stats table rows.
for line in sio.getvalue().splitlines()[3:14]:
    print(line)

# Profile by tottime (self time — no children)
print()
print('=== By self time (tottime) ===')
sio2 = io.StringIO()
pstats.Stats(pr, stream=sio2).sort_stats('tottime').print_stats(5)
for line in sio2.getvalue().splitlines()[3:10]:
    print(line)
"