Copy docker run --rm zchencow/innozverse-python:latest python3 -c "
import pandas as pd
import numpy as np
# Step 3: GroupBy aggregation
products = pd.DataFrame({
'name': ['Surface Pro', 'Surface Pen', 'Office 365', 'USB-C Hub', 'Surface Book', 'Teams'],
'price': [864.0, 49.99, 99.99, 29.99, 1299.0, 6.0],
'stock': [15, 80, 999, 0, 5, 10000],
'category': ['Laptop', 'Accessory', 'Software', 'Accessory', 'Laptop', 'Software'],
'rating': [4.8, 4.6, 4.5, 4.2, 4.9, 4.3],
})
products['value'] = products['price'] * products['stock']
cat_stats = products.groupby('category').agg(
count=('name', 'count'),
avg_price=('price', 'mean'),
total_value=('value', 'sum'),
avg_rating=('rating', 'mean'),
).round(2)
print('=== By Category ===')
print(cat_stats)
# Step 4: Merge (join)
sales = pd.DataFrame({
'product_name': ['Surface Pro', 'Surface Pen', 'Surface Pro', 'Office 365', 'Surface Book'],
'date': pd.to_datetime(['2026-03-01', '2026-03-01', '2026-03-02', '2026-03-02', '2026-03-03']),
'qty': [2, 5, 1, 10, 1],
})
merged = sales.merge(products[['name', 'price', 'category']], left_on='product_name', right_on='name')
merged['revenue'] = merged['qty'] * merged['price']
print()
print('=== Sales with product info ===')
print(merged[['date', 'product_name', 'qty', 'revenue']].to_string(index=False))
# Step 5: Pivot table
pivot = merged.pivot_table(
values='revenue', index='date', columns='category', aggfunc='sum', fill_value=0
)
print()
print('=== Revenue pivot (date × category) ===')
print(pivot.round(2))
# Step 6: Time series
dates = pd.date_range('2026-01-01', periods=90, freq='D')
rng = np.random.default_rng(42)
daily_sales = pd.Series(rng.integers(10, 100, len(dates)) + np.sin(np.arange(len(dates))*0.2)*20,
index=dates, name='sales')
weekly = daily_sales.resample('W').sum()
monthly = daily_sales.resample('ME').sum()
rolling = daily_sales.rolling(7).mean()
print()
print('=== Time Series (90 days) ===')
print(f'Total sales: {daily_sales.sum():.0f}')
print(f'Best week: {weekly.max():.0f} ({weekly.idxmax().date()})')
print(f'Best month: {monthly.max():.0f}')
print(f'7-day MA (last 5):\n{rolling.tail().round(1)}')
# Step 7: Data cleaning
raw = pd.DataFrame({
'name': ['Surface Pro', None, 'Surface Pen', 'Surface Pro', '', 'Office 365'],
'price': [864.0, 49.99, None, 864.0, 29.99, -10.0],
'stock': ['15', '80', '999', '15', '0', 'bad'],
})
print()
print('=== Data Cleaning ===')
print(f'Before: {len(raw)} rows, {raw.isnull().sum().sum()} nulls')
cleaned = (raw
.dropna(subset=['name'])
.query('name != \"\"')
.assign(price=pd.to_numeric(raw['price'], errors='coerce'))
.assign(stock=pd.to_numeric(raw['stock'], errors='coerce').fillna(0).astype(int))
.query('price > 0')
.drop_duplicates(subset=['name'])
.reset_index(drop=True)
)
print(f'After: {len(cleaned)} rows')
print(cleaned.to_string(index=False))
# Step 8: Capstone — full data pipeline
print()
print('=== Capstone: Sales Analytics Pipeline ===')
# Generate synthetic data
np.random.seed(42)
n = 200
product_list = ['Surface Pro', 'Surface Pen', 'Office 365', 'USB-C Hub', 'Surface Book']
price_map = {'Surface Pro': 864, 'Surface Pen': 49.99, 'Office 365': 99.99, 'USB-C Hub': 29.99, 'Surface Book': 1299}
transactions = pd.DataFrame({
'date': pd.date_range('2026-01-01', periods=n, freq='6h')[:n],
'product': np.random.choice(product_list, n),
'qty': np.random.randint(1, 10, n),
'region': np.random.choice(['North', 'South', 'East', 'West'], n),
})
transactions['price'] = transactions['product'].map(price_map)
transactions['revenue'] = transactions['qty'] * transactions['price']
transactions['month'] = transactions['date'].dt.to_period('M')
# Analysis
summary = (transactions
.groupby(['month', 'product'])
.agg(orders=('qty', 'count'), units=('qty', 'sum'), revenue=('revenue', 'sum'))
.reset_index()
.sort_values('revenue', ascending=False)
)
top_products = (transactions.groupby('product')['revenue'].sum().sort_values(ascending=False))
by_region = (transactions.groupby('region')['revenue'].sum().sort_values(ascending=False))
print(f'Analyzed {len(transactions)} transactions')
print()
print('Top products by revenue:')
for prod, rev in top_products.items():
print(f' {prod:20s}: \${rev:,.2f}')
print()
print('Revenue by region:')
for region, rev in by_region.items():
print(f' {region:6s}: \${rev:,.2f}')
print()
print(f'Total revenue: \${transactions[\"revenue\"].sum():,.2f}')
print(f'Avg order: \${transactions[\"revenue\"].mean():.2f}')
print(f'Best product: {top_products.index[0]}')
"