import time
import warnings

import numpy as np
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')
class AutoMLPipeline:
    """
    Full AutoML: data → best model with best features, auto-configured.
    Steps: feature search → algorithm selection → hyperparam tuning → ensemble
    """

    # name -> (estimator class, hyperparameter bounds).
    # Integer bounds are sampled as inclusive ints, float bounds as uniform floats
    # (see _random_params).
    ALGORITHMS = {
        'random_forest': (RandomForestClassifier,
                          {'n_estimators': (100, 300), 'max_depth': (5, 20)}),
        'gradient_boost': (GradientBoostingClassifier,
                           {'n_estimators': (100, 200), 'learning_rate': (0.05, 0.3)}),
        'logistic': (LogisticRegression,
                     {'C': (0.01, 10.0)}),
    }

    def __init__(self, X, y, cv, time_budget_s: int = 30):
        """
        Parameters
        ----------
        X, y : training features / labels (any array-like sklearn accepts).
        cv : cross-validation splitter or fold count, passed to cross_val_score.
        time_budget_s : soft wall-clock budget; 70% is reserved for the
            hyperparameter search (see _search_algorithms).
        """
        self.X = X
        self.y = y
        self.cv = cv
        self.budget = time_budget_s
        self.results = []

    def _random_params(self, bounds: dict) -> dict:
        """Sample one random configuration from `bounds` ({name: (lo, hi)})."""
        params = {}
        for name, (lo, hi) in bounds.items():
            if isinstance(lo, int):
                # +1 because np.random.randint's upper bound is exclusive,
                # but our bounds are meant to be inclusive.
                params[name] = np.random.randint(lo, hi + 1)
            else:
                params[name] = np.random.uniform(lo, hi)
        return params

    def _make_kwargs(self, algo_name: str, supported: dict, params: dict) -> dict:
        """
        Augment sampled hyperparameters with fixed settings, adding only
        options the estimator actually supports (probed via `supported`,
        the estimator's get_params() dict).
        """
        kw = dict(params)
        if 'random_state' in supported:
            kw['random_state'] = 42  # reproducible trials
        if 'class_weight' in supported:
            kw['class_weight'] = 'balanced'
        if algo_name == 'logistic':
            kw['max_iter'] = 1000  # default 100 often fails to converge
        return kw

    def _search_algorithms(self, X, start: float) -> list:
        """
        Random-search each algorithm (5 trials max, stopping at 70% of the
        time budget) and return [(name, best_auc, (cls, kwargs))] sorted by
        AUC descending.
        """
        best_models = []
        for algo_name, (AlgoClass, bounds) in self.ALGORITHMS.items():
            supported = AlgoClass().get_params()  # probe once, not per trial
            best_auc, best_spec = 0.0, None
            for _ in range(5):
                if time.time() - start > self.budget * 0.7:
                    break
                kw = self._make_kwargs(algo_name, supported, self._random_params(bounds))
                try:
                    auc = cross_val_score(AlgoClass(**kw), X, self.y,
                                          cv=self.cv, scoring='roc_auc').mean()
                except Exception:
                    # Best-effort search: an invalid configuration skips the
                    # trial rather than aborting the whole run.
                    continue
                if auc > best_auc:
                    best_auc, best_spec = auc, (AlgoClass, kw)
            if best_spec:
                best_models.append((algo_name, best_auc, best_spec))
        best_models.sort(key=lambda item: item[1], reverse=True)
        return best_models

    def run(self) -> dict:
        """
        Execute the full pipeline: feature search, algorithm/hyperparam
        search, then a soft-voting ensemble of the top-3 models.

        Returns a dict with 'best_auc', 'best_features', 'ensemble_size'.
        Raises RuntimeError if no model could be trained within the budget.
        """
        start = time.time()
        # 1. Feature engineering (AutoFeatureEngineer defined elsewhere in
        #    this file; .search() presumably returns ranked
        #    (features, X, auc) tuples — TODO confirm).
        fe_res = AutoFeatureEngineer(self.X, self.y, self.cv).search()
        best_X = fe_res[0][1]
        print(f" [1/4] Best features: {fe_res[0][0]} (AUC={fe_res[0][2]:.4f})")
        # 2. Algorithm + hyperparam search
        best_models = self._search_algorithms(best_X, start)
        print(f" [2/4] Algorithm ranking:")
        for name, auc, _ in best_models:
            print(f" {name:<20} AUC={auc:.4f}")
        # 3. Ensemble top-3 (refit each winner on the full training set)
        estimators = []
        for name, auc, (AlgoClass, kw) in best_models[:3]:
            m = AlgoClass(**kw)
            m.fit(best_X, self.y)
            estimators.append((name, m))
        if not estimators:
            # Explicit failure instead of an opaque VotingClassifier error.
            raise RuntimeError("AutoML search produced no models within the time budget")
        ensemble = VotingClassifier(estimators=estimators, voting='soft')
        ens_auc = cross_val_score(ensemble, best_X, self.y, cv=self.cv, scoring='roc_auc').mean()
        print(f" [3/4] Ensemble AUC: {ens_auc:.4f}")
        print(f" [4/4] Time elapsed: {time.time()-start:.1f}s")
        return {'best_auc': ens_auc, 'best_features': fe_res[0][0], 'ensemble_size': len(estimators)}
# Driver: run the AutoML pipeline on the prepared data and report the outcome.
pipeline = AutoMLPipeline(X_s, y, cv, time_budget_s=60)
print("=== AutoML Pipeline ===\n")
result = pipeline.run()
# Assemble the report once and emit it with a single write.
report = [
    "\nFinal Result:",
    f" Best AUC: {result['best_auc']:.4f}",
    f" Best features:{result['best_features']}",
    f" Ensemble: {result['ensemble_size']} models",
]
print("\n".join(report))