Build a complete text classification pipeline using TF-IDF, word embeddings, and BERT-style contextual representations. Classify security advisories, threat intelligence reports, and vulnerability descriptions automatically.
Text classification evolved through three generations:
Gen 1 — Bag of Words / TF-IDF (1990s–2010s):
"SQL injection attack" → [sql:0.4, injection:0.5, attack:0.3, ...] (sparse)
Fast, interpretable, works well for many tasks
Gen 2 — Word2Vec / GloVe (2013–2018):
"SQL" → [0.12, -0.34, 0.56, ...] (dense 300-dim, captures semantics)
"injection" → [0.09, -0.41, 0.61, ...] (similar to "SQL" in this space)
Gen 3 — BERT / Transformers (2018–now):
"I saw the bank" → bank (financial) vs bank (river) context-aware
Each token gets a different vector depending on surrounding context
Step 1: Environment Setup
📸 Verified Output:
Step 2: Build a Security Text Dataset
📸 Verified Output:
Step 3: TF-IDF Vectorisation
📸 Verified Output:
💡 Character n-grams are powerful for security text — they capture typos, obfuscated terms, and morphological variants. Unigrams+bigrams give the best word-level coverage.
💡 The simulated contextual embeddings create perfectly separable clusters in 768-dimensional space — hence the 100% accuracy with a linear classifier. Real BERT on real security text typically achieves 95–99% on well-defined categories.
Step 6: Confusion Analysis
📸 Verified Output:
💡 The one XSS→buffer_overflow misclassification typically comes from text mentioning both "script execution" and "memory" — borderline documents that even human analysts might debate.
💡 A model like this, deployed in a CVE intake pipeline, auto-categorises 90%+ of incoming vulnerabilities — saving security teams hours of manual triage daily.
Summary
Method
Strengths
Best For
TF-IDF + Logistic Reg
Fast, interpretable, no GPU
Short texts, interpretability needed
Word embeddings + SVM
Captures semantics
Medium datasets
BERT embeddings
Best accuracy, context-aware
Production, GPU available
Char n-grams
Handles typos/obfuscation
Security evasion text
Key Takeaways:
TF-IDF with unigrams+bigrams is a strong baseline — try it first
Sublinear TF (sublinear_tf=True) improves performance for long documents
Top feature weights reveal what the model actually learns
Real BERT: use sentence-transformers library for fast embedding extraction
docker run -it --rm zchencow/innozverse-ai:latest bash
import warnings

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB

# Silence sklearn convergence / deprecation chatter for cleaner tutorial output.
warnings.filterwarnings('ignore')
print("Ready")
Ready
import numpy as np

# Cybersecurity text dataset: 5 categories
categories = {
    'sql_injection': [
        "SQL injection attack detected in login parameter",
        "Attacker used UNION SELECT to extract password hashes",
        "Blind SQL injection via time-based payloads in search field",
        "Error-based SQL injection revealed database schema",
        "SQLi payload in cookie bypassed WAF filters",
        "Database dump via stacked queries in REST API endpoint",
        "Second-order SQL injection in profile update function",
        "Out-of-band SQL injection using DNS exfiltration",
    ],
    'xss': [
        "Reflected XSS in search parameter allows script injection",
        "Stored XSS payload persisted in user profile bio field",
        "DOM-based XSS via document.location.hash manipulation",
        "CSP bypass using JSONP endpoint for script execution",
        "XSS worm spreading through stored payload in messages",
        "Cross-site scripting via SVG file upload",
        "XSS via malformed HTML attribute in user input",
        "Reflected XSS in error message template injection",
    ],
    'buffer_overflow': [
        "Stack buffer overflow in parsing function allows RCE",
        "Heap overflow in image decoder exploited for code execution",
        "Integer overflow leads to buffer underflow in allocator",
        "Format string vulnerability in logging function",
        "Use-after-free in browser engine allows arbitrary write",
        "Null pointer dereference causing denial of service",
        "Off-by-one buffer overflow in string copy function",
        "Return-oriented programming chain exploiting overflow",
    ],
    'network_attack': [
        "DDoS attack using UDP flood targeting web server",
        "SYN flood exhausting connection table on firewall",
        "DNS amplification attack using open resolvers",
        "Man-in-the-middle attack intercepting HTTPS traffic",
        "ARP poisoning redirecting traffic on local network",
        "VLAN hopping attack bypassing network segmentation",
        "BGP hijacking redirecting traffic through malicious AS",
        "ICMP tunnelling for covert data exfiltration",
    ],
    'malware': [
        "Ransomware encrypting files using AES-256 algorithm",
        "Trojan horse disguised as legitimate software update",
        "Keylogger capturing credentials and sending to C2 server",
        "Rootkit hiding malicious processes from operating system",
        "Worm propagating through network shares using EternalBlue",
        "Botnet node receiving commands from Tor hidden service",
        "Fileless malware executing in PowerShell memory only",
        "Spyware exfiltrating screenshots every 30 seconds",
    ],
}

# Flatten into parallel text/label arrays, oversampling each category 5x.
# NOTE: the duplication repeats documents verbatim — no variation is applied.
label_names = list(categories.keys())
texts = np.array([doc for docs in categories.values() for doc in docs * 5])
labels = np.array(
    [cat_idx for cat_idx, docs in enumerate(categories.values()) for _ in docs * 5]
)

# Deterministic shuffle so every run sees the same document order.
np.random.seed(42)
order = np.random.permutation(len(texts))
texts, labels = texts[order], labels[order]

print(f"Dataset: {len(texts)} documents, {len(label_names)} categories")
print(f"Categories: {label_names}")
print(f"Samples per category: {len(texts)//len(label_names)}")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Simulate word embeddings (in real projects: use GloVe or fastText pretrained vectors)
np.random.seed(42)

# Build vocabulary. The unique tokens are SORTED before index assignment:
# plain set() iteration order varies with PYTHONHASHSEED, which would shuffle
# the word->row mapping (and therefore every downstream number) between runs
# despite the fixed NumPy seed above.
all_text = ' '.join(texts)
words = sorted(set(all_text.lower().split()))
vocab = {w: i for i, w in enumerate(words)}
vocab_size = len(vocab)
embedding_dim = 50

# "Pretrained" embeddings (simulated — real: load from file)
embeddings = np.random.randn(vocab_size, embedding_dim) * 0.1

# Group related security terms (simulate trained embeddings)
security_groups = [
    ['sql', 'injection', 'query', 'database', 'union', 'select'],
    ['xss', 'script', 'javascript', 'cross-site', 'dom', 'reflected'],
    ['buffer', 'overflow', 'stack', 'heap', 'memory', 'exploit'],
    ['network', 'attack', 'flood', 'ddos', 'traffic', 'packet'],
    ['malware', 'ransomware', 'trojan', 'keylogger', 'botnet', 'worm'],
]

# Make related words similar in embedding space: pull each group's members
# close to a shared random centre, mimicking what real training produces.
for group in security_groups:
    group_centre = np.random.randn(embedding_dim)
    for word in group:
        if word in vocab:
            embeddings[vocab[word]] = group_centre + np.random.randn(embedding_dim) * 0.05
def text_to_embedding(text, vocab, embeddings):
    """Encode a document as the mean of its in-vocabulary word vectors.

    Tokenisation is a simple lowercase whitespace split; out-of-vocabulary
    words are skipped. Returns the zero vector when no token is in *vocab*.
    """
    row_ids = [vocab[tok] for tok in text.lower().split() if tok in vocab]
    if not row_ids:
        return np.zeros(embeddings.shape[1])
    return embeddings[row_ids].mean(axis=0)
# Encode all documents as averaged word-embedding vectors
X_emb = np.array([text_to_embedding(t, vocab, embeddings) for t in texts])

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise features (SVMs are scale-sensitive), then score an RBF SVM
# with stratified 5-fold cross-validation for a class-balanced estimate.
scaler = StandardScaler()
X_emb_s = scaler.fit_transform(X_emb)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc = cross_val_score(SVC(kernel='rbf'), X_emb_s, labels, cv=cv, scoring='accuracy').mean()
print(f"Word embedding (avg pooling) + SVM: {acc:.4f}")
# Document count derived from the data rather than hard-coded (was "200 docs"),
# so the message stays correct if the dataset size changes.
print(f"Embedding shape: {X_emb.shape} ({len(texts)} docs × {embedding_dim}-dim vectors)")