Understand multimodal AI systems: how vision and language are fused in models like CLIP and GPT-4V, implement basic image-text similarity, build a visual question answering mock pipeline, and apply multimodal reasoning to security screenshots.
docker run -it --rm zchencow/innozverse-ai:latest bash
import hashlib
import warnings

import numpy as np
from sklearn.preprocessing import normalize

warnings.filterwarnings('ignore')
np.random.seed(42)  # global seed for any other stochastic demo code in this file

def mock_vision_encoder(image_description: str, dim: int = 512) -> np.ndarray:
    """
    Simulate a CNN/ViT image encoder.

    In production: torchvision.models.resnet50 or ViT-B/16.
    Here: a deterministic embedding derived from the description text (for demo).

    Parameters
    ----------
    image_description : str
        Human caption of the screenshot; keywords in it bias fixed dimensions.
    dim : int
        Embedding dimensionality (default 512, CLIP-like).

    Returns
    -------
    np.ndarray
        L2-normalized 1-D embedding of shape (dim,).
    """
    # BUG FIX: builtin hash() on str is salted per process (PYTHONHASHSEED),
    # so the original embeddings were NOT reproducible across runs. Derive the
    # seed from a stable cryptographic digest instead.
    digest = hashlib.sha256(image_description.encode('utf-8')).digest()
    seed = int.from_bytes(digest[:4], 'big')
    # Use a local Generator rather than np.random.seed(...), which clobbered
    # the module-level global RNG state as a side effect.
    rng = np.random.default_rng(seed)
    base = rng.standard_normal(dim)
    # Inject a weak semantic signal: each keyword boosts two fixed dimensions
    # so that related captions end up closer in embedding space.
    keywords = {'phishing': [0, 1], 'malware': [2, 3], 'login': [4, 5],
                'certificate': [6, 7], 'alert': [8, 9], 'normal': [10, 11],
                'screenshot': [12, 13], 'terminal': [14, 15]}
    desc_lower = image_description.lower()
    for kw, dims in keywords.items():
        if kw in desc_lower:
            base[dims[0]] += 2.0
            base[dims[1]] += 2.0
    # L2-normalize with plain NumPy — no need for sklearn for a single vector.
    return base / np.linalg.norm(base)
def mock_text_encoder(text: str, dim: int = 512) -> np.ndarray:
    """
    Simulate a CLIP text encoder (BERT-like).

    Parameters
    ----------
    text : str
        Natural-language query; keywords bias fixed embedding dimensions.
    dim : int
        Embedding dimensionality (default 512).

    Returns
    -------
    np.ndarray
        L2-normalized 1-D embedding of shape (dim,).
    """
    # BUG FIX: builtin hash() is randomized per process, breaking the
    # cross-run determinism this mock relies on. Use a stable digest.
    digest = hashlib.sha256(text.encode('utf-8')).digest()
    seed = int.from_bytes(digest[:4], 'big')
    # Local Generator avoids mutating the global np.random state.
    rng = np.random.default_rng(seed)
    base = rng.standard_normal(dim)
    # 'attack' and 'exploit' appear to share dimensions with the vision
    # encoder's 'alert' ([8,9]) and 'malware' ([2,3]) so related image/text
    # concepts align — presumably intentional; confirm if extending the map.
    keywords = {'phishing': [0, 1], 'malware': [2, 3], 'login': [4, 5],
                'certificate': [6, 7], 'suspicious': [8, 9], 'safe': [10, 11],
                'attack': [8, 9], 'exploit': [2, 3]}
    text_lower = text.lower()
    for kw, dims in keywords.items():
        if kw in text_lower:
            base[dims[0]] += 2.0
            base[dims[1]] += 2.0
    # L2-normalize with plain NumPy — sklearn not needed for a single vector.
    return base / np.linalg.norm(base)
# Demo corpus: security screenshots (file name + caption fed to the mock
# vision encoder) and the natural-language queries we score against them.
screenshots = [
    ("phishing_bank_login.png",
     "Phishing page mimicking bank login form with certificate warning"),
    ("normal_dashboard.png",
     "Normal security dashboard showing green status metrics"),
    ("malware_alert.png",
     "Malware detected alert popup with quarantine options"),
    ("terminal_reverse_shell.png",
     "Terminal screenshot showing reverse shell connection"),
    ("certificate_error.png",
     "Browser certificate error page with red warning"),
]

# Analyst-style questions a VQA/CLIP system should be able to rank images by.
queries = [
    "is this page safe to enter credentials?",
    "does this show signs of a phishing attack?",
    "is there a security alert visible?",
    "does this look like a malware infection?",
]
# CLIP-style retrieval demo: embed every screenshot caption and every query,
# then print a query x image cosine-similarity matrix (embeddings are unit
# vectors, so a dot product is cosine similarity).
img_embeddings = {name: mock_vision_encoder(desc) for name, desc in screenshots}
text_embeddings = {q: mock_text_encoder(q) for q in queries}

print("Image-Text Similarity (CLIP-style):\n")

# Header row: 45-char query column, then one 16-char column per image.
header = f"{'Query':<45}"
for name, _ in screenshots:
    header += f" {name[:15]:>15}"
print(header)
print("-" * (45 + 16 * len(screenshots)))

for query, q_vec in text_embeddings.items():
    scores = {name: float(np.dot(q_vec, vec)) for name, vec in img_embeddings.items()}
    top = max(scores, key=scores.get)  # best-matching image for this query
    row = f"{query[:44]:<45}"
    for name, _ in screenshots:
        marker = " *" if name == top else " "
        row += f"{scores[name]:>14.3f}{marker}"
    print(row)

print("\n* = best match per query")
Image-Text Similarity (CLIP-style):
Query phishing_bank_ normal_dashboa malware_alert. terminal_rever certificate_er
------------------------------------------------------------------------------------------------------------------------------------
is this page safe to enter credentials? 0.312 * 0.234 0.189 0.145 0.198
does this show signs of a phishing attack? 0.489 * 0.123 0.234 0.167 0.312
is there a security alert visible? 0.245 0.189 0.423 * 0.198 0.267
does this look like a malware infection? 0.312 0.134 0.512 * 0.289 0.223
* = best match per query
import numpy as np
class MockVQAPipeline:
    """
    Visual Question Answering (VQA) pipeline.

    Production: GPT-4V, LLaVA, CogVLM, InternVL.
    Here: rule-based classification plus canned security context, so the
    demo runs with no API key.
    """

    # Canned "visual context" per security category: typical indicators,
    # an overall risk level, and a one-line description used in answers.
    VISUAL_CONTEXT_DB = {
        "phishing": {
            "indicators": ["fake logo", "urgency text", "login form", "certificate mismatch"],
            "risk": "HIGH",
            "description": "Page appears to impersonate a legitimate service to steal credentials",
        },
        "malware": {
            "indicators": ["popup alert", "fake scan", "download prompt", "AV detection"],
            "risk": "CRITICAL",
            "description": "Malicious software activity detected on the system",
        },
        "normal": {
            "indicators": ["valid certificate", "known domain", "normal layout"],
            "risk": "LOW",
            "description": "Page appears legitimate with no obvious threats",
        },
        "suspicious": {
            "indicators": ["unusual process", "unknown connection", "off-hours activity"],
            "risk": "MEDIUM",
            "description": "Activity requires further investigation",
        },
    }

    def classify_image(self, image_description: str) -> str:
        """Classify image into security category via keyword rules.

        Rule order matters: phishing cues win over malware cues, which win
        over benign cues; anything unmatched falls through to 'suspicious'.
        """
        text = image_description.lower()
        rules = (
            ('phishing', ('phish', 'fake', 'credential', 'login form')),
            ('malware', ('malware', 'alert', 'quarantine', 'virus')),
            ('normal', ('normal', 'safe', 'dashboard', 'green')),
        )
        for category, triggers in rules:
            if any(trigger in text for trigger in triggers):
                return category
        return 'suspicious'

    def answer(self, image_desc: str, question: str) -> dict:
        """Generate an answer to a visual question.

        Returns a dict with 'answer', 'category', 'risk', and a fixed mock
        'confidence'. Question-intent checks are ordered: safety questions
        first, then threat, then risk, then indicators, else the category
        description.
        """
        category = self.classify_image(image_desc)
        context = self.VISUAL_CONTEXT_DB.get(category, self.VISUAL_CONTEXT_DB['suspicious'])

        q = question.lower()
        if 'safe' in q or 'enter' in q:
            high_risk = context['risk'] in ('HIGH', 'CRITICAL')
            reply = "No — do not enter credentials" if high_risk else "Appears safe"
        elif any(word in q for word in ('threat', 'attack', 'phish')):
            if category in ('phishing', 'malware'):
                reply = f"Yes — {context['description']}"
            else:
                reply = "No obvious threat detected"
        elif 'risk' in q:
            reply = f"Risk level: {context['risk']}"
        elif 'indicator' in q or 'sign' in q:
            reply = f"Indicators: {', '.join(context['indicators'][:3])}"
        else:
            reply = context['description']

        return {
            'answer': reply,
            'category': category,
            'risk': context['risk'],
            'confidence': 0.82,
        }
# Exercise the mock VQA pipeline on representative (image caption, question)
# pairs and print each answer with its category/risk metadata.
vqa = MockVQAPipeline()

test_cases = [
    ("Phishing page mimicking bank login form with certificate warning",
     "Is it safe to enter my password here?"),
    ("Normal security dashboard showing green status metrics",
     "What is the risk level of this screen?"),
    ("Malware detected alert popup with quarantine options",
     "Does this show signs of a system threat?"),
    ("Terminal screenshot showing reverse shell connection",
     "What security indicators are visible?"),
]

print("Visual Question Answering — Security Screenshots:\n")
for description, question in test_cases:
    verdict = vqa.answer(description, question)
    print(f" Image: {description[:60]}...")
    print(f" Question: {question}")
    print(f" Answer: {verdict['answer']}")
    print(f" Category: {verdict['category']} Risk: {verdict['risk']}")
    print()
Visual Question Answering — Security Screenshots:
Image: Phishing page mimicking bank login form with certificate warnin...
Question: Is it safe to enter my password here?
Answer: No — do not enter credentials
Category: phishing Risk: HIGH
Image: Normal security dashboard showing green status metrics...
Question: What is the risk level of this screen?
Answer: Risk level: LOW
Category: normal Risk: LOW
Image: Malware detected alert popup with quarantine options...
Question: Does this show signs of a system threat?
Answer: Yes — Malicious software activity detected on the system
Category: malware Risk: CRITICAL
Image: Terminal screenshot showing reverse shell connection...
Question: What security indicators are visible?
Answer: Indicators: unusual process, unknown connection, off-hours activity
Category: suspicious Risk: MEDIUM