This Visual Illusions Benchmark Makes Me Question the Power of VLMs

Mar 3, 2025


The Illusory Datasets

Source: Illusory VQA paper

Testing Leading Multimodal Models

A Simple Yet Effective Solution
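The key idea is to preprocess each illusion so that the low-frequency structure carrying the hidden subject becomes obvious; the dataset's "filtered" group slice, which we evaluate against later, contains exactly these preprocessed versions. As a rough, hypothetical sketch of that kind of preprocessing (the grayscale-plus-Gaussian-blur combination and the radius are my assumptions for illustration, not necessarily the paper's exact recipe):

# Hypothetical sketch: make a hidden illusion easier to see with a simple
# low-pass filter. The grayscale + Gaussian blur choice and the radius are
# illustrative assumptions, not the paper's exact preprocessing.
from PIL import Image, ImageFilter

def reveal_illusion(image_path, radius=7):
    """Blur and desaturate an image so its low-frequency structure stands out."""
    img = Image.open(image_path).convert("L")  # grayscale
    return img.filter(ImageFilter.GaussianBlur(radius=radius))

# revealed = reveal_illusion("/path/to/illusion.png")
# revealed.show()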

What we’re going to do in this tutorial

# installing bleeding edge version of transformers
!pip install git+https://github.com/huggingface/transformers.git#egg=transformers
!pip install fiftyone umap-learn
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

dataset = load_from_hub(
    "Voxel51/IllusionAnimals",
    overwrite=True,
    persistent=True,
)
fo.launch_app(dataset)
Initial exploration of the Illusion Animals dataset
main_images = dataset.select_group_slices("main").map_labels(
    "label", {"illusionless": "no illusion"}
).clone(name="illusion_animals")  # get the main images
main_images.persistent = True  # make the dataset persistent across sessions

filtered_images = dataset.select_group_slices("filtered").map_labels(
    "label", {"illusionless": "no illusion"}
).clone(name="illusion_animals_filtered")  # get the filtered images
filtered_images.persistent = True

class_names = main_images.distinct("label.label")  # get the class names

Using Embeddings for Deeper Dataset Understanding

CLIP

import torch
import fiftyone.zoo as foz

clip_model = foz.load_zoo_model(
    "zero-shot-classification-transformer-torch",
    name_or_path="openai/clip-vit-base-patch32",
    classes=class_names,
    device="cuda" if torch.cuda.is_available() else "cpu",
    # install_requirements=True  # uncomment if running this code for the first time
)

main_images.compute_embeddings(
    model=clip_model,
    embeddings_field="clip_embeddings",
)

filtered_images.compute_embeddings(
    model=clip_model,
    embeddings_field="clip_embeddings",
)

SigLIP 2

import torch
import fiftyone.zoo as foz

siglip_model = foz.load_zoo_model(
    "zero-shot-classification-transformer-torch",
    name_or_path="google/siglip2-base-patch32-256",
    classes=class_names,
    device="cuda" if torch.cuda.is_available() else "cpu",
    # install_requirements=True  # uncomment if running this code for the first time
)

main_images.compute_embeddings(
    model=siglip_model,
    embeddings_field="siglip_embeddings",
)

filtered_images.compute_embeddings(
    model=siglip_model,
    embeddings_field="siglip_embeddings",
)

AIMv2

!fiftyone plugins download https://github.com/harpreetsahota204/aimv2_embeddings
!fiftyone plugins requirements @harpreetsahota/aimv2_embeddings --install

import os

os.environ["FIFTYONE_ALLOW_LEGACY_ORCHESTRATORS"] = "true"

import fiftyone.operators as foo

aim_embeddings = foo.get_operator(
    "@harpreetsahota/aimv2_embeddings/compute_aimv2_embeddings"
)

# Run the operator on the main images
await aim_embeddings(
    main_images,
    model_name="apple/aimv2-large-patch14-224",  # choose any supported model
    embedding_types="cls",  # can be "cls" or "mean"
    emb_field="aimv2_embeddings",
    delegate=True,
)

# Run the operator on the filtered images
await aim_embeddings(
    filtered_images,
    model_name="apple/aimv2-large-patch14-224",
    embedding_types="cls",
    emb_field="aimv2_embeddings",
    delegate=True,
)

Exploring embeddings

import fiftyone.brain as fob

# Define datasets and embedding fields as lists
datasets = [main_images, filtered_images]
embedding_fields = [
    "aimv2_embeddings",
    "clip_embeddings",
    "siglip_embeddings",
]

# Compute a 2D UMAP visualization for each dataset/embedding combination
for ds in datasets:
    for field in embedding_fields:
        _fname = field.split("_embeddings")[0]
        brain_key = f"{_fname}_viz"

        results = fob.compute_visualization(
            ds,
            embeddings=field,
            method="umap",
            brain_key=brain_key,
            num_dims=2,
        )

fo.launch_app(main_images)
Exploring embeddings from CLIP, AIMv2, and SigLIP 2

Computing uniqueness values

import fiftyone.brain as fob

fob.compute_uniqueness(
    main_images,
    embeddings="clip_embeddings",
    uniqueness_field="clip_uniqueness",
)
Filtering by uniqueness value
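The same filtering can be done programmatically; here is a quick sketch (the 0.5 threshold is arbitrary, picked purely for illustration):

from fiftyone import ViewField as F

# Sort so the most distinctive illusions appear first
most_unique_view = main_images.sort_by("clip_uniqueness", reverse=True)

# Or keep only samples above a uniqueness threshold (0.5 is arbitrary)
unique_view = main_images.match(F("clip_uniqueness") > 0.5)

fo.launch_app(most_unique_view)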

Reproducing CLIP results from the paper


# Run the model on the illusion images
main_images.apply_model(
    model=clip_model,
    label_field="clip_predictions",
    text_prompt="illusion animal ",
)

# Evaluate the results
clip_res_illusions = main_images.evaluate_classifications(
    pred_field="clip_predictions",
    gt_field="label",
    method="simple",
    eval_key="clip_eval",
)

filtered_images.apply_model(
    model=clip_model,
    label_field="clip_predictions",
)

clip_res_filtered = filtered_images.evaluate_classifications(
    pred_field="clip_predictions",
    gt_field="label",
    method="simple",
    eval_key="clip_eval",
)
Model evaluation panel
clip_res_illusions.print_metrics(average='weighted', digits=4)

clip_res_filtered.print_metrics(average='weighted', digits=4)


def inference(img, labels, model, vis_processors, device):
    image = vis_processors["eval"](img).unsqueeze(0).to(device)
    sample = {"image": image, "text_input": labels}
    clip_features = model.extract_features(sample)
    image_features = clip_features.image_embeds_proj
    text_features = clip_features.text_embeds_proj
    sims = (image_features @ text_features.t())[0] / 0.01
    probs = torch.nn.Softmax(dim=0)(sims).tolist()
    max_index = probs.index(max(probs))
    max_label = labels[max_index]
    return max_label
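This helper's signature appears to follow the LAVIS feature-extractor API, so calling it would look roughly like the sketch below; the model name, model type, and the choice of the dataset's first sample are assumptions for illustration:

# Rough usage sketch, assuming the LAVIS-style feature extractor that the
# signature implies; the model name and sample choice are illustrative.
from PIL import Image
from lavis.models import load_model_and_preprocess

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, _ = load_model_and_preprocess(
    name="clip_feature_extractor", model_type="ViT-B-32", is_eval=True, device=device
)

img = Image.open(main_images.first().filepath).convert("RGB")
print(inference(img, class_names, model, vis_processors, device))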

Testing SigLIP 2 and AIMv2

aim_model = foz.load_zoo_model(
    "zero-shot-classification-transformer-torch",
    name_or_path="apple/aimv2-large-patch14-224-lit",
    classes=class_names,
    device="cuda" if torch.cuda.is_available() else "cpu",
    # install_requirements=True  # uncomment if running this code for the first time
)

main_images.apply_model(
    model=aim_model,
    label_field="aimv2_predictions",
    text_prompt="illusion animal ",
)

aim_res_illusions = main_images.evaluate_classifications(
    pred_field="aimv2_predictions",
    gt_field="label",
    method="simple",
    eval_key="aim_eval",
)

filtered_images.apply_model(
    model=aim_model,
    label_field="aimv2_predictions",
)

aim_res_filtered = filtered_images.evaluate_classifications(
    pred_field="aimv2_predictions",
    gt_field="label",
    method="simple",
    eval_key="aim_eval",
)

main_images.apply_model(
    model=siglip_model,
    label_field="siglip2_predictions",
    text_prompt="illusion animal ",
)

siglip_res_illusions = main_images.evaluate_classifications(
    pred_field="siglip2_predictions",
    gt_field="label",
    method="simple",
    eval_key="siglip2_eval",
)

filtered_images.apply_model(
    model=siglip_model,
    label_field="siglip2_predictions",
)

siglip_res_filtered = filtered_images.evaluate_classifications(
    pred_field="siglip2_predictions",
    gt_field="label",
    method="simple",
    eval_key="siglip2_eval",
)

Summary of findings

Can VLMs do better?

!fiftyone plugins download https://github.com/harpreetsahota204/janus-vqa-fiftyone
!fiftyone plugins requirements @harpreetsahota/janus_vqa --install

!fiftyone plugins download https://github.com/harpreetsahota204/moondream2-plugin
!fiftyone plugins requirements @harpreetsahota/moondream2 --install

import fiftyone.operators as foo

janus_vqa = foo.get_operator("@harpreetsahota/janus_vqa/janus_vqa")
moondream = foo.get_operator("@harpreetsahota/moondream2/moondream")

NO_HINT_PROMPT = f"""Which class is in the picture: {', '.join(class_names)}.
Your answer must be one of these exact classes, no other answers allowed.
Respond in one word for your guess of the correct class without any extra explanation."""

HINT_PROMPT = f"""There might be an image illusion of something in this image.
These are the classes that the image illusion might belong to: {', '.join(class_names)}.
Your answer must be one of these exact classes, no other answers allowed.
Respond in one word for your guess of the correct class without any extra explanation.
"""

Running the VLMs using the “No Hint” prompt

await janus_vqa(
    main_images,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=NO_HINT_PROMPT,
    question_field="no_hint_prompt",
    answer_field="janus_no_hint_answer",
    delegate=True,
)

await moondream(
    main_images,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_no_hint_answer",
    query_text=NO_HINT_PROMPT,
    delegate=True,
)

await janus_vqa(
    filtered_images,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=NO_HINT_PROMPT,
    question_field="no_hint_prompt",
    answer_field="janus_no_hint_answer",
    delegate=True,
)

await moondream(
    filtered_images,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_no_hint_answer",
    query_text=NO_HINT_PROMPT,
    delegate=True,
)

main_images.reload()
main_images.save()
filtered_images.reload()
filtered_images.save()
def convert_to_classification(dataset, source_field, target_field):
    """
    Converts values from a field into FiftyOne Classification objects and
    stores them in a new field. Strips leading and trailing whitespace from labels.

    Args:
        dataset (fo.Dataset): the FiftyOne dataset to modify
        source_field (str): the field containing the classification labels
        target_field (str): the field in which to store the Classification objects
    """
    classifications = [
        fo.Classification(label=cls.strip()) for cls in dataset.values(source_field)
    ]
    dataset.set_values(target_field, classifications)
    dataset.save()


# For the main_images dataset
convert_to_classification(main_images, "janus_no_hint_answer", "janus_as_classification")
convert_to_classification(main_images, "moondream_no_hint_answer", "moondream_as_classification")

# For the filtered_images dataset
convert_to_classification(filtered_images, "janus_no_hint_answer", "janus_as_classification")
convert_to_classification(filtered_images, "moondream_no_hint_answer", "moondream_as_classification")

main_images_janus_res = main_images.evaluate_classifications(
    pred_field="janus_as_classification",
    gt_field="label",
    method="simple",
    eval_key="janus_eval",
)

main_images_moondream_res = main_images.evaluate_classifications(
    pred_field="moondream_as_classification",
    gt_field="label",
    method="simple",
    eval_key="moondream_eval",
)

filtered_images_janus_res = filtered_images.evaluate_classifications(
    pred_field="janus_as_classification",
    gt_field="label",
    method="simple",
    eval_key="janus_eval",
)

filtered_images_moondream_res = filtered_images.evaluate_classifications(
    pred_field="moondream_as_classification",
    gt_field="label",
    method="simple",
    eval_key="moondream_eval",
)
Evaluating the performance of Janus Pro and Moondream2
main_images_janus_res.print_metrics(average='weighted', digits=4)

accuracy   0
precision  0
recall     0
fscore     0
support    2000

main_images_moondream_res.print_metrics(average='weighted', digits=4)

accuracy   0.4195
precision  0.4926
recall     0.4195
fscore     0.405
support    2000

filtered_images_janus_res.print_metrics(average='weighted', digits=4)

accuracy   0.0455
precision  0.1
recall     0.0455
fscore     0.0533
support    2000

filtered_images_moondream_res.print_metrics(average='weighted', digits=4)

accuracy   0.902
precision  0.9072
recall     0.902
fscore     0.8985
support    2000
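
The HINT_PROMPT defined earlier follows the same pattern, so checking whether an explicit hint about the illusion helps is just another pass over the datasets. A sketch of what that could look like (the *_hint_answer field names are my own suggestions, not fields that already exist in the dataset):

# Sketch: rerun the same operators with the hint prompt. The *_hint_answer
# field names are illustrative placeholders.
await janus_vqa(
    main_images,
    model_path="deepseek-ai/Janus-Pro-1B",
    question=HINT_PROMPT,
    question_field="hint_prompt",
    answer_field="janus_hint_answer",
    delegate=True,
)

await moondream(
    main_images,
    revision="2025-01-09",
    operation="query",
    output_field="moondream_hint_answer",
    query_text=HINT_PROMPT,
    delegate=True,
)

The answers could then be converted with convert_to_classification and scored with evaluate_classifications exactly as above.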

What I Discovered

Why This Matters
