Voxel51

News, tutorials, tips, and big ideas in computer vision and data-centric machine learning, from the company behind open source FiftyOne. Learn more at https://voxel51.com

FACET: A Benchmark Dataset for Fairness in Computer Vision

Jacob Marks, Ph.D.
Published in Voxel51
15 min read · Sep 12, 2023

FACET Dataset Quick Facts

Annotations

Dataset Statistics

Efforts Taken by FACET Team to Ensure Fairness

Value Counts by Attribute

(each attribute below covers all 49,551 annotated people in the dataset)

lighting
  well lit: 35533
  underexposed: 1313
  overexposed: 553
  dimly lit: 10955
  None/na: 1197

hairtype
  straight: 18382
  curly: 719
  bald: 1017
  wavy: 6141
  dreadlocks: 280
  coily: 458
  None/na: 22554

haircolor
  black: 14041
  blonde: 2249
  red: 333
  colored: 248
  brown: 10668
  grey: 2107
  None/na: 19905

(unlabeled boolean attribute)
  True: 6121
  False: 43430

perceived_age_presentation
  young (25-40): 8860
  middle (41-65): 27380
  older (65+): 2659
  None: 10652

perceived_gender_presentation
  fem: 10245
  masc: 33240
  non binary: 95
  None/na: 5971

(unlabeled boolean attribute)
  False: 48846
  True: 705
'astronaut': 286,
'backpacker': 1612,
'ballplayer': 1309,
'bartender': 56,
'basketball_player': 1668,
'boatman': 2048,
'carpenter': 223,
'cheerleader': 399,
'climber': 455,
'computer_user': 1164,
'craftsman': 1034,
'dancer': 1397,
'disk_jockey': 310,
'doctor': 802,
'drummer': 977,
'electrician': 468,
'farmer': 1542,
'fireman': 913,
'flutist': 302,
'gardener': 457,
'guard': 1361,
'guitarist': 1180,
'gymnast': 615,
'hairdresser': 458,
'horseman': 735,
'judge': 96,
'laborer': 2540,
'lawman': 4455,
'lifeguard': 511,
'machinist': 354,
'motorcyclist': 1367,
'nurse': 1042,
'painter': 898,
'patient': 884,
'prayer': 798,
'referee': 755,
'repairman': 1295,
'reporter': 470,
'retailer': 546,
'runner': 638,
'sculptor': 213,
'seller': 1178,
'singer': 1286,
'skateboarder': 990,
'soccer_player': 1226,
'soldier': 1457,
'speaker': 1416,
'student': 682,
'teacher': 192,
'tennis_player': 1661,
'trumpeter': 498,
'waiter': 332
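
Once the annotations have been loaded into FiftyOne (as described below), counts like these can be regenerated directly with the count_values() aggregation. A minimal sketch, assuming the ground_truth field and attribute names created later in this post:

## distribution of hair types across all ground truth person boxes
print(dataset.count_values("ground_truth.detections.hairtype"))

## distribution of perceived gender presentations
print(dataset.count_values("ground_truth.detections.perceived_gender_presentation"))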

Loading the FACET Dataset

Prerequisites

pip install fiftyone pycocotools
import json
import numpy as np
import os
import pandas as pd
from PIL import Image
from pycocotools import mask as maskUtils
from tqdm.notebook import tqdm
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz
from fiftyone import ViewField as F

Creating the Dataset

## use relative paths to your image dirs
IMG_DIRS = ["imgs_1", "imgs_2", "imgs_3"]

dataset = fo.Dataset(name="FACET", persistent=True)

for img_dir in IMG_DIRS:
    dataset.add_images_dir(img_dir)

dataset.compute_metadata()
print(dataset)
Name:        FACET
Media type:  image
Num samples: 31702
Persistent:  True
Tags:        []
Sample fields:
    id:       fiftyone.core.fields.ObjectIdField
    filepath: fiftyone.core.fields.StringField
    tags:     fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata: fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
session = fo.launch_app(dataset)
FACET Images prior to loading annotations

Adding Person Detections

gt_df = pd.read_csv('annotations/annotations.csv')

BOOLEAN_PERSONAL_ATTRS = (
    "has_facial_hair",
    "has_tattoo",
    "has_cap",
    "has_mask",
    "has_headscarf",
    "has_eyeware",
)

def add_boolean_person_attributes(detection, row_index):
    for attr in BOOLEAN_PERSONAL_ATTRS:
        detection[attr] = gt_df.loc[row_index, attr].astype(bool)

def get_hairtype(row_index):
    hair_info = gt_df.loc[row_index, gt_df.columns.str.startswith('hairtype')]
    hairtype = hair_info[hair_info == 1]
    if len(hairtype) == 0:
        return None
    return hairtype.index[0].split('_')[1]

def get_haircolor(row_index):
    hair_info = gt_df.loc[row_index, gt_df.columns.str.startswith('hair_color')]
    haircolor = hair_info[hair_info == 1]
    if len(haircolor) == 0:
        return None
    return haircolor.index[0].split('_')[2]

def add_person_attributes(detection, row_index):
    detection["hairtype"] = get_hairtype(row_index)
    detection["haircolor"] = get_haircolor(row_index)
    add_boolean_person_attributes(detection, row_index)

def get_perceived_gender_presentation(row_index):
    gender_info = gt_df.loc[row_index, gt_df.columns.str.startswith('gender')]
    pgp = gender_info[gender_info == 1]
    if len(pgp) == 0:
        return None
    return pgp.index[0].replace("gender_presentation_", "").replace("_", " ")

def get_perceived_age_presentation(row_index):
    age_info = gt_df.loc[row_index, gt_df.columns.str.startswith('age')]
    pap = age_info[age_info == 1]
    if len(pap) == 0:
        return None
    return pap.index[0].split('_')[2]

def get_skintone(row_index):
    skin_info = gt_df.loc[row_index, gt_df.columns.str.startswith('skin_tone')]
    return skin_info.to_dict()

def add_protected_attributes(detection, row_index):
    detection["perceived_age_presentation"] = get_perceived_age_presentation(row_index)
    detection["perceived_gender_presentation"] = get_perceived_gender_presentation(row_index)
    detection["skin_tone"] = get_skintone(row_index)

VISIBILITY_ATTRS = ("visible_torso", "visible_face", "visible_minimal")

def get_lighting(row_index):
    lighting_info = gt_df.loc[row_index, gt_df.columns.str.startswith('lighting')]
    lighting = lighting_info[lighting_info == 1]
    if len(lighting) == 0:
        return None
    return lighting.index[0].replace("lighting_", "").replace("_", " ")

def add_other_attributes(detection, row_index):
    detection["lighting"] = get_lighting(row_index)
    for attr in VISIBILITY_ATTRS:
        detection[attr] = gt_df.loc[row_index, attr].astype(bool)

def create_detection(row_index, sample):
    bbox_dict = json.loads(gt_df.loc[row_index, "bounding_box"])
    x, y, w, h = bbox_dict["x"], bbox_dict["y"], bbox_dict["width"], bbox_dict["height"]
    cat1, cat2 = bbox_dict["dict_attributes"]["cat1"], bbox_dict["dict_attributes"]["cat2"]
    person_id = gt_df.loc[row_index, "person_id"]

    ## FiftyOne expects relative [x, y, w, h] bounding boxes
    img_width, img_height = sample.metadata.width, sample.metadata.height
    bounding_box = [x/img_width, y/img_height, w/img_width, h/img_height]

    detection = fo.Detection(
        label=cat1,
        bounding_box=bounding_box,
        person_id=person_id,
    )
    if cat2 != 'none':
        detection["class2"] = cat2

    add_person_attributes(detection, row_index)
    add_protected_attributes(detection, row_index)
    add_other_attributes(detection, row_index)
    return detection

def add_ground_truth_labels(dataset):
    for sample in dataset.iter_samples(autosave=True, progress=True):
        sample_annos = gt_df[gt_df['filename'] == sample.filename]
        detections = []
        for row in sample_annos.iterrows():
            row_index = row[0]
            detection = create_detection(row_index, sample)
            detections.append(detection)
        sample["ground_truth"] = fo.Detections(detections=detections)

    dataset.add_dynamic_sample_fields()

## add all of the ground truth labels
add_ground_truth_labels(dataset)
Example ground truth label for person in FACET dataset.
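
To spot-check a single annotation, you can print one of the new detections. A quick sketch, assuming the first sample has at least one labeled person:

sample = dataset.first()
print(sample.ground_truth.detections[0])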

Adding Segmentation Masks

def add_coco_masks_to_dataset(dataset):
    coco_masks = json.load(open("annotations/coco_masks.json", "r"))
    cmas = coco_masks["annotations"]

    FILENAME_TO_ID = {
        img["file_name"]: img["id"]
        for img in coco_masks["images"]
    }
    CAT_TO_LABEL = {cat["id"]: cat["name"] for cat in coco_masks["categories"]}

    for sample in dataset.iter_samples(autosave=True, progress=True):
        fn = sample.filename
        if fn not in FILENAME_TO_ID:
            continue
        img_id = FILENAME_TO_ID[fn]
        img_width, img_height = sample.metadata.width, sample.metadata.height
        sample_annos = [a for a in cmas if a["image_id"] == img_id]
        if len(sample_annos) == 0:
            continue

        coco_detections = []
        for ann in sample_annos:
            label = CAT_TO_LABEL[ann["category_id"]]
            bbox = ann['bbox']
            ann_id = ann['ann_id']
            person_id = ann['facet_person_id']
            mask = maskUtils.decode(ann["segmentation"])
            mask = Image.fromarray(255*mask)

            ## Change bbox to the format [x1, y1, x2, y2]
            bbox[2] = bbox[0] + bbox[2]
            bbox[3] = bbox[1] + bbox[3]

            ## Crop the mask to the bounding box
            cropped_mask = np.array(mask.crop(bbox)).astype(bool)

            ## Convert back to relative [x, y, w, h] coordinates
            bbox[2] = bbox[2] - bbox[0]
            bbox[3] = bbox[3] - bbox[1]
            bbox[0] = bbox[0]/img_width
            bbox[1] = bbox[1]/img_height
            bbox[2] = bbox[2]/img_width
            bbox[3] = bbox[3]/img_height

            new_detection = fo.Detection(
                label=label,
                bounding_box=bbox,
                person_id=person_id,
                ann_id=ann_id,
                mask=cropped_mask,
            )
            coco_detections.append(new_detection)

        sample["coco_masks"] = fo.Detections(detections=coco_detections)

## add the masks
add_coco_masks_to_dataset(dataset)
Ground truth segmentation masks for hair, clothing, and people in FACET dataset
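
To isolate a single category of mask in the App, you can filter the coco_masks field. A minimal sketch, assuming the COCO categories include a label named "hair":

## view containing only hair masks
hair_view = dataset.filter_labels("coco_masks", F("label") == "hair")
session.view = hair_view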

Evaluating Model Bias

FACET Evaluation Metrics

Disparity equation from the FACET paper. Here l1 and l2 represent two attributes (e.g., two hair types), and C represents a common concept (a person class).
Definition of recall. TP stands for true positive, and FN stands for false negative.
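
Written out explicitly (a reconstruction from the captions above, consistent with the compute_disparity function later in this post):

\mathrm{recall}(C, l) = \frac{TP}{TP + FN}
\qquad
\mathrm{disparity}(C, l_1, l_2) = \mathrm{recall}(C, l_1) - \mathrm{recall}(C, l_2)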

Adding Predictions to the Dataset

CLIP zero-shot classification predictions on ground truth detection patches.
yolov5 = foz.load_zoo_model('yolov5m-coco-torch')
dataset.apply_model(yolov5, label_field="yolov5m")

### Just retain the "person" detections
### (only_matches=False keeps samples with no person boxes so the values align with the dataset)
people_view_values = dataset.filter_labels(
    "yolov5m", F("label") == "person", only_matches=False
).values("yolov5m")
dataset.set_values("yolov5m", people_view_values)
dataset.save()
## get a list of all 52 classes
facet_classes = dataset.distinct("ground_truth.detections.label")
## instantiate a CLIP model with these classes
clip = foz.load_zoo_model(
    "clip-vit-base32-torch",
    text_prompt="A photo of a",
    classes=facet_classes,
)
patch_view = dataset.to_patches("ground_truth")
patch_view.apply_model(clip, label_field="clip")
dataset.save_view("patch_view", patch_view)
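
To browse the CLIP predictions alongside the ground truth patches, you can point the App at the saved patches view (assuming the session launched earlier is still open):

session.view = patch_view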

Evaluating Detection Predictions

IOU_THRESHS = np.round(np.arange(0.5, 1.0, 0.05), 2)

def _evaluate_detection_model(dataset, label_field):
    eval_key = "eval_" + label_field.replace("-", "_")
    dataset.evaluate_detections(label_field, "ground_truth", eval_key=eval_key, classwise=False)

    for sample in dataset.iter_samples(autosave=True, progress=True):
        for pred in sample[label_field].detections:
            iou_field = f"{eval_key}_iou"
            if iou_field not in pred:
                continue
            iou = pred[iou_field]
            for it in IOU_THRESHS:
                pred[f"{iou_field}_{str(it).replace('.', '')}"] = iou >= it

def _compute_detection_mAR(sample_collection, label_field):
    """Computes the mean average recall of the specified detection field,
    i.e., the average over IoU thresholds of the recall at each threshold.
    """
    eval_key = "eval_" + label_field.replace("-", "_")
    iou_recalls = []
    for it in IOU_THRESHS:
        field_str = f"{label_field}.detections.{eval_key}_iou_{str(it).replace('.', '')}"
        counts = sample_collection.count_values(field_str)
        tp, fn = counts.get(True, 0), counts.get(False, 0)
        recall = tp/float(tp + fn) if tp + fn > 0 else 0.0
        iou_recalls.append(recall)
    return np.mean(iou_recalls)

def get_concept_attr_detection_mAR(dataset, label_field, concept, attributes):
    sub_view = dataset.filter_labels("ground_truth", F("label") == concept)
    for attribute in attributes.items():
        if "skin_tone" in attribute[0]:
            sub_view = sub_view.filter_labels("ground_truth", F(f"skin_tone.{attribute[0]}") != 0)
        else:
            sub_view = sub_view.filter_labels("ground_truth", F(attribute[0]) == attribute[1])
    return _compute_detection_mAR(sub_view, label_field)

## run the detection evaluation for the YOLOv5 predictions
_evaluate_detection_model(dataset, "yolov5m")

concept = 'gymnast'
attributes = {"hairtype": "curly", "haircolor": "black"}

get_concept_attr_detection_mAR(dataset, "yolov5m", concept, attributes)
## 0.875
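
The skin_tone branch above lets you slice on any of the skin tone columns from the annotations CSV rather than matching an exact value. A hypothetical example, assuming the CSV contains a column named skin_tone_1 (the dict value is ignored for skin tone keys):

## mAR for gymnasts with a nonzero skin_tone_1 annotation
get_concept_attr_detection_mAR(dataset, "yolov5m", "gymnast", {"skin_tone_1": None})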

Evaluating Classification Predictions

def _evaluate_classification_model(dataset, prediction_field):
    patch_view = dataset.load_saved_view("patch_view")
    eval_key = "eval_" + prediction_field

    for sample in patch_view.iter_samples(progress=True):
        sample[eval_key] = (
            sample.ground_truth.label == sample[prediction_field].label
        )
        sample.save()

    dataset.save_view("patch_view", patch_view, overwrite=True)

def _compute_classification_recall(patch_collection, label_field):
    eval_key = "eval_" + label_field.split("_")[0]
    counts = patch_collection.count_values(eval_key)
    tp, fn = counts.get(True, 0), counts.get(False, 0)
    recall = tp/float(tp + fn) if tp + fn > 0 else 0.0
    return recall

def get_concept_attr_classification_recall(dataset, label_field, concept, attributes):
    patch_view = dataset.load_saved_view("patch_view")
    sub_patch_view = patch_view.match(F("ground_truth.label") == concept)
    for attribute in attributes.items():
        if "skin_tone" in attribute[0]:
            sub_patch_view = sub_patch_view.match(F(f"ground_truth.skin_tone.{attribute[0]}") != 0)
        else:
            sub_patch_view = sub_patch_view.match(F(f"ground_truth.{attribute[0]}") == attribute[1])
    return _compute_classification_recall(sub_patch_view, label_field)

## run the classification evaluation for the CLIP predictions
_evaluate_classification_model(dataset, "clip")

get_concept_attr_classification_recall(dataset, "clip", concept, attributes)
## 0.6193353474320241

Assessing Disparity

def get_concept_attr_recall(dataset, label_field, concept, attributes):
    if label_field in dataset.get_field_schema().keys():
        return get_concept_attr_detection_mAR(dataset, label_field, concept, attributes)
    else:
        return get_concept_attr_classification_recall(dataset, label_field, concept, attributes)

def compute_disparity(dataset, label_field, concept, attributes1, attributes2):
    recall1 = get_concept_attr_recall(dataset, label_field, concept, attributes1)
    recall2 = get_concept_attr_recall(dataset, label_field, concept, attributes2)
    return recall1 - recall2

attrs1 = {"hairtype": "curly"}
attrs2 = {"hairtype": "straight"}

for concept in ["astronaut", "singer", "judge", "student"]:
    disparity = compute_disparity(dataset, "clip", concept, attrs1, attrs2)
    print(f"{concept}: {disparity}")

#### OUTPUT ####
## astronaut: -0.8269230769230769
## singer: -0.0008051529790660261
## judge: -0.06666666666666667
## student: 0.16279069767441856
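
The same pattern scales to a full sweep over the dataset. A rough sketch (not from the original analysis) that ranks all 52 concepts by the magnitude of the curly-vs-straight disparity for CLIP:

disparities = {
    concept: compute_disparity(dataset, "clip", concept, attrs1, attrs2)
    for concept in facet_classes
}

## concepts with the largest absolute disparity first
for concept, d in sorted(disparities.items(), key=lambda kv: abs(kv[1]), reverse=True)[:10]:
    print(f"{concept}: {d:.3f}")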

Conclusion
