StrideSafe

403 points

Challenge

Singapore’s lamp posts are getting smarter. They don’t just light the way — they watch over the pavements.

Your next-gen chip has been selected for testing. Can your chip distinguish pedestrians from bicycles and PMDs (personal mobility devices)?

Pass the test, and your chip will earn deployment on Singapore’s smart lamp posts. Fail, and hazards roam free on pedestrian walkways.

Solution

Given the challenge description and the dist files, we figured the challenge was a matter of classifying the provided images and building a QR code from the classifications. Various ImageNet/ResNet models did not work, so we tried CLIP (alluded to in a comment in the page’s HTML source code: <!-- TODO: Test on other vision-language models other than OpenAI CLIP -->), coloring each tile based on whether the image was a person, bike, or PMD. This gave us a QR code that scanned; a sketch of decoding it programmatically follows the script below.

# pip install torch numpy matplotlib git+https://github.com/openai/CLIP.git ftfy regex tqdm pillow
import glob, math

import numpy as np
import torch
import clip
from PIL import Image
import matplotlib.pyplot as plt

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CLIP_MODEL = "ViT-B/32"

# Prompt ensembles: each class gets several text prompts whose CLIP embeddings
# are averaged into a single class embedding.
LABEL_GROUPS = {
    "pedestrian": [
        "a pedestrian",
        "a person",
        "a human",
        "a profile",
        "a nun",
        "a man",
        "a child",
    ],
    "bicycle": [
        "a bicycle",
        "a motorcycle",
        "a cyclist",
        "a skateboard",
    ],
    "pmd": [
        "a personal mobility device",
        "a wheelchair",
        "a scooter"
    ],
}

# Best-match cosine similarity below this is treated as "neither".
NONE_THRESHOLD = 0.175


# QR module color per class: pedestrians become white modules, bicycles and
# PMDs become black modules.
BUCKETS = {
    "pedestrian": "white",
    "bicycle": "black",
    "pmd": "black",
}
TILE_SIZE = 8  # pixels per rendered mosaic tile

model, preprocess = clip.load(CLIP_MODEL, device=DEVICE)

def encode_text_groups(label_groups: dict[str, list[str]]):
    prompts = []
    class_slices = []  # (class_name, start, end)
    start = 0
    for cls, texts in label_groups.items():
        toks = clip.tokenize(texts).to(DEVICE)
        with torch.no_grad():
            feats = model.encode_text(toks)
        feats = feats / feats.norm(dim=-1, keepdim=True)
        prompts.append(feats)
        end = start + feats.shape[0]
        class_slices.append((cls, start, end))
        start = end
    all_feats = torch.cat(prompts, dim=0)  # (sum_prompts, d)

    class_feats = []
    names = []
    for cls, s, e in class_slices:
        cf = all_feats[s:e].mean(dim=0, keepdim=True)
        cf = cf / cf.norm(dim=-1, keepdim=True)
        class_feats.append(cf)
        names.append(cls)
    class_feats = torch.cat(class_feats, dim=0)  # (C, d)
    return names, class_feats

CLASS_NAMES, CLASS_TEXT_FEATS = encode_text_groups(LABEL_GROUPS)  # (C, d)

paths = sorted(glob.glob("*.jpg"))
if not paths:
    raise SystemExit("No .jpg images found in the current folder.")

def encode_images(image_paths, batch=32):
    ims = []
    for p in image_paths:
        im = Image.open(p).convert("RGB")
        ims.append(preprocess(im))
    feats = []
    with torch.no_grad():
        for i in range(0, len(ims), batch):
            batch_tensor = torch.stack(ims[i:i+batch]).to(DEVICE)
            f = model.encode_image(batch_tensor)
            f = f / f.norm(dim=-1, keepdim=True)
            feats.append(f)
    feats = torch.cat(feats, dim=0)  # (N, d)
    return feats

image_feats = encode_images(paths)  # (N, d)

sims = (image_feats @ CLASS_TEXT_FEATS.T).cpu().numpy()  # cosine similarity
pred_idxs = sims.argmax(axis=1)           # shape: (N,)
pred_scores = sims.max(axis=1)            # shape: (N,)
pred_names = [CLASS_NAMES[i] for i in pred_idxs]

def assign_bucket(name: str, score: float) -> str:
    if score < NONE_THRESHOLD:
        return "neither"
    return BUCKETS.get(name, "neither")

buckets = [assign_bucket(n, s) for n, s in zip(pred_names, pred_scores)]


# Arrange the tiles in a square mosaic; leftover cells are padded with checker tiles.
n = len(buckets)
side = int(math.ceil(math.sqrt(n)))  # mosaic dimension in tiles
pad = side * side - n

white_tile = np.ones((TILE_SIZE, TILE_SIZE), dtype=np.float32)
black_tile = np.zeros((TILE_SIZE, TILE_SIZE), dtype=np.float32)

yy, xx = np.mgrid[0:TILE_SIZE, 0:TILE_SIZE]
checker_tile = ((xx + yy) % 2).astype(np.float32)  # 0/1 alternating

tile_map = {
    "white": white_tile,
    "black": black_tile,
    "neither": checker_tile,
}

tiles = [tile_map[b] for b in buckets] + [checker_tile] * pad

rows = []
for r in range(side):
    row_tiles = tiles[r*side:(r+1)*side]
    rows.append(np.concatenate(row_tiles, axis=1))
mosaic = np.concatenate(rows, axis=0)

plt.figure(figsize=(6, 6))
plt.imshow(mosaic, cmap="gray", vmin=0.0, vmax=1.0)
plt.title("white / black / checker = neither")
plt.axis("off")
plt.show()
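
For completeness, here is a rough sketch (not part of the script above) of how the mosaic could be decoded programmatically instead of scanning it with a phone. It assumes pyzbar is installed (pip install pyzbar, plus the system zbar library) and that the rendered mosaic really is a well-formed QR code; the decode_mosaic helper and its quiet_zone parameter are names we introduce here for illustration. QR readers generally expect a white quiet zone around the symbol, which the raw mosaic lacks, so the sketch pads it first.

# Hypothetical helper (not part of the original solve script): decode the mosaic with pyzbar.
import numpy as np
from PIL import Image
from pyzbar.pyzbar import decode

def decode_mosaic(mosaic: np.ndarray, quiet_zone: int = 16) -> list[str]:
    # Convert the 0.0/1.0 float mosaic to 8-bit grayscale and pad it with a
    # white border (quiet zone) so QR readers can lock onto the symbol.
    img = (mosaic * 255).astype(np.uint8)
    img = np.pad(img, quiet_zone, mode="constant", constant_values=255)
    return [r.data.decode() for r in decode(Image.fromarray(img))]

# Usage, after the script above has built `mosaic`:
# print(decode_mosaic(mosaic))  # prints the decoded payload(s), if any

If any tiles come out checkered ("neither"), the mosaic likely will not decode until those images are classified correctly.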