"""Demo: text-to-image retrieval over three generated shape images.

The script draws three PNG samples into ``demo-images/``, embeds the images
and three text queries with the same SentenceTransformer model, and prints
the best-scoring image for every query together with a final check that each
query matched the image it was expected to match.
"""

from pathlib import Path

import numpy as np
from PIL import Image, ImageDraw
from sentence_transformers import SentenceTransformer

# Single source of truth for the model id: it is both loaded and reported.
MODEL_NAME = "sentence-transformers/clip-ViT-B-32"


def make_sample(path, color, shape):
    """Draw one filled shape on a 224x224 white RGB canvas and save it.

    Args:
        path: Destination the image is saved to (passed to ``Image.save``).
        color: Fill colour handed to the Pillow drawing call.
        shape: Label selecting the geometry; one of ``"red square"``,
            ``"blue circle"`` or ``"green triangle"``.

    Raises:
        ValueError: If ``shape`` is not one of the supported labels. Nothing
            is written in that case, so a blank canvas can never end up in
            the index by accident.
    """
    image = Image.new("RGB", (224, 224), "white")
    draw = ImageDraw.Draw(image)
    if shape == "red square":
        draw.rectangle((46, 46, 178, 178), fill=color)
    elif shape == "blue circle":
        draw.ellipse((42, 42, 182, 182), fill=color)
    elif shape == "green triangle":
        draw.polygon([(112, 34), (38, 188), (186, 188)], fill=color)
    else:
        raise ValueError(f"unsupported shape label: {shape!r}")
    image.save(path)


def _load_rgb(path):
    """Return an in-memory RGB copy of the image at ``path``.

    ``Image.open`` keeps the underlying file open until the image is closed;
    copying the pixels inside the ``with`` block lets the file handle be
    released immediately while the returned image stays usable.
    """
    with Image.open(path) as handle:
        return handle.convert("RGB")


# --- Generate the sample images -------------------------------------------
image_dir = Path("demo-images")
image_dir.mkdir(exist_ok=True)

# (filename, label, RGB fill colour)
samples = [
    ("red-square.png", "red square", (220, 30, 30)),
    ("blue-circle.png", "blue circle", (30, 80, 220)),
    ("green-triangle.png", "green triangle", (40, 155, 75)),
]

for filename, label, color in samples:
    make_sample(image_dir / filename, color, label)

# --- Embed images and text queries with the same model --------------------
model = SentenceTransformer(MODEL_NAME)

images = [_load_rgb(image_dir / filename) for filename, _, _ in samples]
image_embeddings = model.encode(
    images,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

# (query text, label of the image the query is expected to retrieve)
queries = [
    ("a red square", "red square"),
    ("a blue circle", "blue circle"),
    ("a green triangle", "green triangle"),
]
query_embeddings = model.encode(
    [query for query, _ in queries],
    normalize_embeddings=True,
    convert_to_numpy=True,
)

# Both sides were encoded with normalize_embeddings=True, so this dot
# product is the cosine similarity; rows are queries, columns are images.
scores = np.matmul(query_embeddings, image_embeddings.T)
labels = [label for _, label, _ in samples]

# --- Report ---------------------------------------------------------------
print(f"model: {MODEL_NAME}")
print(f"indexed images: {len(samples)}")
print(f"embedding dimension: {image_embeddings.shape[1]}")
print("top matches:")

all_expected = True
for row, (query, expected) in enumerate(queries):
    best_index = int(np.argmax(scores[row]))
    matched = labels[best_index]
    score = scores[row, best_index]
    print(f"- query={query!r} match={matched} score={score:.4f}")
    all_expected = all_expected and matched == expected

print(f"all expected matches: {all_expected}")