"""Smoke check: match a synthetic image against text labels with a CLIP model.

The script draws a red square on a white canvas, embeds that image and two
candidate captions with a SentenceTransformer model, prints the similarity
scores, and exits with an error message when the best-scoring caption is not
"a red square".
"""

from PIL import Image, ImageDraw
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/clip-ViT-B-32"

# Geometry and colours of the synthetic test picture.
CANVAS_SIZE = (224, 224)
SQUARE_BOX = (48, 48, 176, 176)
SQUARE_FILL = (220, 30, 30)

# The caption the image is expected to match best.
EXPECTED_LABEL = "a red square"

# Keyword arguments shared by the image and the text encode calls, so both
# sets of embeddings are produced with the same settings.
ENCODE_OPTIONS = {
    "normalize_embeddings": True,
    "convert_to_numpy": True,
    "show_progress_bar": False,
}

# Build the input picture: a white canvas with a filled red rectangle.
image = Image.new("RGB", CANVAS_SIZE, "white")
draw = ImageDraw.Draw(image)
draw.rectangle(SQUARE_BOX, fill=SQUARE_FILL)

model = SentenceTransformer(model_id)
print(f"model_id={model_id}")

# These attributes are probed rather than assumed, so the script still runs
# when the loaded model object does not provide them.
if hasattr(model, "modalities"):
    print(f"modalities={model.modalities}")
if hasattr(model, "supports"):
    print(f"supports_image={model.supports('image')}")

image_embedding = model.encode(image, **ENCODE_OPTIONS)

text_labels = [EXPECTED_LABEL, "a blue circle"]
text_embeddings = model.encode(text_labels, **ENCODE_OPTIONS)

# Row 0 of the similarity result is used as the per-label scores for the
# single image embedding.
similarity_matrix = model.similarity(image_embedding, text_embeddings)
scores = similarity_matrix[0]
best_index = int(scores.argmax())
best_label = text_labels[best_index]

print(f"image_embedding_shape={image_embedding.shape}")
print(f"text_embedding_shape={text_embeddings.shape}")
for label, score in zip(text_labels, scores):
    print(f"score[{label}]={float(score):.4f}")
print(f"best_text_match={best_label}")

# Fail loudly (non-zero exit) when the image is not matched to its caption.
if best_label != EXPECTED_LABEL:
    raise SystemExit("unexpected text match")