from PIL import Image, ImageDraw
from sentence_transformers import SentenceTransformer


# Smoke test: embed one synthetic image and two captions with the same model,
# then verify that the caption describing the image gets the highest score.
model_id = "sentence-transformers/clip-ViT-B-32"

# Synthetic input: a white canvas with a filled red rectangle drawn on it.
canvas_size = (224, 224)
square_box = (48, 48, 176, 176)
square_fill = (220, 30, 30)

image = Image.new("RGB", canvas_size, "white")
draw = ImageDraw.Draw(image)
draw.rectangle(square_box, fill=square_fill)

# Candidate captions; the first one is the match the final check expects.
expected_label = "a red square"
text_labels = [expected_label, "a blue circle"]

# Keyword arguments shared by the image and text encode calls below.
encode_options = {
    "normalize_embeddings": True,
    "convert_to_numpy": True,
    "show_progress_bar": False,
}

model = SentenceTransformer(model_id)

print(f"model_id={model_id}")

# Optional introspection: each line is printed only when the loaded model
# object actually exposes the attribute.
if hasattr(model, "modalities"):
    print(f"modalities={model.modalities}")
if hasattr(model, "supports"):
    print(f"supports_image={model.supports('image')}")

image_embedding = model.encode(image, **encode_options)
text_embeddings = model.encode(text_labels, **encode_options)

# Only the first row of the similarity result is used -- presumably the
# single image's score against each caption (TODO confirm result layout).
similarity = model.similarity(image_embedding, text_embeddings)
scores = similarity[0]
best_index = int(scores.argmax())
best_label = text_labels[best_index]

print(f"image_embedding_shape={image_embedding.shape}")
print(f"text_embedding_shape={text_embeddings.shape}")
for label, score in zip(text_labels, scores):
    print(f"score[{label}]={float(score):.4f}")
print(f"best_text_match={best_label}")

# Abort with an error message when the top-scoring caption is not the
# expected one.
if best_label != expected_label:
    raise SystemExit("unexpected text match")