"""Semantic-search smoke test over a synthetic help-center corpus.

Builds a corpus by repeating six topic sentences across 400 numbered
"regions", embeds the corpus and one query with a MiniLM sentence encoder,
retrieves the top matches, and exits with an error unless the best match
belongs to the ``password-reset`` topic.
"""

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# (topic slug, canonical sentence) pairs that seed the synthetic corpus.
topics = [
    (
        "password-reset",
        "Reset a forgotten password from account settings, open the email link, "
        "and choose a new password.",
    ),
    (
        "invoice-export",
        "Export paid invoices from the billing dashboard as a CSV file for accounting.",
    ),
    (
        "api-token-rotation",
        "Rotate API tokens before sharing a new integration with a teammate.",
    ),
    (
        "notification-email",
        "Change the notification email address and confirm the new address for alerts.",
    ),
    (
        "workspace-theme",
        "Change the dashboard color theme for a workspace user profile.",
    ),
    (
        "vector-search",
        "Store dense embeddings in a vector index for semantic search retrieval.",
    ),
]

# One record per (shard, topic) pair, shard-major so that corpus order matches
# the order of the embeddings produced below. Shards run 1..400 and are
# zero-padded to three digits in both the id and the text suffix.
corpus = [
    {
        "id": f"{slug}-{shard:03d}",
        "topic": slug,
        "text": f"{sentence} Region {shard:03d}.",
    }
    for shard in range(1, 401)
    for slug, sentence in topics
]
documents = [entry["text"] for entry in corpus]

query = "How does a user reset a forgotten password with an email link?"

# Retrieval settings, echoed in the summary printed further down.
query_chunk_size = 1
corpus_chunk_size = 256
top_k = 3

# Keyword arguments shared by the document and query encoders. Embeddings are
# normalized, so the dot-product score used for ranking should coincide with
# cosine similarity.
encode_options = {
    "normalize_embeddings": True,
    "convert_to_tensor": True,
    "show_progress_bar": False,
}
document_embeddings = model.encode_document(
    documents,
    batch_size=128,
    **encode_options,
)
query_embedding = model.encode_query(query, **encode_options)

# Only one query is searched, so keep just the first result list.
search_results = util.semantic_search(
    query_embedding,
    document_embeddings,
    query_chunk_size=query_chunk_size,
    corpus_chunk_size=corpus_chunk_size,
    top_k=top_k,
    score_function=util.dot_score,
)
hits = search_results[0]

# Run summary, one "label: value" line per entry.
summary = (
    ("corpus documents", len(corpus)),
    ("embedding dimension", document_embeddings.shape[1]),
    ("query chunk size", query_chunk_size),
    ("corpus chunk size", corpus_chunk_size),
    ("top k", top_k),
    ("query", query),
)
for label, value in summary:
    print(f"{label}: {value}")

print("top matches:")
for position, match in enumerate(hits, start=1):
    matched = corpus[match["corpus_id"]]
    print(
        f"{position}. {matched['id']} topic={matched['topic']} "
        f"score={match['score']:.4f}"
    )

# Verification: the single best hit must come from the password-reset topic.
best_record = corpus[hits[0]["corpus_id"]]
if best_record["topic"] != "password-reset":
    raise SystemExit("unexpected top topic")
print("verification: PASS password reset documents ranked first")