"""Mine translated sentence pairs from two small sentence lists.

The script embeds a list of English sentences and a list of Spanish sentences
with a multilingual sentence-embedding model, searches for nearest neighbours
in both directions, scores every retained pair with a ratio margin (pair score
divided by the average neighbourhood score of both sentences), prints the
ranked pairs, and finally checks them against ``expected_pairs``.

Exits with ``SystemExit`` and a ``verification: FAIL`` message when an expected
pair is not the best mined match for its source sentence.
"""

from sentence_transformers import SentenceTransformer
from sentence_transformers import util

source_sentences = [
    "Reset a customer password.",
    "Renew the TLS certificate.",
    "Export customer invoices.",
    "Deploy the billing service.",
    "Enable admin multi-factor authentication.",
]

target_sentences = [
    "Exportar facturas de clientes.",
    "Implementar el servicio de facturacion.",
    "Restablecer la contrasena de un cliente.",
    "Renovar el certificado TLS.",
    "Activar la autenticacion multifactor del administrador.",
]

# Ground truth for the final verification: source index -> target index.
expected_pairs = {
    0: 2,
    1: 3,
    2: 0,
    3: 1,
    4: 4,
}

model = SentenceTransformer(
    "sentence-transformers/"
    "paraphrase-multilingual-MiniLM-L12-v2",
    device="cpu",
)

# Embeddings are requested normalized, so the dot score computed below is used
# as the cosine similarity of each pair.
source_embeddings = model.encode(
    source_sentences,
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=False,
)
target_embeddings = model.encode(
    target_sentences,
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=False,
)

# Neighbourhood size used for both search directions.
k = min(4, len(target_sentences))

source_to_target = util.semantic_search(
    source_embeddings,
    target_embeddings,
    top_k=k,
    score_function=util.dot_score,
)
target_to_source = util.semantic_search(
    target_embeddings,
    source_embeddings,
    top_k=k,
    score_function=util.dot_score,
)

# Average score of each sentence's retrieved neighbours; these form the
# denominator of the ratio margin.
source_means = [
    sum(hit["score"] for hit in hits) / len(hits)
    for hits in source_to_target
]
target_means = [
    sum(hit["score"] for hit in hits) / len(hits)
    for hits in target_to_source
]

candidates = []
for source_id, hits in enumerate(source_to_target):
    for hit in hits:
        target_id = hit["corpus_id"]
        # Keep the pair only when this source is the target's top reverse hit.
        # This relies on semantic_search returning each hit list best-first
        # (per its documentation).
        if target_to_source[target_id][0]["corpus_id"] != source_id:
            continue
        cosine = hit["score"]
        mean_score = (
            source_means[source_id] + target_means[target_id]
        ) / 2
        margin = cosine / mean_score
        candidates.append((margin, cosine, source_id, target_id))

# Tuples sort by margin first, so the strongest pairs come first.
candidates.sort(reverse=True)

print("Mined translated sentence pairs:")
for rank, result in enumerate(candidates, start=1):
    margin, cosine, source_id, target_id = result
    print(f"{rank}. margin={margin:.3f} cosine={cosine:.3f}")
    print(f" source[{source_id}]: {source_sentences[source_id]}")
    print(f" target[{target_id}]: {target_sentences[target_id]}")

# One source can be the top reverse hit of several targets and therefore occur
# more than once in ``candidates``. The list is ordered best-first, so keep the
# first pair seen per source; a plain dict comprehension would let the
# lowest-margin duplicate overwrite the best match.
found_pairs = {}
for _, _, source_id, target_id in candidates:
    found_pairs.setdefault(source_id, target_id)

missing = {
    source_id: target_id
    for source_id, target_id in expected_pairs.items()
    if found_pairs.get(source_id) != target_id
}
if missing:
    raise SystemExit(f"verification: FAIL missing pairs {missing}")

print("verification: PASS all expected translation pairs recovered")