"""Fine-tune a small sentence-embedding model with a Matryoshka objective.

The script trains on eight hand-written support-desk sentence pairs, saves the
result under ``models/support-matryoshka/final``, reloads it, and reports the
cosine Spearman correlation at every configured truncation dimension, followed
by a tiny retrieval example at the smallest dimension.

The evaluation reuses the training pairs, so the printed numbers are a smoke
test of the pipeline rather than a held-out quality measurement.
"""

from pathlib import Path

from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.sentence_transformer import losses
from sentence_transformers.sentence_transformer.evaluation import (
    EmbeddingSimilarityEvaluator,
    SimilarityFunction,
)

# Single source of truth for the nested embedding sizes: the loss, the
# per-dimension evaluation loop and the retrieval demo all read from here, so
# they cannot drift apart.
# NOTE(review): 768 is presumably the base model's full embedding width;
# confirm against the model card before changing the checkpoint.
MATRYOSHKA_DIMS = (768, 256, 128, 64)

output_dir = Path("models/support-matryoshka")
# Computed once and reused for saving, reporting and reloading.
final_dir = output_dir / "final"

model = SentenceTransformer(
    "sentence-transformers/paraphrase-albert-small-v2",
    model_kwargs={"torch_dtype": "float32"},
)

train_dataset = Dataset.from_dict(
    {
        "sentence1": [
            "reset a forgotten admin password",
            "restore a deleted customer record",
            "create a read only database user",
            "rotate an expired api token",
            "download the latest invoice",
            "configure daily database backups",
            "invite a new support agent",
            "archive a completed support case",
        ],
        "sentence2": [
            "help an administrator regain account access",
            "recover a customer profile that was removed",
            "add a user that can only read from the database",
            "replace a token that has expired",
            "retrieve the newest billing invoice",
            "schedule backups for the database every day",
            "add another agent to the support team",
            "close and archive a resolved case",
        ],
        "score": [0.95, 0.92, 0.9, 0.91, 0.88, 0.89, 0.86, 0.84],
    }
)


def _pair_evaluator(name, truncate_dim=None):
    """Build a cosine-similarity evaluator over the training pairs.

    Args:
        name: Label passed to the evaluator as its ``name``.
        truncate_dim: Embedding size to truncate to before scoring, or
            ``None`` to leave the evaluator's default in place.

    Returns:
        An ``EmbeddingSimilarityEvaluator`` with CSV output disabled.
    """
    return EmbeddingSimilarityEvaluator(
        sentences1=train_dataset["sentence1"],
        sentences2=train_dataset["sentence2"],
        scores=train_dataset["score"],
        main_similarity=SimilarityFunction.COSINE,
        name=name,
        truncate_dim=truncate_dim,
        write_csv=False,
    )


# CoSENTLoss is the base objective; MatryoshkaLoss wraps it with the
# configured list of embedding sizes.
base_loss = losses.CoSENTLoss(model)
loss = losses.MatryoshkaLoss(
    model=model,
    loss=base_loss,
    matryoshka_dims=list(MATRYOSHKA_DIMS),
)

args = SentenceTransformerTrainingArguments(
    output_dir=str(output_dir),
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    warmup_steps=0,
    # Without an eval strategy the trainer never invokes the evaluator passed
    # below (the default is "no"); evaluate once at the end of each epoch.
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none",
    disable_tqdm=True,
    logging_steps=1,
)

evaluator = _pair_evaluator("support-pairs")

print(f"matryoshka dimensions: {loss.matryoshka_dims}")

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
    evaluator=evaluator,
)
trainer.train()

# Pass the path as a string for both save and load so the two calls agree.
model.save_pretrained(str(final_dir))
print(f"saved model: {final_dir}")

# Reload from disk so the scores below describe the saved artefact rather than
# the in-memory training model.
trained_model = SentenceTransformer(str(final_dir))

for dim in MATRYOSHKA_DIMS:
    dim_evaluator = _pair_evaluator(f"support-pairs-{dim}", truncate_dim=dim)
    results = dim_evaluator(trained_model)
    # primary_metric is read after the call, and used as the key into results.
    print(f"{dim} dimensions cosine_spearman: {results[dim_evaluator.primary_metric]:.4f}")

# Retrieval demo at the smallest configured dimension.
retrieval_dim = min(MATRYOSHKA_DIMS)
query = ["reset admin login"]
documents = [
    "help an administrator regain account access",
    "retrieve the newest billing invoice",
]
embeddings = trained_model.encode(
    query + documents,
    normalize_embeddings=True,
    truncate_dim=retrieval_dim,
)
# Row 0 is the query; the remaining rows are the documents.
scores = trained_model.similarity(embeddings[:1], embeddings[1:])[0]
print(f"{retrieval_dim} dimensions retrieval scores: {scores[0]:.4f}, {scores[1]:.4f}")