"""Incrementally refresh a sentence-embedding index and answer one query.

Documents are read from ``SOURCE`` (one JSON object per line with ``id`` and
``text`` keys), embedded with ``MODEL_NAME`` and cached in ``INDEX`` so that
only documents whose text changed are re-encoded on the next run.
"""

import hashlib
import json
import sys
from pathlib import Path

import numpy as np
from sentence_transformers import SentenceTransformer

SOURCE = Path("chatbot_docs.jsonl")
INDEX = Path("chatbot_embeddings.npz")
MODEL_NAME = "sentence-transformers/paraphrase-albert-small-v2"


def load_documents(path: Path) -> list:
    """Parse a JSON Lines file into a list of decoded values.

    Args:
        path: File holding one JSON value per line; blank lines are skipped.

    Returns:
        The decoded values in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file rather than calling ``str.splitlines()``: splitlines
    # also breaks on U+2028, U+2029 and U+0085, which may appear unescaped
    # inside JSON strings and would cut such a record in half.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def text_hash(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def load_existing_index(path: Path) -> dict:
    """Load the cached embeddings written by a previous run.

    Args:
        path: Location of the ``.npz`` index.

    Returns:
        A mapping ``str(id) -> {"hash": str, "embedding": ndarray}``. The
        mapping is empty when the file does not exist, or when the index
        records a model name different from ``MODEL_NAME`` (its vectors would
        not be comparable with freshly encoded ones).
    """
    if not path.exists():
        return {}
    # NOTE(review): allow_pickle=True executes arbitrary pickled code if the
    # index file is ever replaced by an untrusted one. The arrays written by
    # refresh_index only need pickling when ids are of mixed/object dtype;
    # consider switching to allow_pickle=False once that case is ruled out.
    with np.load(path, allow_pickle=True) as data:
        # Indexes written before the model tag existed carry no "model" entry
        # and are accepted as-is to stay backward compatible.
        if "model" in data.files and str(data["model"]) != MODEL_NAME:
            return {}
        return {
            str(doc_id): {"hash": str(doc_hash), "embedding": embedding}
            for doc_id, doc_hash, embedding in zip(
                data["ids"], data["hashes"], data["embeddings"]
            )
        }


def refresh_index(query: str) -> None:
    """Rebuild the embedding index and print the best match for *query*.

    Documents whose text hash matches the cached entry reuse the stored
    embedding; all others are re-encoded. The full index is then rewritten to
    ``INDEX`` and a short report is printed to stdout.

    Args:
        query: Free-text query to score against every document.

    Raises:
        ValueError: If ``SOURCE`` contains no documents.
    """
    docs = load_documents(SOURCE)
    if not docs:
        # np.vstack and np.argmax cannot operate on zero rows; fail early,
        # before paying for the model load, with an actionable message.
        raise ValueError(f"no documents found in {SOURCE}")

    previous = load_existing_index(INDEX)
    model = SentenceTransformer(MODEL_NAME)

    embeddings = []
    hashes = []
    refreshed = []
    for doc in docs:
        # The cache is keyed by str(id), so look entries up the same way;
        # otherwise non-string ids would never match and always be re-encoded.
        doc_key = str(doc["id"])
        doc_hash = text_hash(doc["text"])
        cached = previous.get(doc_key)
        if cached is not None and cached["hash"] == doc_hash:
            embedding = cached["embedding"]
        else:
            embedding = model.encode_document(
                doc["text"],
                normalize_embeddings=True,
                show_progress_bar=False,
            )
            refreshed.append(doc_key)
        embeddings.append(embedding)
        hashes.append(doc_hash)

    matrix = np.vstack(embeddings)
    np.savez(
        INDEX,
        ids=np.array([doc["id"] for doc in docs]),
        hashes=np.array(hashes),
        texts=np.array([doc["text"] for doc in docs]),
        embeddings=matrix,
        # Record which model produced the vectors so a later run can detect
        # that the cache is stale after MODEL_NAME changes.
        model=np.array(MODEL_NAME),
    )

    query_embedding = model.encode_query(
        query,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    # Both sides are requested with normalize_embeddings=True, so this dot
    # product ranks documents by cosine similarity.
    scores = matrix @ query_embedding
    best = int(np.argmax(scores))
    best_id = docs[best]["id"]

    print(f"documents indexed: {len(docs)}")
    print("documents refreshed: " + (", ".join(refreshed) if refreshed else "none"))
    print(f"top result: {best_id} score={float(scores[best]):.4f}")
    print(docs[best]["text"])


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} QUERY")
    refresh_index(sys.argv[1])