Skip to content

Instantly share code, notes, and snippets.

@spipm
Created January 27, 2025 20:39
Show Gist options
  • Save spipm/d02038e97b339019bfe01a68bd473f9c to your computer and use it in GitHub Desktop.
Save spipm/d02038e97b339019bfe01a68bd473f9c to your computer and use it in GitHub Desktop.
# Test semantic similarity between sentences using a small language model
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Load the tokenizer and encoder for a multilingual MiniLM sentence-embedding
# checkpoint. Both are created once at module level so that every call to
# get_sentence_embedding() reuses the same instances.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
def get_sentence_embedding(sentence):
    """Return the mean-pooled embedding of a sentence (or batch of sentences).

    The input is tokenized with the module-level ``tokenizer`` and encoded
    with the module-level ``model``; the per-token hidden states are then
    averaged into one vector per input.

    Pooling is weighted by the attention mask so that padding tokens do not
    contribute to the average. For a single sentence the mask is all ones, so
    the result equals a plain mean over tokens (up to floating-point
    rounding); for a list of sentences of different lengths it keeps the
    shorter sentences' embeddings from being diluted by padding.

    Args:
        sentence: A string, or a list of strings to embed as one batch.

    Returns:
        A ``torch.Tensor`` with one row per input sentence.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1.0 for real tokens,
    # 0.0 for padding.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)

    # Mask-aware mean pooling: sum only real tokens, divide by their count.
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no real tokens.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Each sentence is embedded with ``get_sentence_embedding``, converted to a
    NumPy array, and the pair is scored with scikit-learn's
    ``cosine_similarity``. The single entry of the resulting matrix is
    returned.
    """
    first_vec, second_vec = (
        get_sentence_embedding(text).numpy() for text in (sentence1, sentence2)
    )
    score_matrix = cosine_similarity(first_vec, second_vec)
    return score_matrix[0, 0]
# Dutch example sentence pairs: paraphrases, reorderings, negations,
# unrelated sentences, and a few single-word comparisons.
pairs = [
    ("De kat zit op de mat.", "Een kat zit op het tapijt."),
    ("Hey, hoe gaat het?", "Hallo, alles goed daar?"),
    ("Het gaat goed met mij", "Het gaat zo z'n gangetje"),
    ("Het gaat goed met mij", "Met mij gaat het goed"),
    ("Zin om dit weekend deze kant op te komen?", "Heb je zin om hierheen te komen dit weekend?"),
    ("Het is bij mij druk op werk", "Bij mij op werk is het druk"),
    ("stuur je huidige locatie", "geef door waar je bent"),
    ("houd alles goed in de gaten", "houd je ogen open"),
    ("Dat wordt pompen met die subwoofer", "Drommels drommels drommels"),
    ("De bergbeklimmer was Henk niet", "Henk is de berg opgeklommen"),
    ("De bergbeklimmer was Henk niet", "Henk is de berg niet opgeklommen"),
    ("De bergbeklimmer was Henk niet", "De berg beklimmen was niets voor Henk"),
    ("De bergbeklimmer was Henk niet", "Henk vond de berg beklimmen maar niets"),
    ("dag", "dan"),
    ("flevoland", "walibi"),
    ("denken", "dingen"),
    ("vingers", "tengels"),
]

# Print each pair followed by its similarity score and a blank separator line.
for left, right in pairs:
    score = calculate_similarity(left, right)
    print(left, right, score, "", sep="\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment