Created
January 27, 2025 20:39
-
-
Save spipm/d02038e97b339019bfe01a68bd473f9c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Test for semantic similarity between sentences using a small language model
from transformers import AutoTokenizer, AutoModel | |
import torch | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Checkpoint identifier shared by the tokenizer and the encoder so the two
# always come from the same pretrained multilingual model.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)
def get_sentence_embedding(sentence):
    """Embed text with the module-level model using masked mean pooling.

    Args:
        sentence: Text handed straight to the tokenizer. A single string is
            what the rest of this script passes; a list of strings is also
            handled correctly because padding is excluded from the average.

    Returns:
        A torch tensor with one row per input text, holding the average of
        the final hidden states over the non-padding token positions.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # The tokenizer pads to the longest input (padding=True). A plain mean
    # over dim=1 would let padding vectors dilute shorter sentences, so weight
    # each position by its attention mask before averaging.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no attended tokens.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Args:
        sentence1: First sentence to compare.
        sentence2: Second sentence to compare.

    Returns:
        The single entry of the similarity matrix computed by
        ``cosine_similarity`` for the two embeddings.
    """
    # Embed in argument order and convert each tensor to a NumPy array,
    # which is the form passed to scikit-learn's cosine_similarity.
    vectors = [
        get_sentence_embedding(text).numpy()
        for text in (sentence1, sentence2)
    ]
    score_matrix = cosine_similarity(vectors[0], vectors[1])
    # One sentence on each side, so the only score sits at row 0, column 0.
    return score_matrix[0][0]
# Dutch sentence pairs to compare: paraphrases, negations, unrelated
# sentences, and a few single-word pairs.
pairs = [
    ("De kat zit op de mat.", "Een kat zit op het tapijt."),
    ("Hey, hoe gaat het?", "Hallo, alles goed daar?"),
    ("Het gaat goed met mij", "Het gaat zo z'n gangetje"),
    ("Het gaat goed met mij", "Met mij gaat het goed"),
    ("Zin om dit weekend deze kant op te komen?", "Heb je zin om hierheen te komen dit weekend?"),
    ("Het is bij mij druk op werk", "Bij mij op werk is het druk"),
    ("stuur je huidige locatie", "geef door waar je bent"),
    ("houd alles goed in de gaten", "houd je ogen open"),
    ("Dat wordt pompen met die subwoofer", "Drommels drommels drommels"),
    ("De bergbeklimmer was Henk niet", "Henk is de berg opgeklommen"),
    ("De bergbeklimmer was Henk niet", "Henk is de berg niet opgeklommen"),
    ("De bergbeklimmer was Henk niet", "De berg beklimmen was niets voor Henk"),
    ("De bergbeklimmer was Henk niet", "Henk vond de berg beklimmen maar niets"),
    ("dag", "dan"),
    ("flevoland", "walibi"),
    ("denken", "dingen"),
    ("vingers", "tengels"),
]

# Print each pair followed by its similarity score, with a blank line
# separating consecutive pairs.
for first, second in pairs:
    score = calculate_similarity(first, second)
    print(first, second, score, sep="\n", end="\n\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment