Created
January 11, 2019 17:32
-
-
Save Papaass/25cfd5c3bbfca0c5dd68137e60c1dfb2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from nltk import word_tokenize | |
# Two example sentences used throughout the demo.
phrase_1 = "La vie est courte mais la vie peut paraître longue"
phrase_2 = "La nuit est proche"
# The corpus is built from the sentences above so each text is written only once.
corpus = [phrase_1, phrase_2]
# Function returning the vocabulary (unique tokens) of a corpus
def vocabulary(corpus):
    """Return the distinct lowercase tokens of a corpus in first-seen order.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list of unique tokens. Each sentence is lowercased and split with
        ``word_tokenize``; tokens appear in order of first occurrence.
    """
    # The set gives O(1) duplicate checks; the list keeps first-seen order.
    seen = set()
    voc_clean = []
    for sentence in corpus:
        for token in word_tokenize(sentence.lower()):
            if token not in seen:
                seen.add(token)
                voc_clean.append(token)
    return voc_clean
# Function returning the bag-of-words vector of a sentence
def bagofwords(sentence, corpus):
    """Return the bag-of-words count vector of a sentence over a corpus.

    Args:
        sentence: Sentence string to encode.
        corpus: Iterable of sentence strings, passed to ``vocabulary``.

    Returns:
        An array created with ``np.zeros`` holding one entry per vocabulary
        token: the number of times that token occurs in the lowercased,
        tokenized sentence. Tokens not in the vocabulary are ignored.
    """
    vocab = vocabulary(corpus)
    # vocabulary() yields unique tokens, so each token has exactly one index;
    # building the map once avoids rescanning the vocabulary for every word.
    index_of = {token: i for i, token in enumerate(vocab)}
    bag_of_words = np.zeros(len(vocab))
    for token in word_tokenize(sentence.lower()):
        position = index_of.get(token)
        if position is not None:
            bag_of_words[position] += 1
    return bag_of_words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show the bag-of-words vector of the first example sentence.
print(bagofwords(phrase_1, corpus))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show the bag-of-words vector of the second example sentence.
print(bagofwords(phrase_2, corpus))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment