Created
March 22, 2022 00:48
-
-
Save Dref360/4df9723789d089b43851d8a58047364f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim
import nltk
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import pandas as pd

# Make sure the NLTK data packages used by the lemmatizer and the stemmer
# below are available locally before any text is processed.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# Taken from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Filler words that are dropped during preprocessing in addition to gensim's
# built-in STOPWORDS list.
FILTERED_TOKENS = {'okay', 'like', 'know', 'yeah', 'think', 'thing'}
# Lazily-created singletons: building the stemmer and the lemmatizer once
# avoids re-creating them for every single token.
_STEMMER = None
_LEMMATIZER = None


def lemmatize_stemming(text):
    """Lemmatize a token as a verb, then stem the result.

    :param text: A single token.
    :return: The English Snowball stem of the WordNet verb lemma of ``text``.
    """
    global _STEMMER, _LEMMATIZER
    if _STEMMER is None:
        _STEMMER = SnowballStemmer('english', ignore_stopwords=True)
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than three characters and is listed in neither
    gensim's ``STOPWORDS`` nor ``FILTERED_TOKENS``; every kept token is passed
    through ``lemmatize_stemming``.

    :param text: Raw text of one document.
    :return: List of lemmatized and stemmed tokens.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3
        and word not in stopwords
        and word not in FILTERED_TOKENS
    ]
def find_topics(df: pd.DataFrame, text_column: str, *, num_topics: int = 10,
                passes: int = 2, workers: int = 4, no_below: int = 15,
                no_above: float = 0.2, keep_n: int = 100, random_state=None):
    """Find topics for a dataframe.

    Each row of ``text_column`` is preprocessed into tokens, a dictionary and
    bag-of-words corpus are built from them, the corpus is TF-IDF weighted and
    an LDA model is trained on it.

    :note Just print for now.
    :param df: Dataframe with data
    :param text_column: Column name to get segment.
    :param num_topics: Number of topics requested from the LDA model.
    :param passes: Number of training passes over the corpus.
    :param workers: Number of worker processes used by ``LdaMulticore``.
    :param no_below: Passed to ``Dictionary.filter_extremes``.
    :param no_above: Passed to ``Dictionary.filter_extremes``.
    :param keep_n: Passed to ``Dictionary.filter_extremes``.
    :param random_state: Optional seed forwarded to ``LdaMulticore``; the
        default ``None`` keeps the library's default seeding.
    :return: Nothing
    """
    processed_docs = df[text_column].map(preprocess)

    # Build the vocabulary, then prune it with the given thresholds.
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Re-weight the raw counts with TF-IDF before fitting the topic model.
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    lda_model_tfidf = models.LdaMulticore(
        corpus_tfidf,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

    # -1 asks print_topics for every topic.
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment