posts/raggruppamento-testuale-con-tf-idf
# import required sklearn libs
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# import other required libs
import pandas as pd
import numpy as np

# string manipulation libs
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# viz libs
import matplotlib.pyplot as plt
import seaborn as sns
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'alt.atheism',
    'soc.religion.christian',
]

dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame(dataset.data, columns=["corpus"])
# clean the corpus with preprocess_text (defined in the helper file below)
df['cleaned'] = df['corpus'].apply(lambda x: preprocess_text(x, remove_stopwords=True))
# initialize the vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to the cleaned texts and returns the matrix of vectors X
X = vectorizer.fit_transform(df['cleaned'])
# initialize KMeans with 3 clusters (one per macro-topic: sport, technology, religion)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
clusters = kmeans.labels_
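
# (sketch, not in the original gist) a quick elbow check to support the choice
# of n_clusters=3: fit KMeans for several values of k and look for the point
# where the inertia curve flattens out
ks = range(2, 10)
inertias = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in ks]
plt.plot(list(ks), inertias, marker="o")
plt.xlabel("k")
plt.ylabel("inertia")
plt.show()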
# initialize PCA with 2 components
pca = PCA(n_components=2, random_state=42)
# PCA needs a dense array, so convert the sparse TF-IDF matrix before fitting
pca_vecs = pca.fit_transform(X.toarray())
# save the two dimensions in x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
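
# (sketch, not in the original gist) sanity check: how much variance the two
# principal components actually retain; with sparse TF-IDF vectors this is
# usually low, so the 2-D scatter plot is only a rough visualization
print("explained variance ratio:", pca.explained_variance_ratio_)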
# assign clusters and PCA vectors to columns in the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1

# map numeric cluster ids to topic names; mapping found through get_top_keywords (see helper file below)
cluster_map = {0: "sport", 1: "technology", 2: "religion"}
df['cluster'] = df['cluster'].map(cluster_map)
# set figure size
plt.figure(figsize=(12, 7))
# set title
plt.title("TF-IDF + KMeans clustering of the 20 newsgroups dataset", fontdict={"fontsize": 18})
# set axis labels
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create a scatter plot with seaborn, where hue is the cluster used to group the data
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
plt.show()
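
# (sketch, not in the original gist) cross-tabulate the unsupervised clusters
# against the true newsgroup labels returned by fetch_20newsgroups; the six
# categories should collapse roughly into the three macro-topics
true_labels = pd.Series(dataset.target).map(lambda i: dataset.target_names[i])
print(pd.crosstab(df["cluster"], true_labels))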
# --- helper functions (kept in a separate file in the original gist) ---

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Clean the input text by
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords (optional)
    - lowercasing
    - removing excess whitespace

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords

    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special characters (keep letters only)
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. tokenize the text
        tokens = nltk.word_tokenize(text)
        # 2. keep only tokens that are not stopwords (a set makes the lookup fast)
        stop_words = set(stopwords.words("english"))
        tokens = [w for w in tokens if w.lower() not in stop_words]
        # 3. join the tokens back together
        text = " ".join(tokens)
    # lowercase and strip the cleaned text
    text = text.lower().strip()
    return text
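
# (sketch, not in the original gist) a quick test of preprocess_text on a toy
# string; "now" is in NLTK's English stopword list, so it should be dropped
print(preprocess_text("Check https://example.com NOW!! 123", remove_stopwords=True))
# expected output: "check"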
def get_top_keywords(n_terms):
    """Print the top n_terms keywords for each KMeans centroid."""
    # group the TF-IDF vectors by cluster and average them (renamed from df to avoid shadowing the global dataframe)
    tfidf_means = pd.DataFrame(X.todense()).groupby(clusters).mean()
    # map column indices back to the TF-IDF vocabulary
    terms = vectorizer.get_feature_names_out()
    for i, r in tfidf_means.iterrows():
        print('\nCluster {}'.format(i))
        # for each cluster, print the n_terms terms with the highest mean TF-IDF score
        print(','.join([terms[t] for t in np.argsort(r)[-n_terms:]]))
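
# (sketch, not in the original gist) inspecting the top 10 terms per centroid;
# this is how the numeric-cluster -> topic mapping used in cluster_map above
# can be derived (the exact terms depend on the fetched data)
get_top_keywords(10)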