posts/raggruppamento-testuale-con-tf-idf
# import required sklearn libs
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# import other required libs
import pandas as pd
import numpy as np

# string manipulation libs
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# viz libs
import matplotlib.pyplot as plt
import seaborn as sns
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'alt.atheism',
    'soc.religion.christian',
]

dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame(dataset.data, columns=["corpus"])
# clean the corpus with preprocess_text (defined in the helper file below)
df['cleaned'] = df['corpus'].apply(lambda x: preprocess_text(x, remove_stopwords=True))
# initialize the vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to the cleaned texts and returns the matrix of vectors X
X = vectorizer.fit_transform(df['cleaned'])
# initialize KMeans with 3 clusters (one per macro-topic: sport, technology, religion)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
clusters = kmeans.labels_
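
# (sketch, not in the original gist) a quick elbow check to support the choice
# of n_clusters=3: fit KMeans for several values of k and look for the point
# where the inertia curve flattens out
ks = range(2, 10)
inertias = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in ks]
plt.plot(list(ks), inertias, marker="o")
plt.xlabel("k")
plt.ylabel("inertia")
plt.show()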
# initialize PCA with 2 components
pca = PCA(n_components=2, random_state=42)
# PCA needs a dense array, so convert the sparse TF-IDF matrix before fitting
pca_vecs = pca.fit_transform(X.toarray())
# save the two dimensions in x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
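
# (sketch, not in the original gist) sanity check: how much variance the two
# principal components actually retain; with sparse TF-IDF vectors this is
# usually low, so the 2-D scatter plot is only a rough visualization
print("explained variance ratio:", pca.explained_variance_ratio_)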
# assign clusters and PCA vectors to columns in the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1

# map numeric cluster ids to topic names; mapping found through get_top_keywords (see helper file below)
cluster_map = {0: "sport", 1: "technology", 2: "religion"}
df['cluster'] = df['cluster'].map(cluster_map)
# set figure size
plt.figure(figsize=(12, 7))
# set title
plt.title("TF-IDF + KMeans clustering of the 20 newsgroups dataset", fontdict={"fontsize": 18})
# set axis labels
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create a scatter plot with seaborn, where hue is the cluster used to group the data
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
plt.show()
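
# (sketch, not in the original gist) cross-tabulate the unsupervised clusters
# against the true newsgroup labels returned by fetch_20newsgroups; the six
# categories should collapse roughly into the three macro-topics
true_labels = pd.Series(dataset.target).map(lambda i: dataset.target_names[i])
print(pd.crosstab(df["cluster"], true_labels))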
# --- helper functions (kept in a separate file in the original gist) ---

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Clean the input text by
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords (optional)
    - lowercasing
    - removing excess whitespace

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): whether to remove stopwords

    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special characters (keep letters only)
    text = re.sub("[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # 1. tokenize the text
        tokens = nltk.word_tokenize(text)
        # 2. keep only tokens that are not stopwords (a set makes the lookup fast)
        stop_words = set(stopwords.words("english"))
        tokens = [w for w in tokens if w.lower() not in stop_words]
        # 3. join the tokens back together
        text = " ".join(tokens)
    # lowercase and strip the cleaned text
    text = text.lower().strip()
    return text
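
# (sketch, not in the original gist) a quick test of preprocess_text on a toy
# string; "now" is in NLTK's English stopword list, so it should be dropped
print(preprocess_text("Check https://example.com NOW!! 123", remove_stopwords=True))
# expected output: "check"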
def get_top_keywords(n_terms):
    """Print the top n_terms keywords for each KMeans centroid."""
    # group the TF-IDF vectors by cluster and average them (renamed from df to avoid shadowing the global dataframe)
    tfidf_means = pd.DataFrame(X.todense()).groupby(clusters).mean()
    # map column indices back to the TF-IDF vocabulary
    terms = vectorizer.get_feature_names_out()
    for i, r in tfidf_means.iterrows():
        print('\nCluster {}'.format(i))
        # for each cluster, print the n_terms terms with the highest mean TF-IDF score
        print(','.join([terms[t] for t in np.argsort(r)[-n_terms:]]))
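
# (sketch, not in the original gist) inspecting the top 10 terms per centroid;
# this is how the numeric-cluster -> topic mapping used in cluster_map above
# can be derived (the exact terms depend on the fetched data)
get_top_keywords(10)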