jovianlin · December 21, 2020 07:53 · anandnc2 · Oct 24, 2017 · dinani65 · Dec 21, 2020
diff --git a/clustering_cosine_similarity_matrix.py b/clustering_cosine_similarity_matrix.py
 """
 ### Problem Statement ###
 Let's say you have a square matrix which consists of cosine similarities (values between 0 and 1).
 This square matrix can be of any size. 
 You want to find clusters that maximize the similarity values between the elements within each cluster.
 For example, for the following matrix:

  |  A  |  B  |  C  |  D
 A | 1.0 | 0.1 | 0.6 |  0.4
 B | 0.1 | 1.0 | 0.1 |  0.2
 C | 0.6 | 0.1 | 1.0 |  0.7
 D | 0.4 | 0.2 | 0.7 |  1.0

 You should get 2 clusters:
 cluster #1: B
 cluster #2: A, C, D
 """

 import numpy as np
 from sklearn.cluster import DBSCAN
 from sklearn.cluster import KMeans
 from sklearn.cluster import SpectralClustering

 # Pairwise cosine similarities for the items A, B, C, D (symmetric, ones on
 # the diagonal). A plain ndarray is used instead of np.matrix: NumPy no longer
 # recommends np.matrix and recent scikit-learn releases reject it as input.
 mat = np.array([
  [1.0, 0.1, 0.6, 0.4],
  [0.1, 1.0, 0.1, 0.2],
  [0.6, 0.1, 1.0, 0.7],
  [0.4, 0.2, 0.7, 1.0],
 ])
 n_clusters = 2

 # --- 1. Spectral clustering -------------------------------------------------
 # `mat` already holds similarities, so it is passed as a precomputed affinity
 # matrix. With the default affinity ("rbf") scikit-learn would instead treat
 # every row as a feature vector and build a new kernel from those rows.
 # random_state is fixed so repeated runs give the same labels.
 spectral_labels = SpectralClustering(
  n_clusters, affinity='precomputed', random_state=0
 ).fit_predict(mat)
 # Expected grouping: {A, C, D} and {B}, e.g. array([0, 1, 0, 0]).
 # The numeric label ids are arbitrary; only the grouping is meaningful.

 # --- 2. The same idea by hand -----------------------------------------------
 # Take the k eigenvectors of the matrix that belong to the k largest
 # eigenvalues, then run the k-means algorithm on the rows of that new matrix.
 # np.linalg.eigh returns eigenvalues in ascending order, so the wanted
 # eigenvectors are the last k columns.
 eigen_values, eigen_vectors = np.linalg.eigh(mat)
 top_eigen_vectors = eigen_vectors[:, -n_clusters:]
 # n_init is set explicitly so the result does not depend on the library's
 # version-specific default.
 kmeans_labels = KMeans(
  n_clusters=n_clusters, init='k-means++', n_init=10, random_state=0
 ).fit_predict(top_eigen_vectors)
 # Expected grouping: {A, C, D} and {B}, e.g. array([0, 1, 0, 0]).

 # --- 3. Density-based clustering --------------------------------------------
 # For the cases where the algorithm should figure out the number of clusters
 # by itself, a density-based algorithm such as DBSCAN can be used.
 # DBSCAN works on distances, so the similarities are converted to cosine
 # distances (1 - similarity) and passed as a precomputed distance matrix.
 # min_samples=1 makes every point a core point, so nothing is labelled noise.
 dbscan_labels = DBSCAN(min_samples=1, metric='precomputed').fit_predict(1.0 - mat)
 # With the default eps=0.5 this should yield array([0, 1, 0, 0]): A and D are
 # chained together through C (distances 0.4 and 0.3), B stays on its own.

 # Source: http://stackoverflow.com/a/30093501
	"""
	### Problem Statement ###
	Let's say you have a square matrix which consists of cosine similarities (values between 0 and 1).
	This square matrix can be of any size.
	You want to find clusters that maximize the similarity values between the elements within each cluster.
	For example, for the following matrix:

	  |  A  |  B  |  C  |  D
	A | 1.0 | 0.1 | 0.6 | 0.4
	B | 0.1 | 1.0 | 0.1 | 0.2
	C | 0.6 | 0.1 | 1.0 | 0.7
	D | 0.4 | 0.2 | 0.7 | 1.0

	You should get 2 clusters:
	cluster #1: B
	cluster #2: A, C, D
	"""

	import numpy as np
	from sklearn.cluster import DBSCAN
	from sklearn.cluster import KMeans
	from sklearn.cluster import SpectralClustering

	# Pairwise cosine similarities for the items A, B, C, D (symmetric, ones on
	# the diagonal). A plain ndarray is used instead of np.matrix: NumPy no longer
	# recommends np.matrix and recent scikit-learn releases reject it as input.
	mat = np.array([
		[1.0, 0.1, 0.6, 0.4],
		[0.1, 1.0, 0.1, 0.2],
		[0.6, 0.1, 1.0, 0.7],
		[0.4, 0.2, 0.7, 1.0],
	])
	n_clusters = 2

	# --- 1. Spectral clustering -------------------------------------------------
	# `mat` already holds similarities, so it is passed as a precomputed affinity
	# matrix. With the default affinity ("rbf") scikit-learn would instead treat
	# every row as a feature vector and build a new kernel from those rows.
	# random_state is fixed so repeated runs give the same labels.
	spectral_labels = SpectralClustering(
		n_clusters, affinity='precomputed', random_state=0
	).fit_predict(mat)
	# Expected grouping: {A, C, D} and {B}, e.g. array([0, 1, 0, 0]).
	# The numeric label ids are arbitrary; only the grouping is meaningful.

	# --- 2. The same idea by hand -----------------------------------------------
	# Take the k eigenvectors of the matrix that belong to the k largest
	# eigenvalues, then run the k-means algorithm on the rows of that new matrix.
	# np.linalg.eigh returns eigenvalues in ascending order, so the wanted
	# eigenvectors are the last k columns.
	eigen_values, eigen_vectors = np.linalg.eigh(mat)
	top_eigen_vectors = eigen_vectors[:, -n_clusters:]
	# n_init is set explicitly so the result does not depend on the library's
	# version-specific default.
	kmeans_labels = KMeans(
		n_clusters=n_clusters, init='k-means++', n_init=10, random_state=0
	).fit_predict(top_eigen_vectors)
	# Expected grouping: {A, C, D} and {B}, e.g. array([0, 1, 0, 0]).

	# --- 3. Density-based clustering --------------------------------------------
	# For the cases where the algorithm should figure out the number of clusters
	# by itself, a density-based algorithm such as DBSCAN can be used.
	# DBSCAN works on distances, so the similarities are converted to cosine
	# distances (1 - similarity) and passed as a precomputed distance matrix.
	# min_samples=1 makes every point a core point, so nothing is labelled noise.
	dbscan_labels = DBSCAN(min_samples=1, metric='precomputed').fit_predict(1.0 - mat)
	# With the default eps=0.5 this should yield array([0, 1, 0, 0]): A and D are
	# chained together through C (distances 0.4 and 0.3), B stays on its own.

	# Source: http://stackoverflow.com/a/30093501