Created
March 20, 2024 11:29
-
-
Save dineshdharme/abb0b726f7351e5228f4ef8806f8a62b to your computer and use it in GitHub Desktop.
Clustering similar text using MinHash and LSH (locality-sensitive hashing).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Source: https://stackoverflow.com/questions/78186018/fuzzy-logic-to-match-the-records-in-a-dataframe/78192904#78192904
Here's another implementation that does the same thing, this time using MinHash and LSH (locality-sensitive hashing).
The following article explains the technique:
https://spotintelligence.com/2023/01/02/minhash/
First, install `datasketch` and `networkx`:
`pip install networkx`
`pip install datasketch`
import networkx as nx | |
from datasketch import MinHash, MinHashLSH | |
import pprint | |
import random | |
# Sample records: several near-identical spellings that should end up
# grouped together by the clustering code further down.
names = [
    "Arvind Kathmandu",
    "Arvind Kathmands",
    "Arbind Kathmandu",
    "Arvinds Kathmandu",
    "Arveen Kathmandu",
    "Arvins Kathmandu",
    "Arvind Kathmandu Nepal",
    "Abhishek Pokhara",
    "Abhisheks Pokhara",
    "Abhishek1 Pokhara",
    "Abhishek2 Pokhara",
    "Abhishek3 Pokhara",
]

# Shuffle in place so the result is exercised with an arbitrary input order
# (test aid only; the order changes on every run because no seed is set).
random.shuffle(names)
print(f"shuffled names list = {names}")
def get_shingles(name_arg, shingle_widths=(2, 3, 4)):
    """Return the set of lowercase character shingles of ``name_arg``.

    The input is lowercased, then every contiguous substring of each width
    in ``shingle_widths`` is collected. Duplicates collapse because the
    result is a set.

    Args:
        name_arg: The string to shingle.
        shingle_widths: Iterable of substring lengths to extract. Defaults
            to ``(2, 3, 4)``.

    Returns:
        A ``set`` of substrings. When the string is shorter than a given
        width, the single slice starting at index 0 is used instead, i.e.
        the whole (possibly empty) string.
    """
    name_internal = name_arg.lower()
    shingles = set()
    for width in shingle_widths:
        # Take at least one window so strings shorter than ``width`` still
        # contribute a shingle (the slice is simply truncated).
        window_count = max(len(name_internal) - width + 1, 1)
        shingles.update(
            name_internal[start:start + width] for start in range(window_count)
        )
    return shingles
# Number of hash permutations. Defined once because the MinHash objects and
# the LSH index that stores/queries them are both built with this value.
NUM_PERM = 128

# Build one MinHash signature per name from its character shingles.
signatures = {}
for name in names:
    m = MinHash(num_perm=NUM_PERM)
    for shingle in get_shingles(name):
        # MinHash.update is fed bytes, so encode each shingle first.
        m.update(shingle.encode('utf8'))
    signatures[name] = m

# Index every signature; per the datasketch docs, ``threshold`` is the
# Jaccard similarity threshold used to tune the index.
lsh = MinHashLSH(threshold=0.5, num_perm=NUM_PERM)
for name, minhash in signatures.items():
    lsh.insert(name, minhash)

# Link each name to every candidate the index returns for its signature.
G = nx.Graph()
for name, minhash in signatures.items():
    similar_names = lsh.query(minhash)
    for sim_name in similar_names:
        G.add_edge(name, sim_name)

# Each connected component of the similarity graph is one cluster of names.
clusters = list(nx.connected_components(G))

pp = pprint.PrettyPrinter(indent=2)
print("Final results:")
pp.pprint(clusters)
Output:
shuffled names list = ['Arvind Kathmands', 'Arvins Kathmandu', 'Abhishek3 Pokhara', 'Abhishek1 Pokhara', 'Abhishek2 Pokhara', 'Arvind Kathmandu', 'Arvind Kathmandu Nepal', 'Arbind Kathmandu', 'Abhisheks Pokhara', 'Arvinds Kathmandu', 'Arveen Kathmandu', 'Abhishek Pokhara']
Final results:
[ { 'Arbind Kathmandu',
    'Arveen Kathmandu',
    'Arvind Kathmands',
    'Arvind Kathmandu',
    'Arvind Kathmandu Nepal',
    'Arvinds Kathmandu',
    'Arvins Kathmandu'},
  { 'Abhishek Pokhara',
    'Abhishek1 Pokhara',
    'Abhishek2 Pokhara',
    'Abhishek3 Pokhara',
    'Abhisheks Pokhara'}]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment