dineshdharme · April 9, 2024 16:21
diff --git a/MaximalBipartiteMatchingGraphProblem.py b/MaximalBipartiteMatchingGraphProblem.py
 https://stackoverflow.com/questions/78294920/select-unique-pairs-from-pyspark-dataframe

 As @ Abdennacer Lachiheb  mentioned in the comment, this is indeed a bipartite matching algorithm. Unlikely to get solved correctly in pyspark or using graphframes. The best would to solve it using a graph algorithm library's `hopcroft_karp_matching` like `NetworkX`. Or use `scipy.optimize.linear_sum_assignment`

 `hopcroft_karp_matching` : pure python code, runs in O(E√V) time, where E is the number of edges and V is the number of vertices in the graph.

 `scipy.optimize.linear_sum_assignment` : O(n^3) complexity but written in c++.

 So only practical testing on the data can determine which works better on your data sizes.  

 Read more for discussion here :

 https://stackoverflow.com/a/49552057/3238085

 https://stackoverflow.com/a/4436206/3238085

 NetworkX solve this bipartite **maximal** matching problem using *Hopcroft-Karp algorithm.*

 Here's are two such solutions :


    import networkx as nx
    from networkx.algorithms import bipartite
    
    
    def first_method():
        data = [(1, 'a'), (1, 'b'), (1, 'c'), (2, 'b'), (2, 'a'), (2, 'c'),
                (3, 'a'), (3, 'c'), (3, 'b'), (4, 'p')]
    
        B = nx.Graph()
    
        B.add_nodes_from(set(x for x, y in data), bipartite=0)  # Set X
        B.add_nodes_from(set(y for x, y in data), bipartite=1)  # Set Y
    
        B.add_edges_from(data)
    
        top_nodes = set(x for x, y in data)
    
        matching = nx.algorithms.bipartite.matching.hopcroft_karp_matching(B, top_nodes=top_nodes)
    
        filtered_matching = {x: y for x, y in matching.items() if x in top_nodes}
    
        print("Matching pairs:")
        for x, y in filtered_matching.items():
            print(f"{x} - {y}")
    
    
    def second_method():
        import numpy as np
        from scipy.optimize import linear_sum_assignment
    
    
        data = [(1, 'a'), (1, 'b'), (1, 'c'), (2, 'b'), (2, 'a'), (2, 'c'),
                (3, 'a'), (3, 'c'), (3, 'b'), (4, 'p')]
    
        X, Y = zip(*data)
        X_unique = sorted(set(X))
        Y_unique = sorted(set(Y))
    
        cost_matrix = np.full((len(X_unique), len(Y_unique)), fill_value=np.inf)
    
        for x, y in data:
            i = X_unique.index(x)  # Row index
            j = Y_unique.index(y)  # Column index
            cost_matrix[i, j] = 1  # Set cost to 1 for existing connections
    
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
    
        print("Matching pairs:")
        for i, j in zip(row_ind, col_ind):
            print(f"{X_unique[i]} - {Y_unique[j]}")
    
    
    first_method()
    second_method()

 Output : 

    Matching pairs:
    1 - a
    2 - b
    3 - c
    4 - p
    
    Matching pairs:
    1 - a
    2 - b
    3 - c
    4 - p
	https://stackoverflow.com/questions/78294920/select-unique-pairs-from-pyspark-dataframe

	As @ Abdennacer Lachiheb mentioned in the comment, this is indeed a bipartite matching algorithm. Unlikely to get solved correctly in pyspark or using graphframes. The best would to solve it using a graph algorithm library's `hopcroft_karp_matching` like `NetworkX`. Or use `scipy.optimize.linear_sum_assignment`

	`hopcroft_karp_matching` : pure python code, runs in O(E√V) time, where E is the number of edges and V is the number of vertices in the graph.

	`scipy.optimize.linear_sum_assignment` : O(n^3) complexity but written in c++.

	So only practical testing on the data can determine which works better on your data sizes.

	Read more for discussion here :

	https://stackoverflow.com/a/49552057/3238085

	https://stackoverflow.com/a/4436206/3238085

	NetworkX solve this bipartite maximal matching problem using Hopcroft-Karp algorithm.

	Here's are two such solutions :


	import networkx as nx
	from networkx.algorithms import bipartite


	def first_method():
	data = [(1, 'a'), (1, 'b'), (1, 'c'), (2, 'b'), (2, 'a'), (2, 'c'),
	(3, 'a'), (3, 'c'), (3, 'b'), (4, 'p')]

	B = nx.Graph()

	B.add_nodes_from(set(x for x, y in data), bipartite=0) # Set X
	B.add_nodes_from(set(y for x, y in data), bipartite=1) # Set Y

	B.add_edges_from(data)

	top_nodes = set(x for x, y in data)

	matching = nx.algorithms.bipartite.matching.hopcroft_karp_matching(B, top_nodes=top_nodes)

	filtered_matching = {x: y for x, y in matching.items() if x in top_nodes}

	print("Matching pairs:")
	for x, y in filtered_matching.items():
	print(f"{x} - {y}")


	def second_method():
	import numpy as np
	from scipy.optimize import linear_sum_assignment


	data = [(1, 'a'), (1, 'b'), (1, 'c'), (2, 'b'), (2, 'a'), (2, 'c'),
	(3, 'a'), (3, 'c'), (3, 'b'), (4, 'p')]

	X, Y = zip(*data)
	X_unique = sorted(set(X))
	Y_unique = sorted(set(Y))

	cost_matrix = np.full((len(X_unique), len(Y_unique)), fill_value=np.inf)

	for x, y in data:
	i = X_unique.index(x) # Row index
	j = Y_unique.index(y) # Column index
	cost_matrix[i, j] = 1 # Set cost to 1 for existing connections

	row_ind, col_ind = linear_sum_assignment(cost_matrix)

	print("Matching pairs:")
	for i, j in zip(row_ind, col_ind):
	print(f"{X_unique[i]} - {Y_unique[j]}")


	first_method()
	second_method()

	Output :

	Matching pairs:
	1 - a
	2 - b
	3 - c
	4 - p

	Matching pairs:
	1 - a
	2 - b
	3 - c
	4 - p