sulaya86 · August 30, 2021 20:18
diff --git a/deduplicate_products.py b/deduplicate_products.py
 # The Problem: Marketing Team requires to crosscheck the list of products available
 # in all the websites of the Company
 # Input: A Excel File that contains "Model" and "Item Path" columns
 # Requirements: Models are duplicated, Remove the duplicated models and keep the Item Path
 # The problem: The cells in the "Model" columns contains extra spaces which makes all the models be unique
 # even if they are not
 # This script will remove extra spaces, remove the cells duplicated by keeping the first item found
 # author: Soraya Ruiz
 # date of creation: 27-08-2021

 import pandas as pd


 def read_file(path):
    data = pd.read_excel(path)
    return data


 def remove_extra_spaces(data):
    models = data[['Model', 'Item Path']]
    df = pd.DataFrame(models)
    #remove the extra spaces from head and tail of string
    #https://www.geeksforgeeks.org/pandas-strip-whitespace-from-entire-dataframe/
    #https://www.datasciencemadesimple.com/strip-space-column-pandas-dataframe-leading-trailing-2/
    df['Model'] = df['Model'].str.strip()
    return df


 def remove_duplicate(data):
    # Apply the removing only to the column "Model", applying to all
    # the dataset will make all the rows unique since "Item Path" is different
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
    deduplicate = data.drop_duplicates(subset='Model', keep='first')
    return deduplicate


 if __name__ == '__main__':
    # Get the data in a dataframe
    dataset = read_file(
        'C:\\tofilter.xlsx'
    )
    # Remove the extra spaces
    product_list = remove_extra_spaces(dataset)
    # Remove rows duplicated
    new_product_list = remove_duplicate(product_list)
    # Save the result in a new file
    new_product_list.to_excel("filtered.xlsx")
	# The Problem: Marketing Team requires to crosscheck the list of products available
	# in all the websites of the Company
	# Input: A Excel File that contains "Model" and "Item Path" columns
	# Requirements: Models are duplicated, Remove the duplicated models and keep the Item Path
	# The problem: The cells in the "Model" columns contains extra spaces which makes all the models be unique
	# even if they are not
	# This script will remove extra spaces, remove the cells duplicated by keeping the first item found
	# author: Soraya Ruiz
	# date of creation: 27-08-2021

	import pandas as pd


	def read_file(path):
	data = pd.read_excel(path)
	return data


	def remove_extra_spaces(data):
	models = data[['Model', 'Item Path']]
	df = pd.DataFrame(models)
	#remove the extra spaces from head and tail of string
	#https://www.geeksforgeeks.org/pandas-strip-whitespace-from-entire-dataframe/
	#https://www.datasciencemadesimple.com/strip-space-column-pandas-dataframe-leading-trailing-2/
	df['Model'] = df['Model'].str.strip()
	return df


	def remove_duplicate(data):
	# Apply the removing only to the column "Model", applying to all
	# the dataset will make all the rows unique since "Item Path" is different
	# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
	deduplicate = data.drop_duplicates(subset='Model', keep='first')
	return deduplicate


	if __name__ == '__main__':
	# Get the data in a dataframe
	dataset = read_file(
	'C:\\tofilter.xlsx'
	)
	# Remove the extra spaces
	product_list = remove_extra_spaces(dataset)
	# Remove rows duplicated
	new_product_list = remove_duplicate(product_list)
	# Save the result in a new file
	new_product_list.to_excel("filtered.xlsx")