Last active
August 30, 2021 20:18
-
-
Save sulaya86/729db71ec59b20201fc0b36b6a445f9b to your computer and use it in GitHub Desktop.
Remove extra blank spaces in a column and removes duplicated rows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Problem: Marketing Team requires to crosscheck the list of products available | |
# in all the websites of the Company | |
# Input: A Excel File that contains "Model" and "Item Path" columns | |
# Requirements: Models are duplicated, Remove the duplicated models and keep the Item Path | |
# The problem: The cells in the "Model" columns contains extra spaces which makes all the models be unique | |
# even if they are not | |
# This script will remove extra spaces, remove the cells duplicated by keeping the first item found | |
# author: Soraya Ruiz | |
# date of creation: 27-08-2021 | |
import pandas as pd | |
def read_file(path): | |
data = pd.read_excel(path) | |
return data | |
def remove_extra_spaces(data): | |
models = data[['Model', 'Item Path']] | |
df = pd.DataFrame(models) | |
#remove the extra spaces from head and tail of string | |
#https://www.geeksforgeeks.org/pandas-strip-whitespace-from-entire-dataframe/ | |
#https://www.datasciencemadesimple.com/strip-space-column-pandas-dataframe-leading-trailing-2/ | |
df['Model'] = df['Model'].str.strip() | |
return df | |
def remove_duplicate(data): | |
# Apply the removing only to the column "Model", applying to all | |
# the dataset will make all the rows unique since "Item Path" is different | |
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html | |
deduplicate = data.drop_duplicates(subset='Model', keep='first') | |
return deduplicate | |
if __name__ == '__main__': | |
# Get the data in a dataframe | |
dataset = read_file( | |
'C:\\tofilter.xlsx' | |
) | |
# Remove the extra spaces | |
product_list = remove_extra_spaces(dataset) | |
# Remove rows duplicated | |
new_product_list = remove_duplicate(product_list) | |
# Save the result in a new file | |
new_product_list.to_excel("filtered.xlsx") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment