Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Created April 8, 2025 08:18
Show Gist options
  • Save ehzawad/d88658b2a41966ceff03a970693df101 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Bengali Emotion Dataset Creator
This script creates a Hugging Face dataset from Bengali WAV files and their transcriptions.
It processes audio files, maps them with transcripts from a CSV file, and uploads
the dataset to the Hugging Face Hub.
Requirements:
- pandas
- datasets
- huggingface_hub
"""
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Audio, Value
import argparse
from datetime import datetime
def create_bengali_emotion_dataset(base_path, csv_filename, wav_folder_name,
                                   output_name="bengali-emotion-dataset",
                                   upload=True, hf_username="ehzawad"):
    """
    Create a Bengali emotion dataset and optionally upload it to Hugging Face Hub.

    Pairs each row of the transcript CSV with its WAV file, builds a
    ``datasets.DatasetDict`` with a single ``train`` split, saves it to disk
    under ``base_path``, and (optionally) pushes it to the Hub.

    Args:
        base_path (str): Base directory containing files.
        csv_filename (str): Name of the CSV file with transcripts. Must have
            ``file_name`` and ``file_transcript`` columns.
        wav_folder_name (str): Name of the folder containing WAV files.
        output_name (str): Name for the output dataset.
        upload (bool): Whether to upload the dataset to HF Hub.
        hf_username (str): HF username for upload.

    Returns:
        datasets.DatasetDict: The created dataset (one ``train`` split with
        ``text`` and ``audio`` columns).

    Raises:
        FileNotFoundError: If the CSV file or the WAV folder does not exist.
        ValueError: If no CSV row matched an existing WAV file.
    """
    # Construct full paths
    csv_path = os.path.join(base_path, csv_filename)
    wav_folder = os.path.join(base_path, wav_folder_name)

    # Validate paths up front so we fail fast with a clear message
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
    if not os.path.exists(wav_folder):
        raise FileNotFoundError(f"WAV folder not found: {wav_folder}")

    print(f"Loading transcripts from: {csv_path}")
    print(f"Loading audio files from: {wav_folder}")

    # Read the CSV file
    df = pd.read_csv(csv_path)
    print(f"Found {len(df)} entries in the CSV file")

    # Parallel lists: texts[i] is the transcript for audio_paths[i]
    texts = []
    audio_paths = []

    found_count = 0
    missing_count = 0

    # Timestamped log file so repeated runs never clobber each other
    log_file = os.path.join(base_path, f"dataset_creation_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")
    with open(log_file, 'w') as log:
        log.write(f"Dataset creation started at {datetime.now()}\n")
        log.write(f"CSV: {csv_path}\n")
        log.write(f"WAV folder: {wav_folder}\n\n")

        for index, row in df.iterrows():
            try:
                # Get file name and transcript.
                # NOTE(review): a NaN cell becomes the literal string "nan"
                # here — confirm the CSV has no empty transcript cells.
                file_name = str(row['file_name']).strip()
                transcript = str(row['file_transcript']).strip()

                # CSV stores the bare stem; the audio files carry a .wav suffix
                wav_path = os.path.join(wav_folder, f"{file_name}.wav")

                if os.path.exists(wav_path):
                    texts.append(transcript)
                    audio_paths.append(wav_path)
                    found_count += 1
                    # Lightweight progress indicator every 50 rows
                    if index % 50 == 0:
                        print(f"Processed {index}/{len(df)} files...")
                else:
                    error_msg = f"WARNING: WAV file not found for {file_name}"
                    print(error_msg)
                    log.write(f"{error_msg}\n")
                    missing_count += 1
            except Exception as e:
                # Deliberate best-effort: one bad row is logged and skipped,
                # not allowed to abort the whole dataset build.
                error_msg = f"ERROR processing row {index}: {e}"
                print(error_msg)
                log.write(f"{error_msg}\n")
                log.write(f"Row data: {row}\n")
                missing_count += 1

        summary = f"\nFound and processed {found_count} matching audio files\n"
        summary += f"Missing or error in {missing_count} audio files\n"
        print(summary)
        log.write(summary)

    if found_count == 0:
        raise ValueError("No matching audio files found!")

    # Create data dictionary with proper Audio feature
    data = {
        "text": texts,
        "audio": audio_paths
    }

    # The Audio() feature makes `datasets` decode the WAV paths lazily
    features = Features({
        "text": Value("string"),
        "audio": Audio()
    })

    print("Creating dataset with proper audio features...")
    dataset = Dataset.from_dict(data, features=features)

    # Create a DatasetDict with a train split
    dataset_dict = DatasetDict({"train": dataset})

    # Save locally before any upload attempt so work is never lost
    local_output_path = os.path.join(base_path, output_name)
    print(f"Saving dataset to: {local_output_path}")
    dataset_dict.save_to_disk(local_output_path)

    print("\nDataset created successfully!")
    print(f"Number of samples: {len(dataset_dict['train'])}")
    print(f"Local dataset saved at: {local_output_path}")

    # Upload if requested; upload failure is non-fatal since the local copy exists
    if upload:
        try:
            print(f"Uploading to Hugging Face Hub as {hf_username}/{output_name}...")
            dataset_dict.push_to_hub(
                f"{hf_username}/{output_name}",
                private=False
            )
            print("Dataset uploaded successfully!")
            print(f"https://huggingface.co/datasets/{hf_username}/{output_name}")
        except Exception as e:
            print(f"Error during upload: {e}")
            print("The dataset was created locally but could not be uploaded.")
            print("You can try uploading it manually later.")

    return dataset_dict
if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Create a Bengali emotion dataset and upload it to Hugging Face Hub")
    parser.add_argument("--base-path", default="/home/ehz/hface", help="Base directory containing files")
    parser.add_argument("--csv", default="mou_emotion_transcript.csv", help="Name of the CSV file")
    parser.add_argument("--wav-folder", default="mou_emotion_wavs", help="Name of the folder containing WAV files")
    parser.add_argument("--output", default="bengali-emotion-dataset", help="Name for the output dataset")
    parser.add_argument("--username", default="ehzawad", help="Hugging Face username")
    parser.add_argument("--no-upload", action="store_true", help="Don't upload to HF Hub, just create locally")
    args = parser.parse_args()

    try:
        # Create and optionally upload the dataset
        dataset = create_bengali_emotion_dataset(
            base_path=args.base_path,
            csv_filename=args.csv,
            wav_folder_name=args.wav_folder,
            output_name=args.output,
            upload=not args.no_upload,
            hf_username=args.username
        )

        # Print usage instructions matching how the dataset was actually
        # published: load_dataset for the Hub, load_from_disk for a local copy.
        print("\nTo use this dataset:")
        if not args.no_upload:
            print("from datasets import load_dataset")
            print(f"dataset = load_dataset(\"{args.username}/{args.output}\")")
        else:
            # Bug fix: the local-copy path needs load_from_disk, not load_dataset
            print("from datasets import load_from_disk")
            print(f"dataset = load_from_disk(\"{os.path.join(args.base_path, args.output)}\")")
        print("train_data = dataset['train']")
    except Exception as e:
        # Top-level boundary: report and exit cleanly rather than traceback-spam
        print(f"Error: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment