Created: April 8, 2025, 08:18
-
-
Save ehzawad/d88658b2a41966ceff03a970693df101 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Bengali Emotion Dataset Creator | |
This script creates a Hugging Face dataset from Bengali WAV files and their transcriptions. | |
It processes audio files, maps them with transcripts from a CSV file, and uploads | |
the dataset to the Hugging Face Hub. | |
Requirements: | |
- pandas | |
- datasets | |
- huggingface_hub | |
""" | |
import os | |
import pandas as pd | |
from datasets import Dataset, DatasetDict, Features, Audio, Value | |
import argparse | |
from datetime import datetime | |
def create_bengali_emotion_dataset(base_path, csv_filename, wav_folder_name,
                                   output_name="bengali-emotion-dataset",
                                   upload=True, hf_username="ehzawad"):
    """
    Create a Bengali emotion dataset and optionally upload it to Hugging Face Hub.

    Reads a transcript CSV (columns: ``file_name``, ``file_transcript``),
    pairs each row with ``<wav_folder>/<file_name>.wav``, builds a
    ``DatasetDict`` with a single "train" split, saves it to disk, and
    optionally pushes it to the Hub.

    Args:
        base_path (str): Base directory containing files
        csv_filename (str): Name of the CSV file with transcripts
        wav_folder_name (str): Name of the folder containing WAV files
        output_name (str): Name for the output dataset
        upload (bool): Whether to upload the dataset to HF Hub
        hf_username (str): HF username for upload

    Returns:
        datasets.DatasetDict: The created dataset

    Raises:
        FileNotFoundError: If the CSV file or WAV folder does not exist.
        ValueError: If the CSV lacks the required columns, or no audio
            file matched any CSV row.
    """
    # Construct full paths
    csv_path = os.path.join(base_path, csv_filename)
    wav_folder = os.path.join(base_path, wav_folder_name)

    # Validate paths before doing any work
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
    if not os.path.exists(wav_folder):
        raise FileNotFoundError(f"WAV folder not found: {wav_folder}")

    print(f"Loading transcripts from: {csv_path}")
    print(f"Loading audio files from: {wav_folder}")

    # Read the CSV file
    df = pd.read_csv(csv_path)
    print(f"Found {len(df)} entries in the CSV file")

    # Fail fast on a malformed CSV instead of logging a KeyError for every
    # row and then reporting the misleading "No matching audio files found!".
    required_columns = {"file_name", "file_transcript"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"CSV is missing required columns: {sorted(missing_columns)}"
        )

    # Accumulators for the dataset entries
    texts = []
    audio_paths = []
    found_count = 0
    missing_count = 0

    # Timestamped error log so repeated runs don't clobber each other.
    # Explicit UTF-8: logged rows contain Bengali text, which a
    # platform-default codec may not be able to encode.
    log_file = os.path.join(
        base_path,
        f"dataset_creation_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
    )
    with open(log_file, 'w', encoding='utf-8') as log:
        log.write(f"Dataset creation started at {datetime.now()}\n")
        log.write(f"CSV: {csv_path}\n")
        log.write(f"WAV folder: {wav_folder}\n\n")

        for index, row in df.iterrows():
            # Progress heartbeat on every 50th row, whether or not the
            # file matches (previously only printed on the found branch,
            # so progress output stopped when files were missing).
            if index % 50 == 0:
                print(f"Processed {index}/{len(df)} files...")
            try:
                # Get file name and transcript
                file_name = str(row['file_name']).strip()
                transcript = str(row['file_transcript']).strip()
                # Construct the path to the WAV file
                wav_path = os.path.join(wav_folder, f"{file_name}.wav")
                if os.path.exists(wav_path):
                    texts.append(transcript)
                    audio_paths.append(wav_path)
                    found_count += 1
                else:
                    error_msg = f"WARNING: WAV file not found for {file_name}"
                    print(error_msg)
                    log.write(f"{error_msg}\n")
                    missing_count += 1
            except Exception as e:
                # Best-effort per row: record the failure with the row's
                # data for later inspection and keep processing.
                error_msg = f"ERROR processing row {index}: {e}"
                print(error_msg)
                log.write(f"{error_msg}\n")
                log.write(f"Row data: {row}\n")
                missing_count += 1

        summary = f"\nFound and processed {found_count} matching audio files\n"
        summary += f"Missing or error in {missing_count} audio files\n"
        print(summary)
        log.write(summary)

    if found_count == 0:
        raise ValueError("No matching audio files found!")

    # Declaring "audio" as an Audio feature makes the datasets library
    # treat the stored paths as decodable audio rather than plain strings.
    data = {
        "text": texts,
        "audio": audio_paths
    }
    features = Features({
        "text": Value("string"),
        "audio": Audio()
    })
    print("Creating dataset with proper audio features...")
    dataset = Dataset.from_dict(data, features=features)
    # Single "train" split, per Hub convention
    dataset_dict = DatasetDict({"train": dataset})

    # Save locally
    local_output_path = os.path.join(base_path, output_name)
    print(f"Saving dataset to: {local_output_path}")
    dataset_dict.save_to_disk(local_output_path)
    print("\nDataset created successfully!")
    print(f"Number of samples: {len(dataset_dict['train'])}")
    print(f"Local dataset saved at: {local_output_path}")

    # Upload if requested; failure here is deliberately non-fatal because
    # the dataset already exists on disk and can be pushed manually later.
    if upload:
        try:
            print(f"Uploading to Hugging Face Hub as {hf_username}/{output_name}...")
            dataset_dict.push_to_hub(
                f"{hf_username}/{output_name}",
                private=False
            )
            print("Dataset uploaded successfully!")
            print(f"https://huggingface.co/datasets/{hf_username}/{output_name}")
        except Exception as e:
            print(f"Error during upload: {e}")
            print("The dataset was created locally but could not be uploaded.")
            print("You can try uploading it manually later.")

    return dataset_dict
if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Create a Bengali emotion dataset and upload it to Hugging Face Hub")
    parser.add_argument("--base-path", default="/home/ehz/hface", help="Base directory containing files")
    parser.add_argument("--csv", default="mou_emotion_transcript.csv", help="Name of the CSV file")
    parser.add_argument("--wav-folder", default="mou_emotion_wavs", help="Name of the folder containing WAV files")
    parser.add_argument("--output", default="bengali-emotion-dataset", help="Name for the output dataset")
    parser.add_argument("--username", default="ehzawad", help="Hugging Face username")
    parser.add_argument("--no-upload", action="store_true", help="Don't upload to HF Hub, just create locally")
    args = parser.parse_args()
    try:
        # Create and optionally upload the dataset
        dataset = create_bengali_emotion_dataset(
            base_path=args.base_path,
            csv_filename=args.csv,
            wav_folder_name=args.wav_folder,
            output_name=args.output,
            upload=not args.no_upload,
            hf_username=args.username
        )
        # Print usage instructions that match how the dataset was produced.
        # Bug fix: the local-only path previously printed the load_dataset
        # import while instructing the user to call load_from_disk.
        print("\nTo use this dataset:")
        if not args.no_upload:
            print("from datasets import load_dataset")
            print(f"dataset = load_dataset(\"{args.username}/{args.output}\")")
        else:
            print("from datasets import load_from_disk")
            print(f"dataset = load_from_disk(\"{os.path.join(args.base_path, args.output)}\")")
        print("train_data = dataset['train']")
    except Exception as e:
        # Top-level boundary: report and exit rather than dumping a traceback.
        print(f"Error: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.