Created: April 8, 2025, 08:18
-
-
Save ehzawad/d88658b2a41966ceff03a970693df101 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Bengali Emotion Dataset Creator | |
This script creates a Hugging Face dataset from Bengali WAV files and their transcriptions. | |
It processes audio files, maps them with transcripts from a CSV file, and uploads | |
the dataset to the Hugging Face Hub. | |
Requirements: | |
- pandas | |
- datasets | |
- huggingface_hub | |
""" | |
import os | |
import pandas as pd | |
from datasets import Dataset, DatasetDict, Features, Audio, Value | |
import argparse | |
from datetime import datetime | |
def create_bengali_emotion_dataset(base_path, csv_filename, wav_folder_name,
                                   output_name="bengali-emotion-dataset",
                                   upload=True, hf_username="ehzawad"):
    """
    Create a Bengali emotion dataset and optionally upload it to Hugging Face Hub.

    Reads a transcript CSV (columns: ``file_name``, ``file_transcript``),
    pairs each row with ``<wav_folder>/<file_name>.wav``, builds a
    ``DatasetDict`` with a single "train" split, saves it to disk, and
    optionally pushes it to the Hub.

    Args:
        base_path (str): Base directory containing files
        csv_filename (str): Name of the CSV file with transcripts
        wav_folder_name (str): Name of the folder containing WAV files
        output_name (str): Name for the output dataset
        upload (bool): Whether to upload the dataset to HF Hub
        hf_username (str): HF username for upload

    Returns:
        datasets.DatasetDict: The created dataset

    Raises:
        FileNotFoundError: If the CSV file or WAV folder does not exist.
        ValueError: If the CSV lacks the required columns, or no audio
            file matched any CSV row.
    """
    # Construct full paths
    csv_path = os.path.join(base_path, csv_filename)
    wav_folder = os.path.join(base_path, wav_folder_name)

    # Validate paths before doing any work
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
    if not os.path.exists(wav_folder):
        raise FileNotFoundError(f"WAV folder not found: {wav_folder}")

    print(f"Loading transcripts from: {csv_path}")
    print(f"Loading audio files from: {wav_folder}")

    # Read the CSV file
    df = pd.read_csv(csv_path)
    print(f"Found {len(df)} entries in the CSV file")

    # Fail fast on a malformed CSV instead of logging a KeyError for every
    # row and then reporting the misleading "No matching audio files found!".
    required_columns = {"file_name", "file_transcript"}
    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        raise ValueError(
            f"CSV is missing required columns: {sorted(missing_columns)}"
        )

    # Accumulators for the dataset entries
    texts = []
    audio_paths = []
    found_count = 0
    missing_count = 0

    # Timestamped error log so repeated runs don't clobber each other.
    # Explicit UTF-8: logged rows contain Bengali text, which a
    # platform-default codec may not be able to encode.
    log_file = os.path.join(
        base_path,
        f"dataset_creation_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
    )
    with open(log_file, 'w', encoding='utf-8') as log:
        log.write(f"Dataset creation started at {datetime.now()}\n")
        log.write(f"CSV: {csv_path}\n")
        log.write(f"WAV folder: {wav_folder}\n\n")

        for index, row in df.iterrows():
            # Progress heartbeat on every 50th row, whether or not the
            # file matches (previously only printed on the found branch,
            # so progress output stopped when files were missing).
            if index % 50 == 0:
                print(f"Processed {index}/{len(df)} files...")
            try:
                # Get file name and transcript
                file_name = str(row['file_name']).strip()
                transcript = str(row['file_transcript']).strip()
                # Construct the path to the WAV file
                wav_path = os.path.join(wav_folder, f"{file_name}.wav")
                if os.path.exists(wav_path):
                    texts.append(transcript)
                    audio_paths.append(wav_path)
                    found_count += 1
                else:
                    error_msg = f"WARNING: WAV file not found for {file_name}"
                    print(error_msg)
                    log.write(f"{error_msg}\n")
                    missing_count += 1
            except Exception as e:
                # Best-effort per row: record the failure with the row's
                # data for later inspection and keep processing.
                error_msg = f"ERROR processing row {index}: {e}"
                print(error_msg)
                log.write(f"{error_msg}\n")
                log.write(f"Row data: {row}\n")
                missing_count += 1

        summary = f"\nFound and processed {found_count} matching audio files\n"
        summary += f"Missing or error in {missing_count} audio files\n"
        print(summary)
        log.write(summary)

    if found_count == 0:
        raise ValueError("No matching audio files found!")

    # Declaring "audio" as an Audio feature makes the datasets library
    # treat the stored paths as decodable audio rather than plain strings.
    data = {
        "text": texts,
        "audio": audio_paths
    }
    features = Features({
        "text": Value("string"),
        "audio": Audio()
    })
    print("Creating dataset with proper audio features...")
    dataset = Dataset.from_dict(data, features=features)
    # Single "train" split, per Hub convention
    dataset_dict = DatasetDict({"train": dataset})

    # Save locally
    local_output_path = os.path.join(base_path, output_name)
    print(f"Saving dataset to: {local_output_path}")
    dataset_dict.save_to_disk(local_output_path)
    print("\nDataset created successfully!")
    print(f"Number of samples: {len(dataset_dict['train'])}")
    print(f"Local dataset saved at: {local_output_path}")

    # Upload if requested; failure here is deliberately non-fatal because
    # the dataset already exists on disk and can be pushed manually later.
    if upload:
        try:
            print(f"Uploading to Hugging Face Hub as {hf_username}/{output_name}...")
            dataset_dict.push_to_hub(
                f"{hf_username}/{output_name}",
                private=False
            )
            print("Dataset uploaded successfully!")
            print(f"https://huggingface.co/datasets/{hf_username}/{output_name}")
        except Exception as e:
            print(f"Error during upload: {e}")
            print("The dataset was created locally but could not be uploaded.")
            print("You can try uploading it manually later.")

    return dataset_dict
if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Create a Bengali emotion dataset and upload it to Hugging Face Hub")
    parser.add_argument("--base-path", default="/home/ehz/hface", help="Base directory containing files")
    parser.add_argument("--csv", default="mou_emotion_transcript.csv", help="Name of the CSV file")
    parser.add_argument("--wav-folder", default="mou_emotion_wavs", help="Name of the folder containing WAV files")
    parser.add_argument("--output", default="bengali-emotion-dataset", help="Name for the output dataset")
    parser.add_argument("--username", default="ehzawad", help="Hugging Face username")
    parser.add_argument("--no-upload", action="store_true", help="Don't upload to HF Hub, just create locally")
    args = parser.parse_args()
    try:
        # Create and optionally upload the dataset
        dataset = create_bengali_emotion_dataset(
            base_path=args.base_path,
            csv_filename=args.csv,
            wav_folder_name=args.wav_folder,
            output_name=args.output,
            upload=not args.no_upload,
            hf_username=args.username
        )
        # Print usage instructions that match how the dataset was produced.
        # Bug fix: the local-only path previously printed the load_dataset
        # import while instructing the user to call load_from_disk.
        print("\nTo use this dataset:")
        if not args.no_upload:
            print("from datasets import load_dataset")
            print(f"dataset = load_dataset(\"{args.username}/{args.output}\")")
        else:
            print("from datasets import load_from_disk")
            print(f"dataset = load_from_disk(\"{os.path.join(args.base_path, args.output)}\")")
        print("train_data = dataset['train']")
    except Exception as e:
        # Top-level boundary: report and exit rather than dumping a traceback.
        print(f"Error: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.