ntfargo · February 16, 2025 16:31
diff --git a/codebase-dumper.py b/codebase-dumper.py
 #!/usr/bin/env python3
 """
 A folder scanning tool that writes a directory tree and the contents
 of each file (found recursively) into an output file. Only one command-line argument,
 the folder path, is required.

 The output file is automatically named after the folder with the suffix "-dump.txt".

 Options:
  --include-hidden   : Include hidden files/folders (those starting with a dot).
  --no-folder        : Do not dump the directory (folder) structure.
  --no-files         : Do not dump the file contents.
  --exclude PATTERNS : Comma-separated list of patterns to exclude (e.g., "license.md,build").
  --skip-large       : Skip dumping file content if the file has more than 1500 lines.

 Binary files (images, fonts, etc.) are automatically skipped.
 The first line of the dump will contain a summary line with an estimated token count,
 computed as approximately one token per 4 characters.
 """

 import sys
 from pathlib import Path
 import fnmatch


 def format_number(n: float) -> str:
    """
    Format a number into a compact string with a K (thousands) or M (millions) suffix.

    Values below 1000 are truncated to an integer string; larger values are shown
    with one decimal place. The suffix is chosen after rounding, so a value such as
    999_999 is rendered as "1.0M" instead of "1000.0K".
    """
    if n >= 1_000_000:
        return f"{n / 1_000_000:.1f}M"
    if n >= 1000:
        thousands = f"{n / 1000:.1f}"
        # Rounding to one decimal can carry up to 1000.0; promote that case to millions.
        if thousands == "1000.0":
            return f"{n / 1_000_000:.1f}M"
        return f"{thousands}K"
    return str(int(n))


 def is_excluded(path: str, exclusion_patterns: set) -> bool:
    """
    Tell whether 'path' is covered by any of the exclusion patterns.

    A path is excluded when the whole path matches a pattern under shell-style
    (fnmatch) rules, or when any single path component is exactly equal to one
    of the patterns.
    """
    if any(fnmatch.fnmatch(path, glob) for glob in exclusion_patterns):
        return True
    return any(component in exclusion_patterns for component in Path(path).parts)


 def generate_tree(current: Path, base: Path, include_hidden: bool, exclude_patterns: set, prefix: str = "") -> list:
    """
    Build the tree lines for everything below 'current', recursing into sub-directories.

    Entries whose name starts with a dot are dropped unless 'include_hidden' is True,
    and entries whose path relative to 'base' is excluded by 'exclude_patterns' are
    dropped as well. Directories are listed before files, each group sorted by
    lower-cased name. 'prefix' is the indentation already accumulated by the callers.
    """
    def is_visible(child: Path) -> bool:
        # Hidden check first so excluded-pattern matching only runs on candidates.
        if child.name.startswith('.') and not include_hidden:
            return False
        relative = str(child.relative_to(base)).replace("\\", "/")
        return not is_excluded(relative, exclude_patterns)

    children = sorted(
        (child for child in current.iterdir() if is_visible(child)),
        key=lambda p: (not p.is_dir(), p.name.lower()),
    )

    rendered = []
    last_position = len(children) - 1
    for position, child in enumerate(children):
        is_last = position == last_position
        branch = "└── " if is_last else "├── "
        if not child.is_dir():
            rendered.append(f"{prefix}{branch}{child.name}")
            continue
        rendered.append(f"{prefix}{branch}{child.name}/")
        # The last entry needs no vertical rail below it.
        continuation = "    " if is_last else "│   "
        rendered += generate_tree(child, base, include_hidden, exclude_patterns, prefix + continuation)
    return rendered


 def get_directory_tree(base: Path, include_hidden: bool, exclude_patterns: set) -> str:
    """
    Return a string representation of the directory tree for the base folder.

    The first line is "/ <folder name>", followed by one line per entry as produced
    by generate_tree(). The lines are joined with newlines.
    """
    # Path(".").name is an empty string; fall back to the resolved folder name so
    # the root label is not blank when the tool is run on the current directory.
    root_label = base.name or base.resolve().name
    tree_lines = [f"/ {root_label}"]
    tree_lines.extend(generate_tree(base, base, include_hidden, exclude_patterns))
    return "\n".join(tree_lines)


 def scan_folder(base: Path, output_file: Path, include_hidden: bool, dump_folder: bool,
                 dump_files: bool, exclude_patterns: set, skip_large: bool) -> None:
    """
    Build the dump for 'base' and write it to 'output_file'.

    The dump starts with a summary line holding an estimated token count (one token
    per 4 characters of the dump body), followed by the directory tree (when
    'dump_folder' is True) and the contents of every text file found recursively
    (when 'dump_files' is True).

    A file is left out of the contents section when any component of its relative
    path starts with a dot (unless 'include_hidden' is True), when it matches
    'exclude_patterns', when it has a known binary extension, when it cannot be
    read as UTF-8 text, or when it is the output file itself.
    If 'skip_large' is True, files with more than 1500 lines are listed with a
    placeholder instead of their content.
    """
    output_lines = []

    if dump_folder:
        output_lines.append("Directory Structure:")
        output_lines.append("-------------------")
        output_lines.append(get_directory_tree(base, include_hidden, exclude_patterns))
        output_lines.append("")

    if dump_files:
        output_lines.append("File Contents:")
        output_lines.append("--------------")
        skip_extensions = {".png", ".gif", ".jpg", ".jpeg", ".webp", ".svg",
                           ".ttf", ".otf", ".woff", ".woff2"}
        # Resolved once so a dump left over from a previous run inside 'base'
        # is never copied into the new dump.
        output_resolved = output_file.resolve()

        # Process files recursively, ordered by lower-cased relative path.
        for file_path in sorted(base.rglob("*"), key=lambda p: str(p.relative_to(base)).lower()):
            relative = file_path.relative_to(base)
            # Skip anything that is hidden or lives under a hidden directory.
            if not include_hidden and any(part.startswith('.') for part in relative.parts):
                continue
            rel_path = str(relative).replace("\\", "/")
            if is_excluded(rel_path, exclude_patterns):
                continue
            if not file_path.is_file():
                continue
            # Skip known binary file types.
            if file_path.suffix.lower() in skip_extensions:
                continue
            if file_path.resolve() == output_resolved:
                continue

            try:
                content = file_path.read_text(encoding="utf-8")
            except (UnicodeDecodeError, OSError):
                # Binary/non-UTF-8 or unreadable file: skip it entirely.
                continue

            output_lines.append(f"File: {rel_path}")
            output_lines.append("-" * 50)
            if skip_large and len(content.splitlines()) > 1500:
                output_lines.append("Skipped file content because it exceeds 1500 lines.")
            else:
                output_lines.append(content)
            output_lines.append("")

    final_output = "\n".join(output_lines)

    # The estimate covers the dump body only, not the summary line itself.
    estimated_tokens = len(final_output) / 4
    formatted_tokens = format_number(estimated_tokens)

    summary_line = f"Summary: Estimated tokens: {formatted_tokens}\n"
    final_output = summary_line + final_output

    output_file.write_text(final_output, encoding="utf-8")
    print(f"Scan complete. Results written to {output_file}")


 def main():
    """
    Parse the command line and run the scan.

    The first argument that is not an option is taken as the folder path.
    Recognised options are --include-hidden, --no-folder, --no-files, --skip-large
    and --exclude PATTERNS (comma-separated). The dump is written to
    "<folder name>-dump.txt" in the current working directory.

    Exits with status 1 when no folder is given, when --exclude has no value,
    when an unknown option is passed, or when the folder is not a directory.
    """
    usage = "Usage: python script.py <folder_path> [--include-hidden] [--no-folder] [--no-files] [--exclude PATTERNS] [--skip-large]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    include_hidden = False
    dump_folder = True
    dump_files = True
    skip_large = False
    exclude_patterns = set()
    folder_path = None

    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == "--include-hidden":
            include_hidden = True
        elif arg == "--no-folder":
            dump_folder = False
        elif arg == "--no-files":
            dump_files = False
        elif arg == "--skip-large":
            skip_large = True
        elif arg == "--exclude":
            if i + 1 >= len(sys.argv):
                print("Error: Missing argument for --exclude")
                sys.exit(1)
            for pat in sys.argv[i + 1].split(','):
                pat = pat.strip()
                if pat:
                    exclude_patterns.add(pat)
            # Consume the pattern list as well as the option itself.
            i += 1
        elif arg.startswith("--"):
            # A mistyped option must not be ignored or mistaken for the folder.
            print(f"Error: Unknown option '{arg}'\n{usage}")
            sys.exit(1)
        elif folder_path is None:
            folder_path = arg
        i += 1

    if folder_path is None:
        print(f"Error: No folder path provided.\n{usage}")
        sys.exit(1)

    base = Path(folder_path)
    if not base.is_dir():
        print(f"Error: '{folder_path}' is not a valid directory.")
        sys.exit(1)

    # Path(".").name is "" and Path("..").name is "..": resolve those so the dump
    # is named after the real folder instead of "-dump.txt" / "..-dump.txt".
    if base.name in ("", ".."):
        base = base.resolve()

    output_file = Path(f"{base.name}-dump.txt")
    scan_folder(base, output_file, include_hidden, dump_folder, dump_files, exclude_patterns, skip_large)


 # Run the command-line entry point only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	A folder scanning tool that writes a directory tree and the contents
	of each file (found recursively) into an output file. Only one command-line argument,
	the folder path, is required.

	The output file is automatically named after the folder with the suffix "-dump.txt".

	Options:
	--include-hidden : Include hidden files/folders (those starting with a dot).
	--no-folder : Do not dump the directory (folder) structure.
	--no-files : Do not dump the file contents.
	--exclude PATTERNS : Comma-separated list of patterns to exclude (e.g., "license.md,build").
	--skip-large : Skip dumping file content if the file has more than 1500 lines.

	Binary files (images, fonts, etc.) are automatically skipped.
	The first line of the dump will contain a summary line with an estimated token count,
	computed as approximately one token per 4 characters.
	"""

	import sys
	from pathlib import Path
	import fnmatch


	def format_number(n: float) -> str:
	    """
	    Format a number into a compact string with a K (thousands) or M (millions) suffix.

	    Values below 1000 are truncated to an integer string; larger values are shown
	    with one decimal place. The suffix is chosen after rounding, so a value such as
	    999_999 is rendered as "1.0M" instead of "1000.0K".
	    """
	    if n >= 1_000_000:
	        return f"{n / 1_000_000:.1f}M"
	    if n >= 1000:
	        thousands = f"{n / 1000:.1f}"
	        # Rounding to one decimal can carry up to 1000.0; promote that case to millions.
	        if thousands == "1000.0":
	            return f"{n / 1_000_000:.1f}M"
	        return f"{thousands}K"
	    return str(int(n))


	def is_excluded(path: str, exclusion_patterns: set) -> bool:
	    """
	    Check if a given path should be excluded based on a set of patterns.

	    The whole path is first tested against every pattern with shell-style
	    (fnmatch) matching; failing that, the path is excluded when any single
	    path component is exactly equal to one of the patterns.
	    """
	    if any(fnmatch.fnmatch(path, pattern) for pattern in exclusion_patterns):
	        return True
	    return any(part in exclusion_patterns for part in Path(path).parts)


	def generate_tree(current: Path, base: Path, include_hidden: bool, exclude_patterns: set, prefix: str = "") -> list:
	    """
	    Recursively generate lines for the directory tree.

	    Hidden entries (name starting with a dot) are skipped if include_hidden is False.
	    Files or directories whose path relative to 'base' matches any exclusion pattern
	    are omitted. Directories are listed before files, each group sorted by
	    lower-cased name. 'prefix' is the indentation accumulated by the callers.
	    """
	    lines = []
	    entries = []
	    for entry in current.iterdir():
	        if not include_hidden and entry.name.startswith('.'):
	            continue
	        rel_entry = str(entry.relative_to(base)).replace("\\", "/")
	        if is_excluded(rel_entry, exclude_patterns):
	            continue
	        entries.append(entry)
	    entries.sort(key=lambda p: (not p.is_dir(), p.name.lower()))

	    last_index = len(entries) - 1
	    for index, entry in enumerate(entries):
	        is_last = index == last_index
	        connector = "└── " if is_last else "├── "
	        if entry.is_dir():
	            lines.append(f"{prefix}{connector}{entry.name}/")
	            # Both continuations are four columns wide, matching the connector
	            # width, so nested entries line up under their parent.
	            new_prefix = prefix + ("    " if is_last else "│   ")
	            lines.extend(generate_tree(entry, base, include_hidden, exclude_patterns, new_prefix))
	        else:
	            lines.append(f"{prefix}{connector}{entry.name}")
	    return lines


	def get_directory_tree(base: Path, include_hidden: bool, exclude_patterns: set) -> str:
	    """
	    Return a string representation of the directory tree for the base folder.

	    The first line is "/ <folder name>", followed by one line per entry as produced
	    by generate_tree(). The lines are joined with newlines.
	    """
	    # Path(".").name is an empty string; fall back to the resolved folder name so
	    # the root label is not blank when the tool is run on the current directory.
	    root_label = base.name or base.resolve().name
	    tree_lines = [f"/ {root_label}"]
	    tree_lines.extend(generate_tree(base, base, include_hidden, exclude_patterns))
	    return "\n".join(tree_lines)


	def scan_folder(base: Path, output_file: Path, include_hidden: bool, dump_folder: bool,
	                dump_files: bool, exclude_patterns: set, skip_large: bool) -> None:
	    """
	    Build the dump for 'base' and write it to 'output_file'.

	    The dump starts with a summary line holding an estimated token count (one token
	    per 4 characters of the dump body), followed by the directory tree (when
	    'dump_folder' is True) and the contents of every text file found recursively
	    (when 'dump_files' is True).

	    A file is left out of the contents section when any component of its relative
	    path starts with a dot (unless 'include_hidden' is True), when it matches
	    'exclude_patterns', when it has a known binary extension, when it cannot be
	    read as UTF-8 text, or when it is the output file itself.
	    If 'skip_large' is True, files with more than 1500 lines are listed with a
	    placeholder instead of their content.
	    """
	    output_lines = []

	    if dump_folder:
	        output_lines.append("Directory Structure:")
	        output_lines.append("-------------------")
	        output_lines.append(get_directory_tree(base, include_hidden, exclude_patterns))
	        output_lines.append("")

	    if dump_files:
	        output_lines.append("File Contents:")
	        output_lines.append("--------------")
	        skip_extensions = {".png", ".gif", ".jpg", ".jpeg", ".webp", ".svg",
	                           ".ttf", ".otf", ".woff", ".woff2"}
	        # Resolved once so a dump left over from a previous run inside 'base'
	        # is never copied into the new dump.
	        output_resolved = output_file.resolve()

	        # Process files recursively, ordered by lower-cased relative path.
	        for file_path in sorted(base.rglob("*"), key=lambda p: str(p.relative_to(base)).lower()):
	            relative = file_path.relative_to(base)
	            # Skip anything that is hidden or lives under a hidden directory.
	            if not include_hidden and any(part.startswith('.') for part in relative.parts):
	                continue
	            rel_path = str(relative).replace("\\", "/")
	            if is_excluded(rel_path, exclude_patterns):
	                continue
	            if not file_path.is_file():
	                continue
	            # Skip known binary file types.
	            if file_path.suffix.lower() in skip_extensions:
	                continue
	            if file_path.resolve() == output_resolved:
	                continue

	            try:
	                content = file_path.read_text(encoding="utf-8")
	            except (UnicodeDecodeError, OSError):
	                # Binary/non-UTF-8 or unreadable file: skip it entirely.
	                continue

	            output_lines.append(f"File: {rel_path}")
	            output_lines.append("-" * 50)
	            if skip_large and len(content.splitlines()) > 1500:
	                output_lines.append("Skipped file content because it exceeds 1500 lines.")
	            else:
	                output_lines.append(content)
	            output_lines.append("")

	    final_output = "\n".join(output_lines)

	    # The estimate covers the dump body only, not the summary line itself.
	    estimated_tokens = len(final_output) / 4
	    formatted_tokens = format_number(estimated_tokens)

	    summary_line = f"Summary: Estimated tokens: {formatted_tokens}\n"
	    final_output = summary_line + final_output

	    output_file.write_text(final_output, encoding="utf-8")
	    print(f"Scan complete. Results written to {output_file}")


	def main():
	    """
	    Parse the command line and run the scan.

	    The first argument that is not an option is taken as the folder path.
	    Recognised options are --include-hidden, --no-folder, --no-files, --skip-large
	    and --exclude PATTERNS (comma-separated). The dump is written to
	    "<folder name>-dump.txt" in the current working directory.

	    Exits with status 1 when no folder is given, when --exclude has no value,
	    when an unknown option is passed, or when the folder is not a directory.
	    """
	    usage = "Usage: python script.py <folder_path> [--include-hidden] [--no-folder] [--no-files] [--exclude PATTERNS] [--skip-large]"
	    if len(sys.argv) < 2:
	        print(usage)
	        sys.exit(1)

	    include_hidden = False
	    dump_folder = True
	    dump_files = True
	    skip_large = False
	    exclude_patterns = set()
	    folder_path = None

	    i = 1
	    while i < len(sys.argv):
	        arg = sys.argv[i]
	        if arg == "--include-hidden":
	            include_hidden = True
	        elif arg == "--no-folder":
	            dump_folder = False
	        elif arg == "--no-files":
	            dump_files = False
	        elif arg == "--skip-large":
	            skip_large = True
	        elif arg == "--exclude":
	            if i + 1 >= len(sys.argv):
	                print("Error: Missing argument for --exclude")
	                sys.exit(1)
	            for pat in sys.argv[i + 1].split(','):
	                pat = pat.strip()
	                if pat:
	                    exclude_patterns.add(pat)
	            # Consume the pattern list as well as the option itself.
	            i += 1
	        elif arg.startswith("--"):
	            # A mistyped option must not be ignored or mistaken for the folder.
	            print(f"Error: Unknown option '{arg}'\n{usage}")
	            sys.exit(1)
	        elif folder_path is None:
	            folder_path = arg
	        i += 1

	    if folder_path is None:
	        print(f"Error: No folder path provided.\n{usage}")
	        sys.exit(1)

	    base = Path(folder_path)
	    if not base.is_dir():
	        print(f"Error: '{folder_path}' is not a valid directory.")
	        sys.exit(1)

	    # Path(".").name is "" and Path("..").name is "..": resolve those so the dump
	    # is named after the real folder instead of "-dump.txt" / "..-dump.txt".
	    if base.name in ("", ".."):
	        base = base.resolve()

	    output_file = Path(f"{base.name}-dump.txt")
	    scan_folder(base, output_file, include_hidden, dump_folder, dump_files, exclude_patterns, skip_large)


	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()