Last active
February 16, 2025 16:31
-
-
Save ntfargo/9db84526d89cb7221cb87abcc7735e27 to your computer and use it in GitHub Desktop.
A dumper script that recursively dumps a folder's structure and file contents into a single output file—with an estimated token count summary—perfect for preparing data for LLMs and other text-processing tasks.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""
A folder scanning tool that writes a directory tree and the contents
of each file (found recursively) into an output file. Only one command-line
argument, the folder path, is required.

The output file is automatically named after the folder with the suffix "-dump.txt".

Options:
    --include-hidden    : Include hidden files/folders (those starting with a dot).
    --no-folder         : Do not dump the directory (folder) structure.
    --no-files          : Do not dump the file contents.
    --exclude PATTERNS  : Comma-separated list of patterns to exclude (e.g., "license.md,build").
    --skip-large        : Skip dumping file content if the file has more than 1500 lines.

Binary files (images, fonts, etc.) are automatically skipped.

The first line of the dump contains a summary with an estimated token count,
computed as approximately one token per 4 characters.
"""
import sys | |
from pathlib import Path | |
import fnmatch | |
def format_number(n: float) -> str:
    """
    Format a number compactly with a K (thousands) or M (millions) suffix.

    Values below 1000 are truncated to an integer and returned without a
    suffix; larger values are shown with one decimal place.

    Args:
        n: The number to format.

    Returns:
        The formatted string, e.g. "999", "1.5K" or "2.3M".
    """
    if n >= 1_000_000:
        return f"{n / 1_000_000:.1f}M"
    if n >= 1000:
        thousands = f"{n / 1000:.1f}"
        # Values just below one million round up to "1000.0" at one decimal
        # place; promote them to millions instead of printing "1000.0K".
        if float(thousands) >= 1000:
            return f"{n / 1_000_000:.1f}M"
        return f"{thousands}K"
    return str(int(n))
def is_excluded(path: str, exclusion_patterns: set) -> bool:
    """
    Check whether a relative path should be excluded.

    A path is excluded when any of the following holds:
      * one of its components is exactly equal to an exclusion pattern;
      * the path itself matches a pattern (shell-style, via fnmatch);
      * one of its ancestor directories matches a pattern, so that everything
        below an excluded directory is excluded as well.

    Args:
        path: The path to test, using "/" as separator.
        exclusion_patterns: Set of literal names and/or fnmatch patterns.

    Returns:
        True if the path should be skipped, False otherwise.
    """
    if not exclusion_patterns:
        return False

    parts = Path(path).parts
    if any(part in exclusion_patterns for part in parts):
        return True

    # Test the path as given plus every ancestor directory ("a", "a/b", ...).
    # Without the ancestors, a pattern such as "src/gen" or "*-old" would
    # match the directory itself but not the files inside it.
    candidates = [path]
    candidates.extend("/".join(parts[:depth]) for depth in range(1, len(parts)))

    return any(
        fnmatch.fnmatch(candidate, pattern)
        for candidate in candidates
        for pattern in exclusion_patterns
    )
def generate_tree(current: Path, base: Path, include_hidden: bool, exclude_patterns: set, prefix: str = "") -> list:
    """
    Recursively generate the lines of the directory tree below ``current``.

    Directories are listed before files, each group sorted case-insensitively
    by name. Hidden entries (names starting with a dot) are skipped unless
    ``include_hidden`` is True, and entries whose path relative to ``base``
    is excluded by ``exclude_patterns`` are omitted.

    Args:
        current: Directory whose children are listed.
        base: Root of the scan; exclusion patterns are tested against paths
            relative to it.
        include_hidden: Whether dot-prefixed entries are listed.
        exclude_patterns: Patterns passed on to ``is_excluded``.
        prefix: Indentation carried down from the parent levels.

    Returns:
        A list of tree lines, without trailing newlines.
    """
    try:
        children = list(current.iterdir())
    except OSError:
        # An unreadable directory should not abort the whole dump; it is
        # simply shown without children.
        return []

    visible = []
    for child in children:
        if not include_hidden and child.name.startswith('.'):
            continue
        rel_child = str(child.relative_to(base)).replace("\\", "/")
        if is_excluded(rel_child, exclude_patterns):
            continue
        visible.append(child)
    visible.sort(key=lambda p: (not p.is_dir(), p.name.lower()))

    lines = []
    last_index = len(visible) - 1
    for index, child in enumerate(visible):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "
        if not child.is_dir():
            lines.append(f"{prefix}{connector}{child.name}")
            continue

        lines.append(f"{prefix}{connector}{child.name}/")
        # Do not descend into symlinked directories: they can form cycles.
        if child.is_symlink():
            continue
        # Continuation prefixes are as wide as the connectors (4 columns) so
        # that nested entries line up under their parent.
        child_prefix = prefix + ("    " if is_last else "│   ")
        lines.extend(generate_tree(child, base, include_hidden, exclude_patterns, child_prefix))
    return lines
def get_directory_tree(base: Path, include_hidden: bool, exclude_patterns: set) -> str:
    """
    Build the textual directory tree for the base folder.

    The first line names the base folder; the remaining lines come from
    ``generate_tree``. Lines are joined with newlines, with no trailing one.
    """
    header = f"/ {base.name}"
    body = generate_tree(base, base, include_hidden, exclude_patterns)
    return "\n".join([header, *body])
def scan_folder(base: Path, output_file: Path, include_hidden: bool, dump_folder: bool,
                dump_files: bool, exclude_patterns: set, skip_large: bool) -> None:
    """
    Build the dump for ``base`` and write it to ``output_file``.

    The dump optionally contains the directory structure and the contents of
    every readable UTF-8 text file, preceded by a summary line with an
    estimated token count (about one token per 4 characters).

    Args:
        base: Folder to scan.
        output_file: File the dump is written to (UTF-8).
        include_hidden: Include files/folders whose name starts with a dot.
        dump_folder: Emit the directory tree section.
        dump_files: Emit the file contents section.
        exclude_patterns: Patterns passed on to ``is_excluded``.
        skip_large: If True, files with more than 1500 lines are listed but
            their content is replaced by a short notice.

    Files with a known binary extension (images, fonts) and files that cannot
    be read or decoded as UTF-8 are skipped silently.
    """
    output_lines = []

    if dump_folder:
        output_lines.append("Directory Structure:")
        output_lines.append("-------------------")
        output_lines.append(get_directory_tree(base, include_hidden, exclude_patterns))
        output_lines.append("")

    if dump_files:
        output_lines.append("File Contents:")
        output_lines.append("--------------")
        skip_extensions = {".png", ".gif", ".jpg", ".jpeg", ".webp", ".svg",
                           ".ttf", ".otf", ".woff", ".woff2"}

        # Resolved once so a dump left over from a previous run inside the
        # scanned folder can be recognised and not dumped into itself.
        try:
            resolved_output = output_file.resolve()
        except (OSError, RuntimeError):
            resolved_output = None

        # Process files recursively, in case-insensitive path order.
        for file_path in sorted(base.rglob("*"), key=lambda p: str(p.relative_to(base)).lower()):
            relative = file_path.relative_to(base)
            # Skip anything hidden, or inside a hidden directory.
            if not include_hidden and any(part.startswith('.') for part in relative.parts):
                continue
            rel_path = str(relative).replace("\\", "/")
            if is_excluded(rel_path, exclude_patterns):
                continue
            if not file_path.is_file():
                continue
            # Skip known binary file types.
            if file_path.suffix.lower() in skip_extensions:
                continue
            # Skip the output file itself. The cheap name comparison avoids
            # resolving every file in the tree.
            if resolved_output is not None and file_path.name == output_file.name:
                try:
                    if file_path.resolve() == resolved_output:
                        continue
                except (OSError, RuntimeError):
                    pass

            try:
                content = file_path.read_text(encoding="utf-8")
            except (OSError, UnicodeError):
                # Unreadable or non-UTF-8 (likely binary): skip the file entirely.
                continue

            output_lines.append(f"File: {rel_path}")
            output_lines.append("-" * 50)
            if skip_large and len(content.splitlines()) > 1500:
                output_lines.append("Skipped file content because it exceeds 1500 lines.")
            else:
                output_lines.append(content)
            output_lines.append("")

    final_output = "\n".join(output_lines)
    # Rough estimate: about one token per 4 characters of dumped text.
    estimated_tokens = len(final_output) / 4
    summary_line = f"Summary: Estimated tokens: {format_number(estimated_tokens)}\n"
    output_file.write_text(summary_line + final_output, encoding="utf-8")
    print(f"Scan complete. Results written to {output_file}")
def main():
    """
    Parse the command line and run the folder scan.

    The first non-option argument is taken as the folder path; any further
    non-option arguments are ignored. Prints an error and exits with status 1
    when the folder is missing or invalid, when ``--exclude`` has no value,
    or when an unrecognised ``--option`` is given.
    """
    usage = "Usage: python script.py <folder_path> [--include-hidden] [--no-folder] [--no-files] [--exclude PATTERNS] [--skip-large]"
    args = sys.argv[1:]
    if not args:
        print(usage)
        sys.exit(1)

    include_hidden = False
    dump_folder = True
    dump_files = True
    skip_large = False
    exclude_patterns = set()
    folder_path = None

    i = 0
    while i < len(args):
        arg = args[i]
        if arg == "--include-hidden":
            include_hidden = True
        elif arg == "--no-folder":
            dump_folder = False
        elif arg == "--no-files":
            dump_files = False
        elif arg == "--skip-large":
            skip_large = True
        elif arg == "--exclude":
            if i + 1 >= len(args):
                print("Error: Missing argument for --exclude")
                sys.exit(1)
            # Comma-separated list; blank entries are dropped.
            exclude_patterns.update(
                pat.strip() for pat in args[i + 1].split(',') if pat.strip()
            )
            i += 1  # consume the value as well
        elif arg.startswith("--"):
            # A mistyped flag must not be silently ignored or be mistaken
            # for the folder path.
            print(f"Error: Unknown option '{arg}'")
            sys.exit(1)
        elif folder_path is None:
            folder_path = arg
        i += 1

    if folder_path is None:
        print(f"Error: No folder path provided.\n{usage}")
        sys.exit(1)

    base = Path(folder_path)
    if not base.is_dir():
        print(f"Error: '{folder_path}' is not a valid directory.")
        sys.exit(1)

    # Path(".").name is empty, so take the name from the resolved path; the
    # filesystem root still has no name, hence the fallback.
    folder_name = base.resolve().name or "root"
    output_file = Path(f"{folder_name}-dump.txt")
    scan_folder(base, output_file, include_hidden, dump_folder, dump_files, exclude_patterns, skip_large)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment