Skip to content

Instantly share code, notes, and snippets.

@ntfargo
Last active February 16, 2025 16:31
Show Gist options
  • Save ntfargo/9db84526d89cb7221cb87abcc7735e27 to your computer and use it in GitHub Desktop.
Save ntfargo/9db84526d89cb7221cb87abcc7735e27 to your computer and use it in GitHub Desktop.
A dumper script that recursively dumps a folder's structure and file contents into a single output file—with an estimated token count summary—perfect for preparing data for LLMs and other text-processing tasks.
#!/usr/bin/env python3
"""
A folder scanning tool that writes a directory tree and the contents
of each file (found recursively) into an output file. Only one command-line argument,
the folder path, is required.
The output file is written to the current working directory and is automatically named after the folder with the suffix "-dump.txt".
Options:
--include-hidden : Include hidden files/folders (those starting with a dot).
--no-folder : Do not dump the directory (folder) structure.
--no-files : Do not dump the file contents.
--exclude PATTERNS : Comma-separated list of patterns to exclude (e.g., "license.md,build").
--skip-large : Skip dumping file content if the file has more than 1500 lines.
Binary files (images, fonts, etc.) are automatically skipped.
The first line of the dump will contain a summary line with an estimated token count,
computed as approximately one token per 4 characters.
"""
import sys
from pathlib import Path
import fnmatch
def format_number(n: float) -> str:
    """
    Format a number compactly, using a K (thousands) or M (millions) suffix.

    Values of at least 1,000 are shown with one decimal place and a suffix;
    smaller values are truncated to an integer and shown without a suffix.

    Args:
        n: The number to format (for example an estimated token count).

    Returns:
        A short string such as "999", "1.5K" or "2.3M".
    """
    if n < 1000:
        return str(int(n))
    # Values just below one million round up to "1000.0" thousands at one
    # decimal place. Promote those to the millions branch so the result reads
    # "1.0M" instead of the misleading "1000.0K".
    if round(n / 1000, 1) < 1000:
        return f"{n / 1000:.1f}K"
    return f"{n / 1_000_000:.1f}M"
def is_excluded(path: str, exclusion_patterns: set) -> bool:
    """
    Check if a given path should be excluded based on a set of patterns.

    A path is excluded when any pattern
      * matches the whole path using shell-style (fnmatch) rules, or
      * equals one of the path's components exactly, or
      * matches one of the path's components using shell-style rules.

    The per-component glob check makes wildcard patterns behave like literal
    names: "build" already excluded "src/build/x", and now "test_*" likewise
    excludes "src/test_a.py" rather than only top-level matches.

    Args:
        path: Path to test, relative to the scanned folder.
        exclusion_patterns: Literal names and/or fnmatch-style patterns.

    Returns:
        True if the path matches at least one pattern, otherwise False.
    """
    if not exclusion_patterns:
        return False
    components = Path(path).parts
    for pattern in exclusion_patterns:
        if fnmatch.fnmatch(path, pattern):
            return True
        for component in components:
            # The equality test keeps literal names working even when they
            # contain glob metacharacters such as "[" or "?".
            if component == pattern or fnmatch.fnmatch(component, pattern):
                return True
    return False
def generate_tree(current: Path, base: Path, include_hidden: bool, exclude_patterns: set, prefix: str = "") -> list:
    """
    Recursively generate the lines of the directory tree below `current`.

    Entries are listed directories first, then files, each group ordered by
    lower-cased name. Hidden entries (names starting with a dot) are skipped
    unless `include_hidden` is True, and entries whose path relative to `base`
    matches an exclusion pattern are omitted.

    Args:
        current: Directory whose children are listed.
        base: Root of the scan; exclusion patterns are tested against paths
            relative to it.
        include_hidden: Whether to list dot-prefixed entries.
        exclude_patterns: Patterns passed to `is_excluded`.
        prefix: Indentation and guide characters inherited from parent levels.

    Returns:
        A list of tree lines, without trailing newlines. A directory that
        cannot be listed contributes no lines.
    """
    try:
        children = list(current.iterdir())
    except OSError:
        # Unreadable or vanished directory: show it as empty rather than
        # aborting the whole dump.
        return []

    visible = []
    for child in children:
        if not include_hidden and child.name.startswith('.'):
            continue
        rel_child = str(child.relative_to(base)).replace("\\", "/")
        # Nothing can match an empty pattern set, so skip the check entirely.
        if exclude_patterns and is_excluded(rel_child, exclude_patterns):
            continue
        # Evaluate is_dir() once per entry; it is needed for sorting and rendering.
        visible.append((child.is_dir(), child))
    visible.sort(key=lambda item: (not item[0], item[1].name.lower()))

    lines = []
    last_index = len(visible) - 1
    for index, (is_directory, child) in enumerate(visible):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "
        if not is_directory:
            lines.append(f"{prefix}{connector}{child.name}")
            continue
        lines.append(f"{prefix}{connector}{child.name}/")
        if child.is_symlink():
            # List a symlinked directory but do not descend into it: a link
            # pointing back at an ancestor would otherwise recurse without bound.
            continue
        # Continuation prefixes are four characters wide so nested entries
        # line up under the four-character connectors above.
        child_prefix = prefix + ("    " if is_last else "│   ")
        lines.extend(generate_tree(child, base, include_hidden, exclude_patterns, child_prefix))
    return lines
def get_directory_tree(base: Path, include_hidden: bool, exclude_patterns: set) -> str:
    """
    Return a string representation of the directory tree for the base folder.

    The first line is "/ <folder name>"; the remaining lines come from
    `generate_tree`.

    Args:
        base: Folder to render.
        include_hidden: Whether dot-prefixed entries are listed.
        exclude_patterns: Patterns for entries to omit.

    Returns:
        The tree as a single newline-joined string.
    """
    root_name = base.name
    if root_name in ("", ".."):
        # Path(".").name is "" and Path("..").name is "..", so resolve to get
        # the real folder name; fall back to the path text for the filesystem
        # root, whose resolved name is also empty.
        root_name = base.resolve().name or str(base)
    tree_lines = [f"/ {root_name}"]
    tree_lines.extend(generate_tree(base, base, include_hidden, exclude_patterns))
    return "\n".join(tree_lines)
def _resolve_quietly(path: Path):
    """Return the resolved form of `path`, or None if it cannot be resolved."""
    try:
        return path.resolve()
    except (OSError, RuntimeError):
        return None


def scan_folder(base: Path, output_file: Path, include_hidden: bool, dump_folder: bool,
                dump_files: bool, exclude_patterns: set, skip_large: bool,
                max_lines: int = 1500) -> None:
    """
    Build the dump for `base` and write it to `output_file`.

    The dump optionally contains the directory structure and the contents of
    every readable UTF-8 text file found recursively. A summary line with an
    estimated token count (one token per four characters of the dump body) is
    placed first.

    Args:
        base: Folder to scan.
        output_file: Destination file for the dump.
        include_hidden: Include entries whose relative path has a dot-prefixed
            component.
        dump_folder: Emit the "Directory Structure" section.
        dump_files: Emit the "File Contents" section.
        exclude_patterns: Patterns passed to `is_excluded`.
        skip_large: When True, files with more than `max_lines` lines get a
            placeholder instead of their content.
        max_lines: Line-count threshold used by `skip_large` (default 1500).

    Files with a known binary extension, and files that cannot be read as
    UTF-8, are skipped silently. The output file itself is never dumped.
    """
    output_lines = []

    if dump_folder:
        output_lines.append("Directory Structure:")
        output_lines.append("-------------------")
        output_lines.append(get_directory_tree(base, include_hidden, exclude_patterns))
        output_lines.append("")

    if dump_files:
        output_lines.append("File Contents:")
        output_lines.append("--------------")
        skip_extensions = {".png", ".gif", ".jpg", ".jpeg", ".webp", ".svg",
                           ".ttf", ".otf", ".woff", ".woff2"}
        # A dump left by a previous run may live inside the scanned folder;
        # feeding it back in would duplicate the whole dump on every run.
        own_output = _resolve_quietly(output_file)

        # Process files recursively, in case-insensitive relative-path order.
        for file_path in sorted(base.rglob("*"), key=lambda p: str(p.relative_to(base)).lower()):
            relative = file_path.relative_to(base)
            # Skip anything that is hidden or lives under a hidden directory.
            if not include_hidden and any(part.startswith('.') for part in relative.parts):
                continue
            rel_path = str(relative).replace("\\", "/")
            if is_excluded(rel_path, exclude_patterns):
                continue
            if not file_path.is_file():
                continue
            # Skip known binary file types.
            if file_path.suffix.lower() in skip_extensions:
                continue
            # Compare names first so the costlier resolve() only runs for
            # files that could actually be the output file.
            if (own_output is not None and file_path.name == output_file.name
                    and _resolve_quietly(file_path) == own_output):
                continue
            try:
                content = file_path.read_text(encoding="utf-8")
            except (UnicodeDecodeError, OSError):
                # Binary/non-UTF-8 or unreadable file: skip it entirely.
                continue
            output_lines.append(f"File: {rel_path}")
            output_lines.append("-" * 50)
            if skip_large and len(content.splitlines()) > max_lines:
                output_lines.append(f"Skipped file content because it exceeds {max_lines} lines.")
            else:
                output_lines.append(content)
            output_lines.append("")

    final_output = "\n".join(output_lines)
    # Rough estimate: about one token per four characters of the dump body.
    estimated_tokens = len(final_output) / 4
    summary_line = f"Summary: Estimated tokens: {format_number(estimated_tokens)}\n"
    output_file.write_text(summary_line + final_output, encoding="utf-8")
    print(f"Scan complete. Results written to {output_file}")
def main():
    """
    Parse the command line and run the folder scan.

    Exactly one positional argument (the folder path) is accepted, together
    with the optional switches shown in the usage text. The output file is
    created in the current working directory and named
    "<folder name>-dump.txt".

    Exits with status 1 and prints a message when arguments are missing,
    unknown or surplus, or when the folder path is not a directory.
    """
    usage = ("Usage: python script.py <folder_path> [--include-hidden] [--no-folder] "
             "[--no-files] [--exclude PATTERNS] [--skip-large]")
    args = sys.argv[1:]
    if not args:
        print(usage)
        sys.exit(1)

    # Boolean switches, mapped to the setting they change and its new value.
    switches = {
        "--include-hidden": ("include_hidden", True),
        "--no-folder": ("dump_folder", False),
        "--no-files": ("dump_files", False),
        "--skip-large": ("skip_large", True),
    }
    settings = {
        "include_hidden": False,
        "dump_folder": True,
        "dump_files": True,
        "skip_large": False,
    }
    exclude_patterns = set()
    folder_path = None

    remaining = iter(args)
    for arg in remaining:
        if arg in switches:
            name, value = switches[arg]
            settings[name] = value
        elif arg == "--exclude":
            # --exclude consumes the next argument as its comma-separated value.
            raw_patterns = next(remaining, None)
            if raw_patterns is None:
                print("Error: Missing argument for --exclude")
                sys.exit(1)
            exclude_patterns.update(
                pattern.strip() for pattern in raw_patterns.split(',') if pattern.strip()
            )
        elif arg.startswith("--"):
            # Reject misspelled options instead of treating them as the folder
            # path or silently ignoring them.
            print(f"Error: Unknown option '{arg}'\n{usage}")
            sys.exit(1)
        elif folder_path is None:
            folder_path = arg
        else:
            print(f"Error: Unexpected extra argument '{arg}'\n{usage}")
            sys.exit(1)

    if folder_path is None:
        print(f"Error: No folder path provided.\n{usage}")
        sys.exit(1)

    base = Path(folder_path)
    if not base.is_dir():
        print(f"Error: '{folder_path}' is not a valid directory.")
        sys.exit(1)

    folder_name = base.name
    if folder_name in ("", ".."):
        # Path(".").name is "" (and Path("..").name is ".."), which would
        # produce an output file called "-dump.txt"; use the real folder name.
        # The filesystem root resolves to an empty name, hence the fallback.
        folder_name = base.resolve().name or "root"
    output_file = Path(f"{folder_name}-dump.txt")

    scan_folder(base, output_file, settings["include_hidden"], settings["dump_folder"],
                settings["dump_files"], exclude_patterns, settings["skip_large"])
# Entry point: run the CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment