# filesystem-auditor / scan_techstack.py
# algorembrant's picture
# Upload 6 files
# 2db20ba verified
"""
Repository Tech-Stack Scanner
===============================
Scans a repository and produces a Markdown table of file types with counts
and total sizes in KB. Extensions without a built-in label are labelled
automatically from the extension itself.
Usage:
    python scan_techstack.py [path] [--output FILE] [--ignore PATTERN ...]
                             [--no-default-ignore] [--exclude-ext EXT ...]
                             [--sort count|size|ext]
Output:
A Markdown file containing the tech-stack audit table.
"""
import argparse
import fnmatch
import os
import sys
from collections import defaultdict
from pathlib import Path
# ---------------------------------------------------------------------------
# Comprehensive file-extension → human-readable label mapping
# Covers programming, data, document, media, archive, config, and more.
# Any extension NOT in this dict will be auto-labelled from the extension.
#
# Keys must be unique: in a dict literal a repeated key silently replaces the
# earlier value. Extensions shared by several formats therefore get exactly
# ONE entry, with a combined label where both meanings are plausible in a
# repository (e.g. ".m", ".v", ".vhd", ".fig", ".obj").
# ---------------------------------------------------------------------------
EXT_LABELS: dict[str, str] = {
    # ── Programming Languages ──────────────────────────────────────────
    ".py": "Python",
    ".pyw": "Python (windowed)",
    ".pyx": "Cython",
    ".pxd": "Cython Definition",
    ".pyi": "Python Stub",
    ".pyc": "Python Compiled",
    ".pyo": "Python Optimized",
    ".js": "JavaScript",
    ".mjs": "ES Module JavaScript",
    ".cjs": "CommonJS JavaScript",
    ".jsx": "JSX (React)",
    # ".ts" is also the MPEG transport-stream suffix; in a source repository
    # TypeScript is by far the likelier meaning, so it is the only entry.
    ".ts": "TypeScript",
    ".tsx": "TSX (React)",
    ".java": "Java",
    ".class": "Java Class",
    ".jar": "Java Archive",
    ".kt": "Kotlin",
    ".kts": "Kotlin Script",
    ".scala": "Scala",
    ".groovy": "Groovy",
    ".c": "C",
    ".h": "C Header",
    ".cpp": "C++",
    ".cxx": "C++",
    ".cc": "C++",
    ".c++": "C++",
    ".hpp": "C++ Header",
    ".hxx": "C++ Header",
    ".hh": "C++ Header",
    ".h++": "C++ Header",
    ".cs": "C#",
    ".csx": "C# Script",
    ".fs": "F#",
    ".fsx": "F# Script",
    ".fsi": "F# Signature",
    ".vb": "Visual Basic",
    ".vbs": "VBScript",
    ".go": "Go",
    ".rs": "Rust",
    ".rb": "Ruby",
    ".erb": "Embedded Ruby",
    ".php": "PHP",
    ".pl": "Perl",
    ".pm": "Perl Module",
    ".lua": "Lua",
    ".r": "R",
    ".rmd": "R Markdown",
    ".swift": "Swift",
    ".m": "Objective-C / MATLAB",
    ".mm": "Objective-C++",
    ".d": "D",
    ".dart": "Dart",
    ".ex": "Elixir",
    ".exs": "Elixir Script",
    ".erl": "Erlang",
    ".hrl": "Erlang Header",
    ".hs": "Haskell",
    ".lhs": "Literate Haskell",
    ".ml": "OCaml",
    ".mli": "OCaml Interface",
    ".clj": "Clojure",
    ".cljs": "ClojureScript",
    ".cljc": "Clojure Common",
    ".edn": "EDN (Clojure)",
    ".lisp": "Common Lisp",
    ".cl": "Common Lisp",
    ".el": "Emacs Lisp",
    ".scm": "Scheme",
    ".rkt": "Racket",
    ".nim": "Nim",
    ".zig": "Zig",
    ".v": "V / Verilog",
    # Shared by VHDL sources and Hyper-V/VirtualPC disk images.
    ".vhd": "VHDL / Virtual Hard Disk",
    ".vhdl": "VHDL",
    ".sv": "SystemVerilog",
    ".asm": "Assembly",
    ".s": "Assembly",
    ".f": "Fortran",
    ".f90": "Fortran 90",
    ".f95": "Fortran 95",
    ".f03": "Fortran 2003",
    ".f08": "Fortran 2008",
    ".for": "Fortran",
    ".cob": "COBOL",
    ".cbl": "COBOL",
    ".pas": "Pascal",
    ".pp": "Pascal",
    ".ada": "Ada",
    ".adb": "Ada Body",
    ".ads": "Ada Spec",
    ".tcl": "Tcl",
    ".tk": "Tcl/Tk",
    ".awk": "AWK",
    ".sed": "sed",
    ".jl": "Julia",
    ".cr": "Crystal",
    ".hack": "Hack",
    ".hx": "Haxe",
    ".hxml": "Haxe XML",
    ".purs": "PureScript",
    ".idr": "Idris",
    ".agda": "Agda",
    ".lean": "Lean",
    ".coq": "Coq",
    ".thy": "Isabelle",
    ".pro": "Prolog",
    ".sml": "Standard ML",
    # Shared by MATLAB figures and Figma design files.
    ".fig": "MATLAB Figure / Figma",
    ".mat": "MATLAB Data",
    # ── Shell / Scripting ──────────────────────────────────────────────
    ".sh": "Shell Script",
    ".bash": "Bash Script",
    ".zsh": "Zsh Script",
    ".fish": "Fish Script",
    ".bat": "Batch File",
    ".cmd": "Windows Command",
    ".ps1": "PowerShell",
    ".psm1": "PowerShell Module",
    ".psd1": "PowerShell Data",
    # ── Web / Markup / Templating ──────────────────────────────────────
    ".html": "HTML",
    ".htm": "HTML",
    ".xhtml": "XHTML",
    ".css": "CSS",
    ".scss": "SCSS",
    ".sass": "Sass",
    ".less": "Less",
    ".styl": "Stylus",
    ".vue": "Vue",
    ".svelte": "Svelte",
    ".astro": "Astro",
    ".ejs": "EJS Template",
    ".pug": "Pug Template",
    ".jade": "Jade Template",
    ".hbs": "Handlebars",
    ".mustache": "Mustache",
    ".twig": "Twig Template",
    ".jinja": "Jinja Template",
    ".jinja2": "Jinja2 Template",
    ".j2": "Jinja2 Template",
    ".njk": "Nunjucks Template",
    ".liquid": "Liquid Template",
    ".haml": "HAML",
    ".slim": "Slim Template",
    # NOTE: compound suffix; os.path.splitext() only ever yields the last
    # component (".php"), so a lookup keyed on its result cannot reach this.
    ".blade.php": "Blade Template",
    ".jsp": "JavaServer Pages",
    ".asp": "ASP",
    ".aspx": "ASP.NET",
    ".cshtml": "Razor (C#)",
    ".razor": "Razor",
    ".wasm": "WebAssembly",
    # ── Data / Config / Serialisation ──────────────────────────────────
    ".json": "JSON",
    ".jsonl": "JSON Lines",
    ".json5": "JSON5",
    ".geojson": "GeoJSON",
    ".xml": "XML",
    ".xsl": "XSLT",
    ".xslt": "XSLT",
    ".xsd": "XML Schema",
    ".dtd": "Document Type Definition",
    ".yaml": "YAML",
    ".yml": "YAML",
    ".toml": "TOML",
    ".ini": "INI Config",
    ".cfg": "Config",
    ".conf": "Config",
    ".env": "Environment File",
    ".properties": "Properties File",
    ".csv": "CSV",
    ".tsv": "TSV",
    ".parquet": "Apache Parquet",
    ".avro": "Apache Avro",
    ".orc": "Apache ORC",
    ".feather": "Feather (Arrow)",
    ".arrow": "Apache Arrow",
    ".msgpack": "MessagePack",
    ".bson": "BSON",
    ".cbor": "CBOR",
    ".protobuf": "Protocol Buffer",
    ".proto": "Protocol Buffer Schema",
    ".thrift": "Apache Thrift",
    ".hcl": "HCL (Terraform)",
    ".tf": "Terraform",
    ".tfvars": "Terraform Variables",
    ".pkl": "Pickle",
    ".pickle": "Pickle",
    ".npy": "NumPy Array",
    ".npz": "NumPy Archive",
    ".h5": "HDF5",
    ".hdf5": "HDF5",
    ".nc": "NetCDF",
    ".sqlite": "SQLite Database",
    ".sqlite3": "SQLite Database",
    ".db": "Database",
    ".dbf": "dBASE",
    ".mdb": "Access Database",
    ".accdb": "Access Database",
    ".lmdb": "LMDB",
    ".leveldb": "LevelDB",
    # ── Documents & Office ─────────────────────────────────────────────
    ".md": "Markdown",
    ".mdx": "MDX",
    ".txt": "Plain Text",
    ".text": "Plain Text",
    ".log": "Log File",
    ".rst": "reStructuredText",
    ".adoc": "AsciiDoc",
    ".org": "Org Mode",
    ".tex": "LaTeX",
    ".latex": "LaTeX",
    ".bib": "BibTeX",
    ".rtf": "Rich Text Format",
    ".doc": "Microsoft Word",
    ".docx": "Microsoft Word (OOXML)",
    ".odt": "OpenDocument Text",
    ".pdf": "PDF",
    ".xls": "Microsoft Excel",
    ".xlsx": "Microsoft Excel (OOXML)",
    ".xlsm": "Excel Macro-Enabled",
    ".xlsb": "Excel Binary",
    ".ods": "OpenDocument Spreadsheet",
    ".ppt": "Microsoft PowerPoint",
    ".pptx": "Microsoft PowerPoint (OOXML)",
    ".odp": "OpenDocument Presentation",
    ".pages": "Apple Pages",
    ".numbers": "Apple Numbers",
    ".key": "Apple Keynote",
    ".epub": "EPUB",
    ".mobi": "Mobipocket",
    ".djvu": "DjVu",
    ".chm": "Compiled HTML Help",
    ".man": "Man Page",
    ".info": "GNU Info",
    # ── Notebooks & Data Science ───────────────────────────────────────
    # (".rmd" is listed once, under Programming Languages.)
    ".ipynb": "Jupyter Notebook",
    ".qmd": "Quarto Document",
    ".dvc": "DVC File",
    ".onnx": "ONNX Model",
    ".pt": "PyTorch Model",
    ".pth": "PyTorch Checkpoint",
    ".safetensors": "SafeTensors Model",
    ".gguf": "GGUF Model",
    ".bin": "Binary File",
    # ── Images ─────────────────────────────────────────────────────────
    # (".fig" is listed once, under Programming Languages.)
    ".png": "PNG Image",
    ".jpg": "JPEG Image",
    ".jpeg": "JPEG Image",
    ".gif": "GIF Image",
    ".bmp": "Bitmap Image",
    ".tiff": "TIFF Image",
    ".tif": "TIFF Image",
    ".webp": "WebP Image",
    ".avif": "AVIF Image",
    ".heic": "HEIC Image",
    ".heif": "HEIF Image",
    ".svg": "SVG Image",
    ".ico": "Icon",
    ".icns": "Apple Icon",
    ".cur": "Cursor",
    ".psd": "Photoshop",
    ".ai": "Adobe Illustrator",
    ".sketch": "Sketch",
    ".xd": "Adobe XD",
    ".eps": "EPS Vector",
    ".raw": "RAW Image",
    ".cr2": "Canon RAW",
    ".nef": "Nikon RAW",
    ".arw": "Sony RAW",
    ".dng": "DNG RAW",
    ".exr": "OpenEXR",
    ".hdr": "HDR Image",
    ".pbm": "PBM Image",
    ".pgm": "PGM Image",
    ".ppm": "PPM Image",
    ".pcx": "PCX Image",
    ".tga": "TGA Image",
    # ── Audio ──────────────────────────────────────────────────────────
    ".mp3": "MP3 Audio",
    ".wav": "WAV Audio",
    ".flac": "FLAC Audio",
    ".aac": "AAC Audio",
    ".ogg": "Ogg Audio",
    ".wma": "WMA Audio",
    ".m4a": "M4A Audio",
    ".opus": "Opus Audio",
    ".aiff": "AIFF Audio",
    ".mid": "MIDI",
    ".midi": "MIDI",
    ".amr": "AMR Audio",
    ".ape": "APE Audio",
    ".alac": "ALAC Audio",
    # ── Video ──────────────────────────────────────────────────────────
    # (".ts" is listed once, as TypeScript, under Programming Languages.)
    ".mp4": "MP4 Video",
    ".mkv": "MKV Video",
    ".avi": "AVI Video",
    ".mov": "QuickTime Video",
    ".wmv": "WMV Video",
    ".flv": "Flash Video",
    ".webm": "WebM Video",
    ".m4v": "M4V Video",
    ".mpg": "MPEG Video",
    ".mpeg": "MPEG Video",
    ".3gp": "3GP Video",
    ".ogv": "Ogg Video",
    ".vob": "DVD Video",
    ".rm": "RealMedia",
    # ── Fonts ──────────────────────────────────────────────────────────
    ".ttf": "TrueType Font",
    ".otf": "OpenType Font",
    ".woff": "WOFF Font",
    ".woff2": "WOFF2 Font",
    ".eot": "EOT Font",
    ".fon": "Bitmap Font",
    # ── Archives / Compression ─────────────────────────────────────────
    # (".vhd" is listed once, under Programming Languages.)
    ".zip": "ZIP Archive",
    ".gz": "Gzip Archive",
    ".gzip": "Gzip Archive",
    ".bz2": "Bzip2 Archive",
    ".xz": "XZ Archive",
    ".lz": "Lzip Archive",
    ".lzma": "LZMA Archive",
    ".zst": "Zstandard Archive",
    ".tar": "Tar Archive",
    ".tgz": "Tar+Gzip Archive",
    ".tbz2": "Tar+Bzip2 Archive",
    ".txz": "Tar+XZ Archive",
    ".rar": "RAR Archive",
    ".7z": "7-Zip Archive",
    ".cab": "Cabinet Archive",
    ".iso": "ISO Disk Image",
    ".dmg": "macOS Disk Image",
    ".img": "Disk Image",
    ".vhdx": "Virtual Hard Disk (VHDX)",
    ".vmdk": "VMware Disk",
    ".qcow2": "QEMU Disk",
    ".deb": "Debian Package",
    ".rpm": "RPM Package",
    ".apk": "Android Package",
    ".ipa": "iOS App",
    ".snap": "Snap Package",
    ".flatpak": "Flatpak Package",
    ".appimage": "AppImage",
    ".msi": "MSI Installer",
    ".exe": "Windows Executable",
    ".dll": "DLL Library",
    ".so": "Shared Object",
    ".dylib": "macOS Dynamic Library",
    ".a": "Static Library",
    ".lib": "Library",
    ".o": "Object File",
    # Shared by compiler object files and Wavefront 3D models.
    ".obj": "Object File / Wavefront OBJ",
    ".sys": "System File",
    ".drv": "Driver",
    # ── DevOps / CI / Containerisation ─────────────────────────────────
    ".dockerfile": "Dockerfile",
    ".vagrantfile": "Vagrantfile",
    ".jenkinsfile": "Jenkinsfile",
    # ── Trading / Domain-specific ──────────────────────────────────────
    ".mq5": "MQL5 Source",
    ".mq4": "MQL4 Source",
    ".mqh": "MQL Header",
    ".ex5": "MQL5 Compiled",
    ".ex4": "MQL4 Compiled",
    ".set": "MT Parameter Set",
    ".pine": "Pine Script",
    ".tpl": "MT Template",
    ".chr": "MT Chart",
    ".mql": "MQL Source",
    # ── Misc / Build / Project ─────────────────────────────────────────
    ".makefile": "Makefile",
    ".cmake": "CMake",
    ".gradle": "Gradle",
    ".sbt": "SBT Build",
    ".gemspec": "Ruby Gem Spec",
    ".podspec": "CocoaPods Spec",
    ".cabal": "Cabal (Haskell)",
    ".csproj": "C# Project",
    ".fsproj": "F# Project",
    ".vbproj": "VB.NET Project",
    ".sln": "Visual Studio Solution",
    ".xcodeproj": "Xcode Project",
    ".pbxproj": "Xcode Project (PBX)",
    ".xcworkspace": "Xcode Workspace",
    ".plist": "Property List",
    ".lnk": "Windows Shortcut",
    ".url": "URL Shortcut",
    ".desktop": "Desktop Entry",
    ".reg": "Windows Registry",
    ".service": "Systemd Service",
    ".timer": "Systemd Timer",
    ".socket": "Systemd Socket",
    ".lock": "Lock File",
    ".patch": "Patch File",
    ".diff": "Diff File",
    ".map": "Source Map",
    ".whl": "Python Wheel",
    ".egg": "Python Egg",
    ".gem": "Ruby Gem",
    ".nupkg": "NuGet Package",
    ".crate": "Rust Crate",
    # ── SQL / Database ─────────────────────────────────────────────────
    ".sql": "SQL",
    ".mysql": "MySQL",
    ".pgsql": "PostgreSQL",
    ".plsql": "PL/SQL",
    # ── GraphQL / API ──────────────────────────────────────────────────
    ".graphql": "GraphQL",
    ".gql": "GraphQL",
    ".swagger": "Swagger",
    ".openapi": "OpenAPI Spec",
    # ── 3D / CAD / GIS ─────────────────────────────────────────────────
    # (".obj" is listed once, under Archives / Compression.)
    ".stl": "STL 3D Model",
    ".fbx": "FBX 3D Model",
    ".gltf": "glTF 3D Model",
    ".glb": "glTF Binary",
    ".blend": "Blender File",
    ".3ds": "3DS Model",
    ".dae": "Collada",
    ".usd": "USD Scene",
    ".usda": "USD ASCII",
    ".usdc": "USD Binary",
    ".step": "STEP CAD",
    ".stp": "STEP CAD",
    ".iges": "IGES CAD",
    ".igs": "IGES CAD",
    ".dwg": "AutoCAD Drawing",
    ".dxf": "AutoCAD Exchange",
    ".shp": "Shapefile",
    ".kml": "KML",
    ".kmz": "KMZ",
    ".gpx": "GPX Track",
    # ── Certificates / Security ────────────────────────────────────────
    ".pem": "PEM Certificate",
    ".crt": "Certificate",
    ".cer": "Certificate",
    ".csr": "Certificate Request",
    ".p12": "PKCS#12",
    ".pfx": "PKCS#12",
    ".asc": "PGP/GPG",
    ".gpg": "GPG Encrypted",
    ".sig": "Signature",
    ".pub": "Public Key",
    # ── Git-related (keys are whole dotfile names) ─────────────────────
    ".gitignore": "Git Ignore",
    ".gitattributes": "Git Attributes",
    ".gitmodules": "Git Submodules",
    # ── Editor / IDE Config (keys are whole dotfile names) ─────────────
    ".editorconfig": "EditorConfig",
    ".prettierrc": "Prettier Config",
    ".eslintrc": "ESLint Config",
    ".stylelintrc": "Stylelint Config",
    ".babelrc": "Babel Config",
    ".swcrc": "SWC Config",
    ".nvmrc": "NVM Config",
    ".npmrc": "NPM Config",
    ".yarnrc": "Yarn Config",
    # ── Misc ───────────────────────────────────────────────────────────
    ".mw5": "MQL5 Variant",
    ".webmanifest": "Web App Manifest",
}
# ---------------------------------------------------------------------------
# Names skipped by default while walking the tree. Entries are compared
# against a single path component (a directory name or a file name); an
# entry beginning with "*" is treated as a suffix pattern.
# ---------------------------------------------------------------------------
DEFAULT_IGNORE = {
    # Version control
    ".git",
    # Python virtual environments
    "env",
    "venv",
    ".venv",
    # Python caches and build/test artefacts
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    ".eggs",
    "*.egg-info",
    # Notebook checkpoints
    ".ipynb_checkpoints",
    # JavaScript dependencies
    "node_modules",
    # OS-generated metadata files
    ".DS_Store",
    "Thumbs.db",
    "desktop.ini",
}
def should_ignore(name: str, ignore_set: set) -> bool:
    """Return True if *name* matches any entry in the ignore set.

    *name* is a single path component (a directory or file name, not a
    path). An entry matches when it is equal to *name*, or when it is a
    shell-style glob (``*``, ``?``, ``[seq]``) that matches *name*.
    Matching is case-sensitive on every platform.

    Args:
        name: Directory or file name to test.
        ignore_set: Literal names and/or glob patterns.

    Returns:
        True when *name* should be skipped, False otherwise.
    """
    # Exact membership first: it is O(1) and also lets a name that itself
    # contains glob metacharacters (e.g. "[draft]") be ignored literally.
    if name in ignore_set:
        return True
    # fnmatchcase (not fnmatch) so results do not depend on the host OS's
    # case-folding rules. "*suffix" entries behave exactly as before, while
    # patterns such as "build*" or "*.min.*" now work too.
    return any(fnmatch.fnmatchcase(name, pattern) for pattern in ignore_set)
def get_label(ext: str) -> str:
    """Map an extension such as ``".py"`` to its display label.

    The label comes from ``EXT_LABELS`` when the extension is listed there.
    Otherwise the extension itself is used: upper-cased, with leading dots
    removed (``".xyz"`` becomes ``"XYZ"``).
    """
    try:
        return EXT_LABELS[ext]
    except KeyError:
        # Not a known extension: derive a label from the extension text.
        return ext.upper().lstrip(".")
def scan_files(root_path: str, ignore_set: set, exclude_exts: set) -> dict:
    """
    Walk *root_path* and collect per-extension statistics.

    Args:
        root_path: Directory to walk recursively.
        ignore_set: Directory/file names (or patterns) passed to
            ``should_ignore``; matching directories are not descended into
            and matching files are not counted.
        exclude_exts: Lower-case extensions WITHOUT the leading dot; files
            with one of these extensions are not counted.

    Returns:
        A plain dict keyed by normalised (lower-case) extension string,
        e.g. ".py", with values
        ``{"count": int, "size_bytes": int, "label": str}``.
        Files without an extension are grouped under the key "(none)" with
        the label "(no extension)". Dotfiles such as ".gitignore" are keyed
        by their whole name.
    """
    stats: dict = defaultdict(lambda: {"count": 0, "size_bytes": 0, "label": ""})
    for dirpath, dirnames, filenames in os.walk(root_path):
        # Prune ignored directories in-place so os.walk skips them
        dirnames[:] = [d for d in dirnames if not should_ignore(d, ignore_set)]
        for fname in filenames:
            if should_ignore(fname, ignore_set):
                continue

            _, ext = os.path.splitext(fname)
            if ext == ".":
                # "name." has no usable extension; without this it would be
                # reported with an empty label.
                ext = ""
            if not ext and fname.startswith(".") and len(fname) > 1:
                # os.path.splitext() treats the leading dot of a dotfile as
                # part of the root, so ".gitignore" / ".env" come back with
                # an empty extension. Use the whole name instead so these
                # files get their own row (and their EXT_LABELS entry).
                ext = fname
            ext = ext.lower()  # normalise

            if ext and ext.lstrip(".") in exclude_exts:
                continue

            if ext:
                label = f"{get_label(ext)} ({ext})"
            else:
                ext = "(none)"
                label = "(no extension)"

            try:
                size = os.path.getsize(os.path.join(dirpath, fname))
            except OSError:
                # Best effort: unreadable or vanished files (broken symlink,
                # permission denied) are still counted, with size 0.
                # PermissionError is a subclass of OSError.
                size = 0

            entry = stats[ext]
            entry["count"] += 1
            entry["size_bytes"] += size
            entry["label"] = label
    return dict(stats)
def format_size_kb(size_bytes: int) -> str:
    """Format a byte count as kilobytes (1 KB = 1024 bytes).

    Returns the literal string "0" for a zero size; any other value is
    rendered with a thousands separator and exactly one decimal place
    (e.g. "1.0", "1,234.5"). Trailing ".0" is kept.
    """
    kilobytes = size_bytes / 1024
    return "0" if kilobytes == 0 else f"{kilobytes:,.1f}"
def build_table(stats: dict, sort_by: str) -> list[str]:
    """
    Build a Markdown table from *stats*.

    Args:
        stats: Mapping of extension key to a dict with the keys "label",
            "count" and "size_bytes" (the shape ``scan_files`` returns).
        sort_by: "count" or "size" sort descending by that column with the
            label as tie-breaker; any other value sorts by label only.

    Returns:
        A list of lines: header, alignment row, one row per file type and
        a final bold totals row.
    """
    rows = [
        {
            "label": data["label"],
            "count": data["count"],
            "size_bytes": data["size_bytes"],
            "ext_key": ext,
        }
        for ext, data in stats.items()
    ]
    total_count = sum(row["count"] for row in rows)
    total_bytes = sum(row["size_bytes"] for row in rows)

    # Sort
    if sort_by == "count":
        rows.sort(key=lambda r: (-r["count"], r["label"].lower()))
    elif sort_by == "size":
        rows.sort(key=lambda r: (-r["size_bytes"], r["label"].lower()))
    else:  # "ext"
        rows.sort(key=lambda r: r["label"].lower())

    # Build Markdown lines
    lines = [
        "| File Type | Count | Size (KB) |",
        "| :--- | :--- | :--- |",
    ]
    for row in rows:
        # Labels are derived from file names, which may legally contain "|".
        # An unescaped pipe would be read as a cell delimiter and corrupt
        # the row, so escape it for Markdown.
        label = row["label"].replace("|", "\\|")
        lines.append(
            f"| {label} | {row['count']} | {format_size_kb(row['size_bytes'])} |"
        )
    # Totals row
    lines.append(
        f"| **Total** | **{total_count}** | **{format_size_kb(total_bytes)}** |"
    )
    return lines
def main() -> None:
    """Command-line entry point.

    Parses the arguments, scans the requested directory, writes the
    Markdown report (default: ``TECHSTACK.md`` inside the scanned
    directory) and prints a short summary to stdout. Exits with an
    argparse usage error when the given path is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Audit repository tech-stack (file types, counts, sizes)."
    )
    parser.add_argument(
        "path",
        nargs="?",
        default=".",
        help="Root directory to scan (default: current directory).",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output Markdown file path (default: TECHSTACK.md in scanned dir).",
    )
    parser.add_argument(
        "--ignore",
        nargs="*",
        default=None,
        help="Extra directory/file names to ignore.",
    )
    parser.add_argument(
        "--no-default-ignore",
        action="store_true",
        help="Disable the built-in ignore list.",
    )
    parser.add_argument(
        "--exclude-ext",
        nargs="*",
        default=None,
        help="File extensions to exclude from output (without dot, e.g. png mp4 git).",
    )
    parser.add_argument(
        "--sort",
        choices=["count", "size", "ext"],
        default="count",
        help="Sort order for the table (default: count descending).",
    )
    args = parser.parse_args()

    root = os.path.abspath(args.path)
    # os.walk() silently yields nothing for a missing or non-directory path,
    # which would produce an empty report (or a traceback when writing the
    # default output file). Fail early with a clear message instead.
    if not os.path.isdir(root):
        parser.error(f"not a directory: {root}")
    # basename() is "" for a filesystem root such as "/"; fall back to the
    # path itself so the report title is never empty.
    root_name = os.path.basename(root) or root

    # Build ignore set
    ignore_set: set = set() if args.no_default_ignore else set(DEFAULT_IGNORE)
    if args.ignore:
        ignore_set.update(args.ignore)

    # Always ignore the output file itself (matched by file name, so a
    # report left over from a previous run is not counted).
    out_path = args.output or os.path.join(root, "TECHSTACK.md")
    out_name = os.path.basename(out_path)
    ignore_set.add(out_name)

    # Excluded extensions, normalised to lower case without the leading dot
    exclude_exts: set = set()
    if args.exclude_ext:
        exclude_exts = {e.lower().lstrip(".") for e in args.exclude_ext}

    print(f"Scanning: {root}")
    print(f"Ignoring: {', '.join(sorted(ignore_set))}")
    if exclude_exts:
        print(f"Excluding extensions: {', '.join(sorted(exclude_exts))}")

    stats = scan_files(root, ignore_set, exclude_exts)
    table_lines = build_table(stats, sort_by=args.sort)

    # Compose Markdown
    md_lines = [
        "## Techstack\n",
        f"Audit of **{root_name}** project files (excluding environment and cache):\n",
    ]
    md_lines.extend(table_lines)
    md_lines.append("")  # trailing newline
    content = "\n".join(md_lines)

    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(content)

    total = sum(d["count"] for d in stats.values())
    print(f"Done – {total} files across {len(stats)} types written to {out_path}")


if __name__ == "__main__":
    main()