""" Repository Tech-Stack Scanner =============================== Scans a repository and produces a Markdown table of file types with counts and total sizes in KB. Supports every file extension in existence. Usage: python scan_techstack.py [path] [--output FILE] [--ignore PATTERN ...] [--exclude-ext EXT ...] [--sort count|size|ext] Output: A Markdown file containing the tech-stack audit table. """ import os import sys import argparse from pathlib import Path from collections import defaultdict # --------------------------------------------------------------------------- # Comprehensive file-extension → human-readable label mapping # Covers programming, data, document, media, archive, config, and more. # Any extension NOT in this dict will be auto-labelled from the extension. # --------------------------------------------------------------------------- EXT_LABELS: dict[str, str] = { # ── Programming Languages ────────────────────────────────────────── ".py": "Python", ".pyw": "Python (windowed)", ".pyx": "Cython", ".pxd": "Cython Definition", ".pyi": "Python Stub", ".pyc": "Python Compiled", ".pyo": "Python Optimized", ".js": "JavaScript", ".mjs": "ES Module JavaScript", ".cjs": "CommonJS JavaScript", ".jsx": "JSX (React)", ".ts": "TypeScript", ".tsx": "TSX (React)", ".java": "Java", ".class": "Java Class", ".jar": "Java Archive", ".kt": "Kotlin", ".kts": "Kotlin Script", ".scala": "Scala", ".groovy": "Groovy", ".c": "C", ".h": "C Header", ".cpp": "C++", ".cxx": "C++", ".cc": "C++", ".c++": "C++", ".hpp": "C++ Header", ".hxx": "C++ Header", ".hh": "C++ Header", ".h++": "C++ Header", ".cs": "C#", ".csx": "C# Script", ".fs": "F#", ".fsx": "F# Script", ".fsi": "F# Signature", ".vb": "Visual Basic", ".vbs": "VBScript", ".go": "Go", ".rs": "Rust", ".rb": "Ruby", ".erb": "Embedded Ruby", ".php": "PHP", ".pl": "Perl", ".pm": "Perl Module", ".lua": "Lua", ".r": "R", ".rmd": "R Markdown", ".swift": "Swift", ".m": "Objective-C / MATLAB", ".mm": "Objective-C++", ".d": "D", ".dart": "Dart", ".ex": "Elixir", ".exs": "Elixir Script", ".erl": "Erlang", ".hrl": "Erlang Header", ".hs": "Haskell", ".lhs": "Literate Haskell", ".ml": "OCaml", ".mli": "OCaml Interface", ".clj": "Clojure", ".cljs": "ClojureScript", ".cljc": "Clojure Common", ".edn": "EDN (Clojure)", ".lisp": "Common Lisp", ".cl": "Common Lisp", ".el": "Emacs Lisp", ".scm": "Scheme", ".rkt": "Racket", ".nim": "Nim", ".zig": "Zig", ".v": "V / Verilog", ".vhd": "VHDL", ".vhdl": "VHDL", ".sv": "SystemVerilog", ".asm": "Assembly", ".s": "Assembly", ".f": "Fortran", ".f90": "Fortran 90", ".f95": "Fortran 95", ".f03": "Fortran 2003", ".f08": "Fortran 2008", ".for": "Fortran", ".cob": "COBOL", ".cbl": "COBOL", ".pas": "Pascal", ".pp": "Pascal", ".ada": "Ada", ".adb": "Ada Body", ".ads": "Ada Spec", ".tcl": "Tcl", ".tk": "Tcl/Tk", ".awk": "AWK", ".sed": "sed", ".jl": "Julia", ".cr": "Crystal", ".hack": "Hack", ".hx": "Haxe", ".hxml": "Haxe XML", ".purs": "PureScript", ".idr": "Idris", ".agda": "Agda", ".lean": "Lean", ".coq": "Coq", ".thy": "Isabelle", ".pro": "Prolog", ".sml": "Standard ML", ".fig": "MATLAB Figure", ".mat": "MATLAB Data", # ── Shell / Scripting ────────────────────────────────────────────── ".sh": "Shell Script", ".bash": "Bash Script", ".zsh": "Zsh Script", ".fish": "Fish Script", ".bat": "Batch File", ".cmd": "Windows Command", ".ps1": "PowerShell", ".psm1": "PowerShell Module", ".psd1": "PowerShell Data", # ── Web / Markup / Templating ────────────────────────────────────── ".html": "HTML", ".htm": "HTML", ".xhtml": "XHTML", ".css": "CSS", ".scss": "SCSS", ".sass": "Sass", ".less": "Less", ".styl": "Stylus", ".vue": "Vue", ".svelte": "Svelte", ".astro": "Astro", ".ejs": "EJS Template", ".pug": "Pug Template", ".jade": "Jade Template", ".hbs": "Handlebars", ".mustache": "Mustache", ".twig": "Twig Template", ".jinja": "Jinja Template", ".jinja2": "Jinja2 Template", ".j2": "Jinja2 Template", ".njk": "Nunjucks Template", ".liquid": "Liquid Template", ".haml": "HAML", ".slim": "Slim Template", ".blade.php":"Blade Template", ".jsp": "JavaServer Pages", ".asp": "ASP", ".aspx": "ASP.NET", ".cshtml": "Razor (C#)", ".razor": "Razor", ".wasm": "WebAssembly", # ── Data / Config / Serialisation ────────────────────────────────── ".json": "JSON", ".jsonl": "JSON Lines", ".json5": "JSON5", ".geojson": "GeoJSON", ".xml": "XML", ".xsl": "XSLT", ".xslt": "XSLT", ".xsd": "XML Schema", ".dtd": "Document Type Definition", ".yaml": "YAML", ".yml": "YAML", ".toml": "TOML", ".ini": "INI Config", ".cfg": "Config", ".conf": "Config", ".env": "Environment File", ".properties": "Properties File", ".csv": "CSV", ".tsv": "TSV", ".parquet": "Apache Parquet", ".avro": "Apache Avro", ".orc": "Apache ORC", ".feather": "Feather (Arrow)", ".arrow": "Apache Arrow", ".msgpack": "MessagePack", ".bson": "BSON", ".cbor": "CBOR", ".protobuf": "Protocol Buffer", ".proto": "Protocol Buffer Schema", ".thrift": "Apache Thrift", ".hcl": "HCL (Terraform)", ".tf": "Terraform", ".tfvars": "Terraform Variables", ".pkl": "Pickle", ".pickle": "Pickle", ".npy": "NumPy Array", ".npz": "NumPy Archive", ".h5": "HDF5", ".hdf5": "HDF5", ".nc": "NetCDF", ".sqlite": "SQLite Database", ".sqlite3": "SQLite Database", ".db": "Database", ".dbf": "dBASE", ".mdb": "Access Database", ".accdb": "Access Database", ".lmdb": "LMDB", ".leveldb": "LevelDB", # ── Documents & Office ───────────────────────────────────────────── ".md": "Markdown", ".mdx": "MDX", ".txt": "Plain Text", ".text": "Plain Text", ".log": "Log File", ".rst": "reStructuredText", ".adoc": "AsciiDoc", ".org": "Org Mode", ".tex": "LaTeX", ".latex": "LaTeX", ".bib": "BibTeX", ".rtf": "Rich Text Format", ".doc": "Microsoft Word", ".docx": "Microsoft Word (OOXML)", ".odt": "OpenDocument Text", ".pdf": "PDF", ".xls": "Microsoft Excel", ".xlsx": "Microsoft Excel (OOXML)", ".xlsm": "Excel Macro-Enabled", ".xlsb": "Excel Binary", ".ods": "OpenDocument Spreadsheet", ".ppt": "Microsoft PowerPoint", ".pptx": "Microsoft PowerPoint (OOXML)", ".odp": "OpenDocument Presentation", ".pages": "Apple Pages", ".numbers": "Apple Numbers", ".key": "Apple Keynote", ".epub": "EPUB", ".mobi": "Mobipocket", ".djvu": "DjVu", ".chm": "Compiled HTML Help", ".man": "Man Page", ".info": "GNU Info", # ── Notebooks & Data Science ─────────────────────────────────────── ".ipynb": "Jupyter Notebook", ".rmd": "R Markdown", ".qmd": "Quarto Document", ".dvc": "DVC File", ".onnx": "ONNX Model", ".pt": "PyTorch Model", ".pth": "PyTorch Checkpoint", ".safetensors": "SafeTensors Model", ".gguf": "GGUF Model", ".bin": "Binary File", # ── Images ───────────────────────────────────────────────────────── ".png": "PNG Image", ".jpg": "JPEG Image", ".jpeg": "JPEG Image", ".gif": "GIF Image", ".bmp": "Bitmap Image", ".tiff": "TIFF Image", ".tif": "TIFF Image", ".webp": "WebP Image", ".avif": "AVIF Image", ".heic": "HEIC Image", ".heif": "HEIF Image", ".svg": "SVG Image", ".ico": "Icon", ".icns": "Apple Icon", ".cur": "Cursor", ".psd": "Photoshop", ".ai": "Adobe Illustrator", ".sketch": "Sketch", ".fig": "Figma", ".xd": "Adobe XD", ".eps": "EPS Vector", ".raw": "RAW Image", ".cr2": "Canon RAW", ".nef": "Nikon RAW", ".arw": "Sony RAW", ".dng": "DNG RAW", ".exr": "OpenEXR", ".hdr": "HDR Image", ".pbm": "PBM Image", ".pgm": "PGM Image", ".ppm": "PPM Image", ".pcx": "PCX Image", ".tga": "TGA Image", # ── Audio ────────────────────────────────────────────────────────── ".mp3": "MP3 Audio", ".wav": "WAV Audio", ".flac": "FLAC Audio", ".aac": "AAC Audio", ".ogg": "Ogg Audio", ".wma": "WMA Audio", ".m4a": "M4A Audio", ".opus": "Opus Audio", ".aiff": "AIFF Audio", ".mid": "MIDI", ".midi": "MIDI", ".amr": "AMR Audio", ".ape": "APE Audio", ".alac": "ALAC Audio", # ── Video ────────────────────────────────────────────────────────── ".mp4": "MP4 Video", ".mkv": "MKV Video", ".avi": "AVI Video", ".mov": "QuickTime Video", ".wmv": "WMV Video", ".flv": "Flash Video", ".webm": "WebM Video", ".m4v": "M4V Video", ".mpg": "MPEG Video", ".mpeg": "MPEG Video", ".3gp": "3GP Video", ".ogv": "Ogg Video", ".ts": "MPEG-TS", ".vob": "DVD Video", ".rm": "RealMedia", # ── Fonts ────────────────────────────────────────────────────────── ".ttf": "TrueType Font", ".otf": "OpenType Font", ".woff": "WOFF Font", ".woff2": "WOFF2 Font", ".eot": "EOT Font", ".fon": "Bitmap Font", # ── Archives / Compression ───────────────────────────────────────── ".zip": "ZIP Archive", ".gz": "Gzip Archive", ".gzip": "Gzip Archive", ".bz2": "Bzip2 Archive", ".xz": "XZ Archive", ".lz": "Lzip Archive", ".lzma": "LZMA Archive", ".zst": "Zstandard Archive", ".tar": "Tar Archive", ".tgz": "Tar+Gzip Archive", ".tbz2": "Tar+Bzip2 Archive", ".txz": "Tar+XZ Archive", ".rar": "RAR Archive", ".7z": "7-Zip Archive", ".cab": "Cabinet Archive", ".iso": "ISO Disk Image", ".dmg": "macOS Disk Image", ".img": "Disk Image", ".vhd": "Virtual Hard Disk", ".vhdx": "Virtual Hard Disk (VHDX)", ".vmdk": "VMware Disk", ".qcow2": "QEMU Disk", ".deb": "Debian Package", ".rpm": "RPM Package", ".apk": "Android Package", ".ipa": "iOS App", ".snap": "Snap Package", ".flatpak": "Flatpak Package", ".appimage": "AppImage", ".msi": "MSI Installer", ".exe": "Windows Executable", ".dll": "DLL Library", ".so": "Shared Object", ".dylib": "macOS Dynamic Library", ".a": "Static Library", ".lib": "Library", ".o": "Object File", ".obj": "Object File", ".sys": "System File", ".drv": "Driver", # ── DevOps / CI / Containerisation ───────────────────────────────── ".dockerfile":"Dockerfile", ".vagrantfile":"Vagrantfile", ".jenkinsfile":"Jenkinsfile", # ── Trading / Domain-specific ────────────────────────────────────── ".mq5": "MQL5 Source", ".mq4": "MQL4 Source", ".mqh": "MQL Header", ".ex5": "MQL5 Compiled", ".ex4": "MQL4 Compiled", ".set": "MT Parameter Set", ".pine": "Pine Script", ".tpl": "MT Template", ".chr": "MT Chart", ".mql": "MQL Source", # ── Misc / Build / Project ───────────────────────────────────────── ".makefile": "Makefile", ".cmake": "CMake", ".gradle": "Gradle", ".sbt": "SBT Build", ".gemspec": "Ruby Gem Spec", ".podspec": "CocoaPods Spec", ".cabal": "Cabal (Haskell)", ".csproj": "C# Project", ".fsproj": "F# Project", ".vbproj": "VB.NET Project", ".sln": "Visual Studio Solution", ".xcodeproj":"Xcode Project", ".pbxproj": "Xcode Project (PBX)", ".xcworkspace":"Xcode Workspace", ".plist": "Property List", ".lnk": "Windows Shortcut", ".url": "URL Shortcut", ".desktop": "Desktop Entry", ".reg": "Windows Registry", ".service": "Systemd Service", ".timer": "Systemd Timer", ".socket": "Systemd Socket", ".lock": "Lock File", ".patch": "Patch File", ".diff": "Diff File", ".map": "Source Map", ".whl": "Python Wheel", ".egg": "Python Egg", ".gem": "Ruby Gem", ".nupkg": "NuGet Package", ".crate": "Rust Crate", # ── SQL / Database ───────────────────────────────────────────────── ".sql": "SQL", ".mysql": "MySQL", ".pgsql": "PostgreSQL", ".plsql": "PL/SQL", # ── GraphQL / API ────────────────────────────────────────────────── ".graphql": "GraphQL", ".gql": "GraphQL", ".swagger": "Swagger", ".openapi": "OpenAPI Spec", # ── 3D / CAD / GIS ───────────────────────────────────────────────── ".stl": "STL 3D Model", ".obj": "Wavefront OBJ", ".fbx": "FBX 3D Model", ".gltf": "glTF 3D Model", ".glb": "glTF Binary", ".blend": "Blender File", ".3ds": "3DS Model", ".dae": "Collada", ".usd": "USD Scene", ".usda": "USD ASCII", ".usdc": "USD Binary", ".step": "STEP CAD", ".stp": "STEP CAD", ".iges": "IGES CAD", ".igs": "IGES CAD", ".dwg": "AutoCAD Drawing", ".dxf": "AutoCAD Exchange", ".shp": "Shapefile", ".kml": "KML", ".kmz": "KMZ", ".gpx": "GPX Track", # ── Certificates / Security ──────────────────────────────────────── ".pem": "PEM Certificate", ".crt": "Certificate", ".cer": "Certificate", ".csr": "Certificate Request", ".p12": "PKCS#12", ".pfx": "PKCS#12", ".asc": "PGP/GPG", ".gpg": "GPG Encrypted", ".sig": "Signature", ".pub": "Public Key", # ── Git-related ──────────────────────────────────────────────────── ".gitignore": "Git Ignore", ".gitattributes": "Git Attributes", ".gitmodules": "Git Submodules", # ── Editor / IDE Config ──────────────────────────────────────────── ".editorconfig": "EditorConfig", ".prettierrc": "Prettier Config", ".eslintrc": "ESLint Config", ".stylelintrc": "Stylelint Config", ".babelrc": "Babel Config", ".swcrc": "SWC Config", ".nvmrc": "NVM Config", ".npmrc": "NPM Config", ".yarnrc": "Yarn Config", # ── Misc ─────────────────────────────────────────────────────────── ".mw5": "MQL5 Variant", ".webmanifest": "Web App Manifest", } # --------------------------------------------------------------------------- # Default ignore patterns (common non-source dirs / files) # --------------------------------------------------------------------------- DEFAULT_IGNORE = { ".git", "__pycache__", ".venv", "venv", "env", "node_modules", ".ipynb_checkpoints", ".mypy_cache", ".pytest_cache", ".tox", ".eggs", "*.egg-info", ".DS_Store", "Thumbs.db", "desktop.ini", } def should_ignore(name: str, ignore_set: set) -> bool: """Return True if *name* matches any pattern in the ignore set.""" if name in ignore_set: return True for pattern in ignore_set: if pattern.startswith("*") and name.endswith(pattern[1:]): return True return False def get_label(ext: str) -> str: """Return the human-readable label for an extension.""" if ext in EXT_LABELS: return EXT_LABELS[ext] # Fallback: capitalise the extension name return ext.upper().lstrip(".") def scan_files(root_path: str, ignore_set: set, exclude_exts: set) -> dict: """ Walk *root_path* and collect per-extension statistics. Returns a dict keyed by normalised extension string, e.g. ".py", with values ``{"count": int, "size_bytes": int, "label": str}``. Files without an extension are grouped under "(no extension)". """ stats: dict = defaultdict(lambda: {"count": 0, "size_bytes": 0, "label": ""}) for dirpath, dirnames, filenames in os.walk(root_path): # Prune ignored directories in-place so os.walk skips them dirnames[:] = [ d for d in dirnames if not should_ignore(d, ignore_set) ] for fname in filenames: if should_ignore(fname, ignore_set): continue filepath = os.path.join(dirpath, fname) # Determine extension _, ext = os.path.splitext(fname) ext = ext.lower() # normalise if ext and ext.lstrip(".") in exclude_exts: continue # Label if ext: label = f"{get_label(ext)} ({ext})" else: ext = "(none)" label = "(no extension)" try: size = os.path.getsize(filepath) except (OSError, PermissionError): size = 0 stats[ext]["count"] += 1 stats[ext]["size_bytes"] += size stats[ext]["label"] = label return dict(stats) def format_size_kb(size_bytes: int) -> str: """Return exact size in KB, rounded to 1 decimal place.""" kb = size_bytes / 1024 if kb == 0: return "0" # Show 1 decimal place for precision, strip trailing zero if whole number formatted = f"{kb:,.1f}" return formatted def build_table(stats: dict, sort_by: str) -> list[str]: """ Build a Markdown table from *stats*. Returns a list of lines. """ # Prepare rows rows = [] total_count = 0 total_bytes = 0 for ext, data in stats.items(): rows.append( { "label": data["label"], "count": data["count"], "size_bytes": data["size_bytes"], "ext_key": ext, } ) total_count += data["count"] total_bytes += data["size_bytes"] # Sort if sort_by == "count": rows.sort(key=lambda r: (-r["count"], r["label"].lower())) elif sort_by == "size": rows.sort(key=lambda r: (-r["size_bytes"], r["label"].lower())) else: # "ext" rows.sort(key=lambda r: r["label"].lower()) # Build Markdown lines lines = [ "| File Type | Count | Size (KB) |", "| :--- | :--- | :--- |", ] for row in rows: lines.append( f"| {row['label']} | {row['count']} | {format_size_kb(row['size_bytes'])} |" ) # Totals row lines.append(f"| **Total** | **{total_count}** | **{format_size_kb(total_bytes)}** |") return lines def main() -> None: parser = argparse.ArgumentParser( description="Audit repository tech-stack (file types, counts, sizes)." ) parser.add_argument( "path", nargs="?", default=".", help="Root directory to scan (default: current directory).", ) parser.add_argument( "--output", "-o", default=None, help="Output Markdown file path (default: TECHSTACK.md in scanned dir).", ) parser.add_argument( "--ignore", nargs="*", default=None, help="Extra directory/file names to ignore.", ) parser.add_argument( "--no-default-ignore", action="store_true", help="Disable the built-in ignore list.", ) parser.add_argument( "--exclude-ext", nargs="*", default=None, help="File extensions to exclude from output (without dot, e.g. png mp4 git).", ) parser.add_argument( "--sort", choices=["count", "size", "ext"], default="count", help="Sort order for the table (default: count descending).", ) args = parser.parse_args() root = os.path.abspath(args.path) root_name = os.path.basename(root) # Build ignore set ignore_set: set = set() if args.no_default_ignore else set(DEFAULT_IGNORE) if args.ignore: ignore_set.update(args.ignore) # Always ignore the output file itself out_path = args.output or os.path.join(root, "TECHSTACK.md") out_name = os.path.basename(out_path) ignore_set.add(out_name) # Excluded extensions exclude_exts: set = set() if args.exclude_ext: exclude_exts = {e.lower().lstrip(".") for e in args.exclude_ext} print(f"Scanning: {root}") print(f"Ignoring: {', '.join(sorted(ignore_set))}") if exclude_exts: print(f"Excluding extensions: {', '.join(sorted(exclude_exts))}") stats = scan_files(root, ignore_set, exclude_exts) table_lines = build_table(stats, sort_by=args.sort) # Compose Markdown md_lines = [ f"## Techstack\n", f"Audit of **{root_name}** project files (excluding environment and cache):\n", ] md_lines.extend(table_lines) md_lines.append("") # trailing newline content = "\n".join(md_lines) with open(out_path, "w", encoding="utf-8") as fh: fh.write(content) total = sum(d["count"] for d in stats.values()) print(f"Done – {total} files across {len(stats)} types written to {out_path}") if __name__ == "__main__": main()