"""
GitHub repository analysis utilities
"""

import os
import re
from datetime import datetime, timedelta
from typing import Optional

import httpx

_GITHUB_API = "https://api.github.com"
_REQUEST_TIMEOUT = 30  # seconds, applied to every GitHub request below

_ACCEPT_JSON = "application/vnd.github.v3+json"
_ACCEPT_DIFF = "application/vnd.github.v3.diff"
_ACCEPT_RAW = "application/vnd.github.raw"

# The repo segment stops at "/", "?" or "#" so that a pasted URL carrying a
# query string or fragment still resolves to the bare repository name.
_REPO_URL_RE = re.compile(r"https?://github\.com/([^/]+)/([^/?#]+)")

_TIME_RANGE_DELTAS = {
    "1week": timedelta(days=7),
    "1month": timedelta(days=30),
    "3months": timedelta(days=90),
    "6months": timedelta(days=180),
    "1year": timedelta(days=365),
}
_DEFAULT_TIME_RANGE_DELTA = timedelta(days=90)

# Top-level file names (lower-cased) that count as a README.
_README_NAMES = ("readme", "readme.md", "readme.txt")

_BREAKING_FLAGS = re.MULTILINE | re.IGNORECASE

# (compiled pattern, description) pairs that might indicate breaking changes.
# Compiled once at import time instead of on every call.
_BREAKING_PATTERNS = [
    (re.compile(pattern, _BREAKING_FLAGS), description)
    for pattern, description in (
        # Function/method signature changes
        (r"^-\s*(def|function|func|public|private|protected)\s+\w+\s*\([^)]*\)", "Function signature removed/changed"),
        # Class/interface changes
        (r"^-\s*(class|interface|struct|type)\s+\w+", "Class/interface removed/changed"),
        # Import/export changes
        (r"^-\s*(import|export|from|require)\s+", "Import/export removed/changed"),
        # Configuration changes
        (r"^-\s*['\"]?[a-zA-Z_]+['\"]?\s*[:=]", "Configuration option removed/changed"),
        # API endpoint changes
        (r"^-\s*@?(get|post|put|delete|patch|route|api)\s*\(", "API endpoint removed/changed"),
        # Deprecation notices
        (r"deprecated|breaking|removed|obsolete", "Deprecation or breaking change mentioned"),
        # Version bumps in configs
        (r"^-\s*['\"]?version['\"]?\s*[:=]\s*['\"]?\d+\.\d+", "Version number changed"),
    )
]


def _parse_repo_url(repo_url: str) -> tuple[str, str]:
    """Split a GitHub repository URL into ``(owner, repo_name)``.

    Raises:
        ValueError: If ``repo_url`` does not start with an http(s) github.com
            owner/repo path.
    """
    match = _REPO_URL_RE.match(repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")
    owner, repo = match.groups()
    # removesuffix, not rstrip: rstrip(".git") strips any trailing run of the
    # characters ".", "g", "i", "t" and would turn "widget" into "widge".
    return owner, repo.removesuffix(".git")


def _build_headers(github_token: Optional[str], accept: str = _ACCEPT_JSON) -> dict:
    """Build request headers, adding an Authorization header when a token exists.

    The GITHUB_TOKEN environment variable takes priority over ``github_token``.
    """
    token = os.environ.get("GITHUB_TOKEN") or github_token
    headers = {"Accept": accept}
    if token:
        headers["Authorization"] = f"token {token}"
    return headers


def _repo_api_url(owner: str, repo_name: str) -> str:
    """Return the REST API base URL for one repository."""
    return f"{_GITHUB_API}/repos/{owner}/{repo_name}"


def _first_line(message: str) -> str:
    """Return the first line of a commit message, capped at 100 characters."""
    return message.split("\n")[0][:100]


def _extract_last_page_url(link_header: str) -> Optional[str]:
    """Return the URL tagged ``rel="last"`` in a Link header, or None."""
    for link in link_header.split(", "):
        if 'rel="last"' in link:
            return link.split(";")[0].strip().strip("<>")
    return None


def _extract_readme_sections(diff_text: str) -> list[str]:
    """Return the per-file sections of a unified diff whose header mentions a README.

    A section starts at a ``diff --git`` line and runs until the next one.
    """
    sections = []
    current = []  # non-empty only while inside a README section
    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            # Flush before starting the next file; otherwise two consecutive
            # README files would lose the first section.
            if current:
                sections.append("\n".join(current))
            current = [line] if "readme" in line.lower() else []
        elif current:
            current.append(line)
    if current:
        sections.append("\n".join(current))
    return sections


def _most_recent_unique(commits: list[dict], limit: int) -> list[dict]:
    """Return up to ``limit`` commits, newest first, with duplicate SHAs dropped.

    Sorting is done on the ``date`` strings themselves.
    NOTE(review): assumes every date uses the same ISO 8601 UTC layout so that
    string order equals chronological order - confirm against the API payload.
    """
    seen = set()
    unique = []
    for commit in sorted(commits, key=lambda c: c.get("date", ""), reverse=True):
        if commit["sha"] in seen:
            continue
        seen.add(commit["sha"])
        unique.append(commit)
    return unique[:limit]


def get_since_iso(time_range: str) -> str:
    """Compute ISO since date for a given window.

    Args:
        time_range: One of "1week", "1month", "3months", "6months", "1year".
            Any other value falls back to 90 days.

    Returns:
        ``datetime.now()`` minus the window, formatted with ``isoformat()``.
    """
    # NOTE(review): datetime.now() is naive local time; if the consumer treats
    # the string as UTC the window shifts by the local offset - confirm.
    delta = _TIME_RANGE_DELTAS.get(time_range, _DEFAULT_TIME_RANGE_DELTA)
    return (datetime.now() - delta).isoformat()


async def fetch_github_repo(repo_url: str, github_token: Optional[str] = None) -> dict:
    """Fetch repository metadata from GitHub.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: URL of the form ``https://github.com/<owner>/<repo>``; a
            trailing ``.git`` is removed.
        github_token: Optional token used when GITHUB_TOKEN is not set.

    Returns:
        A dict with keys ``full_name``, ``owner``, ``name``, ``description``,
        ``languages``, ``primaryLanguage``, ``stars``, ``forks``, ``readme``,
        ``structure`` and ``url``.

    Raises:
        ValueError: If the URL is not a GitHub repository URL, or the
            repository metadata request does not return HTTP 200.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token)
    api_url = _repo_api_url(owner, repo_name)

    async with httpx.AsyncClient() as client:
        # Fetch repository metadata
        repo_response = await client.get(
            api_url, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        if repo_response.status_code != 200:
            raise ValueError(f"GitHub API error: {repo_response.text}")
        repo_data = repo_response.json()

        # Fetch languages
        languages_response = await client.get(
            repo_data["languages_url"], headers=headers, timeout=_REQUEST_TIMEOUT
        )
        languages = (
            languages_response.json() if languages_response.status_code == 200 else {}
        )

        # Fetch README; best-effort, a failure leaves it empty
        readme = ""
        try:
            readme_response = await client.get(
                f"{api_url}/readme",
                headers={**headers, "Accept": _ACCEPT_RAW},
                timeout=_REQUEST_TIMEOUT,
            )
            if readme_response.status_code == 200:
                readme = readme_response.text
        except Exception as e:
            print(f"README fetch failed: {e}")

        # Fetch repository tree
        tree_response = await client.get(
            f"{api_url}/git/trees/{repo_data['default_branch']}?recursive=1",
            headers=headers,
            timeout=_REQUEST_TIMEOUT,
        )
        tree_data = (
            tree_response.json() if tree_response.status_code == 200 else {"tree": []}
        )

    structure = analyze_repo_structure(tree_data.get("tree", []))

    return {
        "full_name": repo_data["full_name"],
        "owner": owner,
        "name": repo_name,
        "description": repo_data.get("description", "") or "",
        "languages": languages,
        "primaryLanguage": repo_data.get("language", "Unknown") or "Unknown",
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "readme": readme,
        "structure": structure,
        "url": repo_url,
    }


async def fetch_recent_commits(
    repo_url: str,
    limit: int = 10,
    since_iso: Optional[str] = None,
    until_iso: Optional[str] = None,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Fetch recent commits from a GitHub repository.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Args:
        repo_url: GitHub repository URL.
        limit: Sent as the ``per_page`` query parameter.
        since_iso: Optional ``since`` query parameter.
        until_iso: Optional ``until`` query parameter.
        github_token: Optional token used when GITHUB_TOKEN is not set.

    Returns:
        A list of dicts with keys ``sha``, ``message`` (first line, at most
        100 characters), ``author``, ``date`` and ``url``. Returns an empty
        list on a non-200 response or on any exception during the request.

    Raises:
        ValueError: If the URL is not a GitHub repository URL.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token)

    try:
        params = {"per_page": str(limit)}
        if since_iso:
            params["since"] = since_iso
        if until_iso:
            params["until"] = until_iso

        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{_repo_api_url(owner, repo_name)}/commits",
                headers=headers,
                params=params,
                timeout=_REQUEST_TIMEOUT,
            )

        if response.status_code != 200:
            return []

        commits = []
        for c in response.json():
            # "commit" / "author" may be missing or null; treat both as empty
            details = c.get("commit") or {}
            author = details.get("author") or {}
            commits.append({
                "sha": c["sha"],
                "message": _first_line(details.get("message") or ""),
                "author": author.get("name", "unknown"),
                "date": author.get("date", ""),
                "url": c.get("html_url", ""),
            })
        return commits
    except Exception as e:
        print(f"Recent commits unavailable: {e}")
        return []


def analyze_repo_structure(files: list) -> dict:
    """Analyze repository structure from file list.

    Args:
        files: Tree entries, each a dict that may carry ``path`` and ``type``.

    Returns:
        A dict with ``hasReadme``, ``hasDocs``, ``hasExamples`` (booleans) and
        ``keyDirectories`` (sorted names of top-level ``tree`` entries).
    """
    paths = [f.get("path", "").lower() for f in files]

    has_readme = any(p in _README_NAMES for p in paths)
    has_docs = any(p.startswith(("docs/", "documentation/")) for p in paths)
    has_examples = any(p.startswith(("examples/", "example/")) for p in paths)

    # Key directories are top-level only: a "tree" entry whose path has no "/"
    directories = {
        f.get("path", "")
        for f in files
        if f.get("type") == "tree" and "/" not in f.get("path", "")
    }

    return {
        "hasReadme": has_readme,
        "hasDocs": has_docs,
        "hasExamples": has_examples,
        "keyDirectories": sorted(directories),
    }


async def fetch_commit_diff(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> dict:
    """Fetch the diff for a specific commit.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Returns:
        ``{"sha", "diff"}`` on success, with the diff cut to 10000 characters.
        On a non-200 response or an exception, ``diff`` is empty and an
        ``error`` key describes the failure.

    Raises:
        ValueError: If the URL is not a GitHub repository URL.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token, accept=_ACCEPT_DIFF)

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{_repo_api_url(owner, repo_name)}/commits/{commit_sha}",
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )

        if response.status_code != 200:
            return {"sha": commit_sha, "diff": "", "error": "Failed to fetch diff"}

        return {
            "sha": commit_sha,
            "diff": response.text[:10000],  # Limit diff size
        }
    except Exception as e:
        print(f"Commit diff fetch failed: {e}")
        return {"sha": commit_sha, "diff": "", "error": str(e)}


async def fetch_commits_with_diffs(
    repo_url: str,
    commits: list[dict],
    max_commits: int = 5,
    github_token: Optional[str] = None,
) -> list[dict]:
    """Fetch diffs for multiple commits, focusing on important changes.

    Only the first ``max_commits`` entries of ``commits`` are processed, one
    request at a time. Each returned dict is the input commit plus a ``diff``
    key (empty when the diff could not be fetched).
    """
    diffs = []
    for commit in commits[:max_commits]:
        diff_data = await fetch_commit_diff(repo_url, commit["sha"], github_token)
        diffs.append({**commit, "diff": diff_data.get("diff", "")})
    return diffs


async def fetch_readme_at_commit(
    repo_url: str,
    commit_sha: str,
    github_token: Optional[str] = None,
) -> str:
    """Fetch README content at a specific commit.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Returns:
        The body of the first candidate file that answers HTTP 200, or an
        empty string when none does.

    Raises:
        ValueError: If the URL is not a GitHub repository URL.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token, accept=_ACCEPT_RAW)
    contents_url = f"{_repo_api_url(owner, repo_name)}/contents"

    # Try common README filenames
    readme_files = ["README.md", "readme.md", "README.txt", "README"]

    async with httpx.AsyncClient() as client:
        for readme_file in readme_files:
            try:
                response = await client.get(
                    f"{contents_url}/{readme_file}",
                    headers=headers,
                    params={"ref": commit_sha},
                    timeout=_REQUEST_TIMEOUT,
                )
            except Exception as e:
                # Best-effort: report the failure and try the next candidate
                print(f"README fetch failed for {readme_file} at {commit_sha}: {e}")
                continue
            if response.status_code == 200:
                return response.text

    return ""


async def fetch_first_commit_sha(
    repo_url: str,
    github_token: Optional[str] = None,
) -> Optional[str]:
    """Fetch the SHA of the first (oldest) commit in the repository.

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Returns:
        The SHA of the last commit on the last results page for the default
        branch. If that page cannot be resolved, the last commit of a single
        100-commit page is returned instead. None on any failure.

    Raises:
        ValueError: If the URL is not a GitHub repository URL.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token)
    api_url = _repo_api_url(owner, repo_name)
    commits_url = f"{api_url}/commits"

    try:
        async with httpx.AsyncClient() as client:
            # Get the default branch
            repo_response = await client.get(
                api_url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
            if repo_response.status_code != 200:
                return None
            default_branch = repo_response.json().get("default_branch", "main")

            # One commit per page, so the "last" page link points at the
            # oldest commit.
            response = await client.get(
                commits_url,
                headers=headers,
                params={"sha": default_branch, "per_page": "1"},
                timeout=_REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                return None

            last_page_url = _extract_last_page_url(response.headers.get("Link", ""))
            if last_page_url:
                last_response = await client.get(
                    last_page_url, headers=headers, timeout=_REQUEST_TIMEOUT
                )
                if last_response.status_code == 200:
                    commits = last_response.json()
                    if commits:
                        return commits[-1]["sha"]

            # No pagination (or the last page failed): take the oldest commit
            # of a single 100-commit page.
            response = await client.get(
                commits_url,
                headers=headers,
                params={"sha": default_branch, "per_page": "100"},
                timeout=_REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                commits = response.json()
                if commits:
                    return commits[-1]["sha"]

            return None
    except Exception as e:
        print(f"First commit fetch failed: {e}")
        return None


async def fetch_readme_changes(
    repo_url: str,
    since_iso: str,
    github_token: Optional[str] = None,
) -> dict:
    """
    Fetch README changes: initial README and diff within the time period.

    Returns both the original README (for project understanding) and
    any README changes during the analysis period (highest priority for DevRel).

    Priority for GitHub token:
    1. Environment variable GITHUB_TOKEN (for Hugging Face Spaces secrets)
    2. User-provided github_token parameter (for manual override)

    Returns:
        A dict with ``initial_readme``, ``readme_diff`` (README-only diff
        sections of up to three commits, cut to 8000 characters),
        ``has_readme_changes`` and ``readme_commits``. If an exception occurs,
        whatever was collected up to that point is returned.

    Raises:
        ValueError: If the URL is not a GitHub repository URL.
    """
    owner, repo_name = _parse_repo_url(repo_url)
    headers = _build_headers(github_token)
    api_url = _repo_api_url(owner, repo_name)

    result = {
        "initial_readme": "",
        "readme_diff": "",
        "has_readme_changes": False,
        "readme_commits": [],
    }

    try:
        async with httpx.AsyncClient() as client:
            # Get commits that touched README files in the time period
            readme_commits = []
            for readme_file in ("README.md", "readme.md", "README.txt"):
                response = await client.get(
                    f"{api_url}/commits",
                    headers=headers,
                    params={
                        "path": readme_file,
                        "since": since_iso,
                        "per_page": "10",
                    },
                    timeout=_REQUEST_TIMEOUT,
                )
                if response.status_code != 200:
                    continue
                for c in response.json():
                    details = c.get("commit") or {}
                    author = details.get("author") or {}
                    readme_commits.append({
                        "sha": c["sha"],
                        "message": _first_line(details.get("message") or ""),
                        "date": author.get("date", ""),
                        "file": readme_file,
                    })

            result["readme_commits"] = readme_commits
            result["has_readme_changes"] = len(readme_commits) > 0

            # Get initial README (from first commit or oldest available)
            first_sha = await fetch_first_commit_sha(repo_url, github_token)
            if first_sha:
                result["initial_readme"] = await fetch_readme_at_commit(
                    repo_url, first_sha, github_token
                )

            # If there are README changes, get the diff
            if readme_commits:
                diff_headers = _build_headers(github_token, accept=_ACCEPT_DIFF)

                diffs = []
                # Limit to 3 most recent. The list above is grouped per file
                # name, so it is sorted and de-duplicated before slicing.
                for commit in _most_recent_unique(readme_commits, 3):
                    diff_response = await client.get(
                        f"{api_url}/commits/{commit['sha']}",
                        headers=diff_headers,
                        timeout=_REQUEST_TIMEOUT,
                    )
                    if diff_response.status_code != 200:
                        continue
                    # Filter to only include README changes
                    readme_sections = _extract_readme_sections(diff_response.text)
                    if readme_sections:
                        diffs.append(
                            f"# Commit: {commit['message']}\n" + "\n".join(readme_sections)
                        )

                result["readme_diff"] = "\n\n---\n\n".join(diffs)[:8000]  # Limit size

    except Exception as e:
        print(f"README changes fetch failed: {e}")

    return result


async def analyze_breaking_changes(
    commits_with_diffs: list[dict],
) -> list[dict]:
    """
    Analyze commit diffs to identify potential breaking changes.

    Each diff is scanned with a fixed set of case-insensitive, multi-line
    regular expressions. These are heuristics: a match only suggests a
    breaking change.

    Args:
        commits_with_diffs: Commit dicts; ``diff``, ``sha``, ``message`` and
            ``date`` are read when present. Commits with an empty diff are
            skipped.

    Returns:
        One dict per commit with at least one match, holding ``sha`` (first 7
        characters), ``message``, ``date`` and ``changes`` - a list of
        ``{"type", "count"}`` entries, one per matching pattern.
    """
    breaking_changes = []

    for commit in commits_with_diffs:
        diff = commit.get("diff", "")
        if not diff:
            continue

        commit_breaking_changes = []
        for pattern, description in _BREAKING_PATTERNS:
            matches = pattern.findall(diff)
            if matches:
                commit_breaking_changes.append({
                    "type": description,
                    "count": len(matches),
                })

        if commit_breaking_changes:
            breaking_changes.append({
                "sha": commit.get("sha", "")[:7],
                "message": commit.get("message", ""),
                "date": commit.get("date", ""),
                "changes": commit_breaking_changes,
            })

    return breaking_changes