from typing import List, Dict, Any, Optional
import re
from datetime import datetime, timedelta

import httpx
from bs4 import BeautifulSoup

from src.logger import logger

class HuggingFaceDailyPapers:
    """Class for crawling and parsing Hugging Face daily papers"""

    def __init__(self):
        self.base_url = "https://huggingface.co/papers/date"
        self.timeout = 20

    def extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract an arXiv ID from a URL"""
        if not url:
            return None
        # Matches e.g. https://huggingface.co/papers/2508.10711
        m = re.search(r"huggingface\.co/papers/(\d{4,5}\.\d+)(v\d+)?", url)
        if m:
            return m.group(1)
        return None
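
    # Illustrative example (assumption, not part of the original module):
    #   extract_arxiv_id("https://huggingface.co/papers/2508.10711") -> "2508.10711"
    #   extract_arxiv_id("https://example.com/not-a-paper") -> None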

    def extract_json_data(self, html: str) -> Dict[str, Any]:
        """Extract GitHub star counts and other metadata from the HTML page."""
        try:
            soup = BeautifulSoup(html, "lxml")
            # GitHub star counts are rendered next to SVG icons on the page,
            # so look for SVG elements that might represent the GitHub mark.
            svg_elements = soup.find_all("svg")
            github_stars_map = {}
            for svg in svg_elements:
                # GitHub-related SVGs usually have a specific viewBox or path
                svg_html = str(svg)
                if "github" in svg_html.lower() or "256 250" in svg_html:  # GitHub icon viewBox
                    # Look for the star count near this SVG
                    parent = svg.parent
                    if parent:
                        # Numbers in the surrounding text are candidate star counts
                        text_content = parent.get_text()
                        numbers = re.findall(r'\b(\d+)\b', text_content)
                        if numbers:
                            # The number nearest a GitHub SVG is likely the star count
                            star_count = int(numbers[0])
                            # Associate the count with a paper by finding the
                            # closest article (card) container and its title
                            article = svg.find_parent("article")
                            if article:
                                title_elem = article.find("h3")
                                if title_elem:
                                    paper_title = title_elem.get_text(strip=True)
                                    github_stars_map[paper_title] = star_count
            # Also look for any elements with GitHub-related text
            github_text_elements = soup.find_all(string=lambda text: text and "github" in text.lower())
            for text_elem in github_text_elements:
                parent = text_elem.parent
                if parent:
                    text_content = parent.get_text()
                    numbers = re.findall(r'\b(\d+)\b', text_content)
                    if numbers:
                        star_count = int(numbers[0])
                        # Try to find the paper title via the enclosing article
                        article = parent.find_parent("article")
                        if article:
                            title_elem = article.find("h3")
                            if title_elem:
                                paper_title = title_elem.get_text(strip=True)
                                if paper_title not in github_stars_map:
                                    github_stars_map[paper_title] = star_count
            return {"github_stars_map": github_stars_map}
        except Exception as e:
            logger.error(f"Error extracting JSON data: {e}")
            return {"github_stars_map": {}}

    async def fetch_daily_html(self, target_date: str) -> tuple[str, str]:
        """Fetch daily papers HTML, with a fallback to the latest available date"""
        async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=False) as client:
            # First try the requested date
            url = f"{self.base_url}/{target_date}"
            try:
                r = await client.get(url)
                # Check whether we got redirected
                if r.status_code in [301, 302, 303, 307, 308]:
                    # We got redirected; extract the actual date from the redirect location
                    location = r.headers.get('location', '')
                    logger.info(f"Got redirect to: {location}")
                    # Extract the date from the redirect URL (e.g., /papers/date/2025-08-08)
                    date_match = re.search(r'/papers/date/(\d{4}-\d{2}-\d{2})', location)
                    if date_match:
                        actual_date = date_match.group(1)
                        logger.info(f"Redirected from {target_date} to {actual_date}")
                        # Fetch the actual page
                        actual_url = f"https://huggingface.co{location}"
                        r = await client.get(actual_url)
                        if r.status_code == 200:
                            return actual_date, r.text
                        else:
                            raise Exception(f"Failed to fetch redirected page: {r.status_code}")
                    else:
                        # Couldn't extract a date from the redirect, use the fallback
                        raise Exception("Could not extract date from redirect")
                elif r.status_code == 200:
                    # Direct success; check that the page actually contains the requested date
                    if target_date in r.text or "Daily Papers" in r.text:
                        return target_date, r.text
                    else:
                        raise Exception("Page exists but doesn't contain expected content")
                else:
                    # Other error status
                    raise Exception(f"Status code {r.status_code}")
            except Exception as e:
                logger.error(f"Failed to fetch {target_date}: {e}")
                # If the requested date fails, try to find the latest available date
                actual_date, html = await self.find_latest_available_date(client)
                return actual_date, html

    async def find_latest_available_date(self, client: httpx.AsyncClient) -> tuple[str, str]:
        """Find the latest available date by checking recent dates"""
        # Start from today and go backwards up to 30 days
        today = datetime.now()
        for i in range(30):
            check_date = today - timedelta(days=i)
            date_str = check_date.strftime("%Y-%m-%d")
            url = f"{self.base_url}/{date_str}"
            try:
                r = await client.get(url)
                if r.status_code == 200:
                    # Check that the page actually has content (not just a 404 or an empty page)
                    if "Daily Papers" in r.text and len(r.text) > 1000:
                        logger.info(f"Found latest available date: {date_str}")
                        return date_str, r.text
            except Exception:
                continue
        # If no date was found in the last 30 days, raise an error
        raise Exception("No available daily papers found in the last 30 days")

    def parse_daily_cards(self, html: str) -> List[Dict[str, Any]]:
        """Parse the daily papers HTML and extract paper cards"""
        soup = BeautifulSoup(html, "lxml")
        # First, extract page-level data (e.g., GitHub stars) from the HTML
        json_data = self.extract_json_data(html)
        # Find all article elements that contain paper cards
        cards: List[Dict[str, Any]] = []
        # Look for article elements with the specific class structure used by Hugging Face
        for article in soup.select("article.relative.flex.flex-col.overflow-hidden.rounded-xl.border"):
            try:
                card_data = {}
                # Extract title and link
                title_link = article.select_one("h3 a")
                if title_link:
                    card_data["title"] = title_link.get_text(strip=True)
                    href = title_link.get("href")
                    if href:
                        if href.startswith("http"):
                            card_data["huggingface_url"] = href
                        else:
                            card_data["huggingface_url"] = f"https://huggingface.co{href}"
                # Extract upvote count
                upvote_div = article.select_one("div.shadow-alternate div.leading-none")
                if upvote_div:
                    upvote_text = upvote_div.get_text(strip=True)
                    try:
                        card_data["upvotes"] = int(upvote_text)
                    except ValueError:
                        card_data["upvotes"] = 0
                # Extract author count from text such as "· 10 authors"
                author_count_div = article.select_one("div.flex.truncate.text-sm")
                if author_count_div:
                    author_text = author_count_div.get_text(strip=True)
                    author_match = re.search(r'(\d+)\s*authors?', author_text)
                    if author_match:
                        card_data["author_count"] = int(author_match.group(1))
                    else:
                        card_data["author_count"] = 0
                # GitHub stars default to 0; filled in below from the page-level data
                card_data["github_stars"] = 0
                # Extract comment count - look for links to the community tab
                comment_links = article.select("a[href*='#community']")
                for comment_link in comment_links:
                    comment_text = comment_link.get_text(strip=True)
                    try:
                        card_data["comments"] = int(comment_text)
                        break
                    except ValueError:
                        continue
                # Extract submitter information
                submitted_div = article.select_one("div.shadow-xs")
                if submitted_div:
                    submitter_text = submitted_div.get_text(strip=True)
                    # Extract the submitter name from "Submitted byLiang0223" (no space)
                    submitter_match = re.search(r'Submitted by(\S+)', submitter_text)
                    if submitter_match:
                        card_data["submitter"] = submitter_match.group(1)
                # Extract the arXiv ID from the URL
                if card_data.get("huggingface_url"):
                    arxiv_id = self.extract_arxiv_id(card_data["huggingface_url"])
                    if arxiv_id:
                        card_data["arxiv_id"] = arxiv_id
                # Look up GitHub stars by matching the paper title
                paper_title = card_data.get("title", "")
                if paper_title in json_data.get("github_stars_map", {}):
                    card_data["github_stars"] = json_data["github_stars_map"][paper_title]
                # Only add cards that have at least a title
                if card_data.get("title"):
                    cards.append(card_data)
            except Exception as e:
                logger.error(f"Error parsing card: {e}")
                continue
        # If the selector above didn't match anything, fall back to the older h3-based method
        if not cards:
            logger.info("Falling back to old parsing method")
            for h3 in soup.select("h3"):
                # Title and Hugging Face paper link (if present)
                a = h3.find("a")
                title = h3.get_text(strip=True)
                hf_link = None
                if a and a.get("href"):
                    href = a.get("href")
                    # Build an absolute URL to huggingface.co if needed
                    if href.startswith("http"):
                        hf_link = href
                    else:
                        hf_link = f"https://huggingface.co{href}"
                # Try to capture sibling info (authors, votes, etc.) as a small snippet
                meta_text = None
                parent = h3.parent
                if parent:
                    # Join the immediate text content following the h3
                    collected: List[str] = []
                    for sib in parent.find_all(string=True, recursive=False):
                        t = (sib or "").strip()
                        if t:
                            collected.append(t)
                    if collected:
                        meta_text = " ".join(collected)
                # Try to discover any arXiv link inside nearby anchors
                arxiv_id: Optional[str] = None
                container = parent if parent else h3
                for link in container.find_all("a", href=True):
                    possible = self.extract_arxiv_id(link["href"])
                    if possible:
                        arxiv_id = possible
                        break
                cards.append(
                    {
                        "title": title,
                        "huggingface_url": hf_link,
                        "meta": meta_text,
                        "arxiv_id": arxiv_id,
                    }
                )
        # Deduplicate by title
        seen = set()
        unique_cards: List[Dict[str, Any]] = []
        for c in cards:
            key = c.get("title") or ""
            if key and key not in seen:
                seen.add(key)
                unique_cards.append(c)
        logger.info(f"Parsed {len(unique_cards)} cards")
        return unique_cards

    async def get_daily_papers(self, target_date: str) -> tuple[str, List[Dict[str, Any]]]:
        """Get the daily papers for a specific date"""
        date_str, html = await self.fetch_daily_html(target_date)
        cards = self.parse_daily_cards(html)
        return date_str, cards
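
# Minimal usage sketch (an assumption, not part of the original module): shows how
# the crawler might be driven from a script, assuming this file is importable and
# src.logger is configured. The date string below is only an example; the crawler
# falls back to the latest available date if that page redirects or is missing.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = HuggingFaceDailyPapers()
        date_str, cards = await crawler.get_daily_papers("2025-08-08")
        print(f"{len(cards)} papers for {date_str}")
        for card in cards[:3]:
            print(card.get("title"), card.get("arxiv_id"), card.get("upvotes"))

    asyncio.run(_demo())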