from typing import List, Dict, Any, Optional
import re
from datetime import datetime, timedelta

import httpx
from bs4 import BeautifulSoup

from src.logger import logger

class HuggingFaceDailyPapers:
    """Class for crawling and parsing Hugging Face daily papers"""

    def __init__(self):
        self.base_url = "https://huggingface.co/papers/date"
        self.timeout = 20

    def extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract an arXiv ID from a URL"""
        if not url:
            return None
        # Matches e.g. https://huggingface.co/papers/2508.10711
        m = re.search(r"huggingface\.co/papers/(\d{4,5}\.\d+)(v\d+)?", url)
        if m:
            return m.group(1)
        return None
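
    # Illustrative example (assumption, not part of the original module):
    #   extract_arxiv_id("https://huggingface.co/papers/2508.10711") -> "2508.10711"
    #   extract_arxiv_id("https://example.com/not-a-paper") -> None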

    def extract_json_data(self, html: str) -> Dict[str, Any]:
        """Extract GitHub star counts and other metadata from the HTML page."""
        try:
            soup = BeautifulSoup(html, "lxml")
            # GitHub star counts are rendered next to SVG icons on the page,
            # so look for SVG elements that might represent the GitHub mark.
            svg_elements = soup.find_all("svg")
            github_stars_map = {}
            for svg in svg_elements:
                # GitHub-related SVGs usually have a specific viewBox or path
                svg_html = str(svg)
                if "github" in svg_html.lower() or "256 250" in svg_html:  # GitHub icon viewBox
                    # Look for the star count near this SVG
                    parent = svg.parent
                    if parent:
                        # Numbers in the surrounding text are candidate star counts
                        text_content = parent.get_text()
                        numbers = re.findall(r'\b(\d+)\b', text_content)
                        if numbers:
                            # The number nearest a GitHub SVG is likely the star count
                            star_count = int(numbers[0])
                            # Associate the count with a paper by finding the
                            # closest article (card) container and its title
                            article = svg.find_parent("article")
                            if article:
                                title_elem = article.find("h3")
                                if title_elem:
                                    paper_title = title_elem.get_text(strip=True)
                                    github_stars_map[paper_title] = star_count
            # Also look for any elements with GitHub-related text
            github_text_elements = soup.find_all(string=lambda text: text and "github" in text.lower())
            for text_elem in github_text_elements:
                parent = text_elem.parent
                if parent:
                    text_content = parent.get_text()
                    numbers = re.findall(r'\b(\d+)\b', text_content)
                    if numbers:
                        star_count = int(numbers[0])
                        # Try to find the paper title via the enclosing article
                        article = parent.find_parent("article")
                        if article:
                            title_elem = article.find("h3")
                            if title_elem:
                                paper_title = title_elem.get_text(strip=True)
                                if paper_title not in github_stars_map:
                                    github_stars_map[paper_title] = star_count
            return {"github_stars_map": github_stars_map}
        except Exception as e:
            logger.error(f"Error extracting JSON data: {e}")
            return {"github_stars_map": {}}

    async def fetch_daily_html(self, target_date: str) -> tuple[str, str]:
        """Fetch daily papers HTML, with a fallback to the latest available date"""
        async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=False) as client:
            # First try the requested date
            url = f"{self.base_url}/{target_date}"
            try:
                r = await client.get(url)
                # Check whether we got redirected
                if r.status_code in [301, 302, 303, 307, 308]:
                    # We got redirected; extract the actual date from the redirect location
                    location = r.headers.get('location', '')
                    logger.info(f"Got redirect to: {location}")
                    # Extract the date from the redirect URL (e.g., /papers/date/2025-08-08)
                    date_match = re.search(r'/papers/date/(\d{4}-\d{2}-\d{2})', location)
                    if date_match:
                        actual_date = date_match.group(1)
                        logger.info(f"Redirected from {target_date} to {actual_date}")
                        # Fetch the actual page
                        actual_url = f"https://huggingface.co{location}"
                        r = await client.get(actual_url)
                        if r.status_code == 200:
                            return actual_date, r.text
                        else:
                            raise Exception(f"Failed to fetch redirected page: {r.status_code}")
                    else:
                        # Couldn't extract a date from the redirect, use the fallback
                        raise Exception("Could not extract date from redirect")
                elif r.status_code == 200:
                    # Direct success; check that the page actually contains the requested date
                    if target_date in r.text or "Daily Papers" in r.text:
                        return target_date, r.text
                    else:
                        raise Exception("Page exists but doesn't contain expected content")
                else:
                    # Other error status
                    raise Exception(f"Status code {r.status_code}")
            except Exception as e:
                logger.error(f"Failed to fetch {target_date}: {e}")
                # If the requested date fails, try to find the latest available date
                actual_date, html = await self.find_latest_available_date(client)
                return actual_date, html

    async def find_latest_available_date(self, client: httpx.AsyncClient) -> tuple[str, str]:
        """Find the latest available date by checking recent dates"""
        # Start from today and go backwards up to 30 days
        today = datetime.now()
        for i in range(30):
            check_date = today - timedelta(days=i)
            date_str = check_date.strftime("%Y-%m-%d")
            url = f"{self.base_url}/{date_str}"
            try:
                r = await client.get(url)
                if r.status_code == 200:
                    # Check that the page actually has content (not just a 404 or an empty page)
                    if "Daily Papers" in r.text and len(r.text) > 1000:
                        logger.info(f"Found latest available date: {date_str}")
                        return date_str, r.text
            except Exception:
                continue
        # If no date was found in the last 30 days, raise an error
        raise Exception("No available daily papers found in the last 30 days")

    def parse_daily_cards(self, html: str) -> List[Dict[str, Any]]:
        """Parse the daily papers HTML and extract paper cards"""
        soup = BeautifulSoup(html, "lxml")
        # First, extract page-level data (e.g., GitHub stars) from the HTML
        json_data = self.extract_json_data(html)
        # Find all article elements that contain paper cards
        cards: List[Dict[str, Any]] = []
        # Look for article elements with the specific class structure used by Hugging Face
        for article in soup.select("article.relative.flex.flex-col.overflow-hidden.rounded-xl.border"):
            try:
                card_data = {}
                # Extract title and link
                title_link = article.select_one("h3 a")
                if title_link:
                    card_data["title"] = title_link.get_text(strip=True)
                    href = title_link.get("href")
                    if href:
                        if href.startswith("http"):
                            card_data["huggingface_url"] = href
                        else:
                            card_data["huggingface_url"] = f"https://huggingface.co{href}"
                # Extract upvote count
                upvote_div = article.select_one("div.shadow-alternate div.leading-none")
                if upvote_div:
                    upvote_text = upvote_div.get_text(strip=True)
                    try:
                        card_data["upvotes"] = int(upvote_text)
                    except ValueError:
                        card_data["upvotes"] = 0
                # Extract author count from text such as "· 10 authors"
                author_count_div = article.select_one("div.flex.truncate.text-sm")
                if author_count_div:
                    author_text = author_count_div.get_text(strip=True)
                    author_match = re.search(r'(\d+)\s*authors?', author_text)
                    if author_match:
                        card_data["author_count"] = int(author_match.group(1))
                    else:
                        card_data["author_count"] = 0
                # GitHub stars default to 0; filled in below from the page-level data
                card_data["github_stars"] = 0
                # Extract comment count - look for links to the community tab
                comment_links = article.select("a[href*='#community']")
                for comment_link in comment_links:
                    comment_text = comment_link.get_text(strip=True)
                    try:
                        card_data["comments"] = int(comment_text)
                        break
                    except ValueError:
                        continue
                # Extract submitter information
                submitted_div = article.select_one("div.shadow-xs")
                if submitted_div:
                    submitter_text = submitted_div.get_text(strip=True)
                    # Extract the submitter name from "Submitted byLiang0223" (no space)
                    submitter_match = re.search(r'Submitted by(\S+)', submitter_text)
                    if submitter_match:
                        card_data["submitter"] = submitter_match.group(1)
                # Extract the arXiv ID from the URL
                if card_data.get("huggingface_url"):
                    arxiv_id = self.extract_arxiv_id(card_data["huggingface_url"])
                    if arxiv_id:
                        card_data["arxiv_id"] = arxiv_id
                # Look up GitHub stars by matching the paper title
                paper_title = card_data.get("title", "")
                if paper_title in json_data.get("github_stars_map", {}):
                    card_data["github_stars"] = json_data["github_stars_map"][paper_title]
                # Only add cards that have at least a title
                if card_data.get("title"):
                    cards.append(card_data)
            except Exception as e:
                logger.error(f"Error parsing card: {e}")
                continue
        # If the selector above didn't match anything, fall back to the older h3-based method
        if not cards:
            logger.info("Falling back to old parsing method")
            for h3 in soup.select("h3"):
                # Title and Hugging Face paper link (if present)
                a = h3.find("a")
                title = h3.get_text(strip=True)
                hf_link = None
                if a and a.get("href"):
                    href = a.get("href")
                    # Build an absolute URL to huggingface.co if needed
                    if href.startswith("http"):
                        hf_link = href
                    else:
                        hf_link = f"https://huggingface.co{href}"
                # Try to capture sibling info (authors, votes, etc.) as a small snippet
                meta_text = None
                parent = h3.parent
                if parent:
                    # Join the immediate text content following the h3
                    collected: List[str] = []
                    for sib in parent.find_all(string=True, recursive=False):
                        t = (sib or "").strip()
                        if t:
                            collected.append(t)
                    if collected:
                        meta_text = " ".join(collected)
                # Try to discover any arXiv link inside nearby anchors
                arxiv_id: Optional[str] = None
                container = parent if parent else h3
                for link in container.find_all("a", href=True):
                    possible = self.extract_arxiv_id(link["href"])
                    if possible:
                        arxiv_id = possible
                        break
                cards.append(
                    {
                        "title": title,
                        "huggingface_url": hf_link,
                        "meta": meta_text,
                        "arxiv_id": arxiv_id,
                    }
                )
        # Deduplicate by title
        seen = set()
        unique_cards: List[Dict[str, Any]] = []
        for c in cards:
            key = c.get("title") or ""
            if key and key not in seen:
                seen.add(key)
                unique_cards.append(c)
        logger.info(f"Parsed {len(unique_cards)} cards")
        return unique_cards

    async def get_daily_papers(self, target_date: str) -> tuple[str, List[Dict[str, Any]]]:
        """Get the daily papers for a specific date"""
        date_str, html = await self.fetch_daily_html(target_date)
        cards = self.parse_daily_cards(html)
        return date_str, cards
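
# Minimal usage sketch (an assumption, not part of the original module): shows how
# the crawler might be driven from a script, assuming this file is importable and
# src.logger is configured. The date string below is only an example; the crawler
# falls back to the latest available date if that page redirects or is missing.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = HuggingFaceDailyPapers()
        date_str, cards = await crawler.get_daily_papers("2025-08-08")
        print(f"{len(cards)} papers for {date_str}")
        for card in cards[:3]:
            print(card.get("title"), card.get("arxiv_id"), card.get("upvotes"))

    asyncio.run(_demo())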