Spaces:

SushantGautam
/

BridgeMentor

Sleeping

BridgeMentor / scrap_openalex.py

Add social authentication and enhance database configuration

bd98c1d 8 months ago

1.38 kB

	import os
	import requests
	import json
	from time import sleep

	# Download every OpenAlex author record matching FILTER, one JSON file per
	# result page, using cursor pagination. Re-running the script resumes from
	# the pages already present in OUTPUT_DIR instead of re-downloading them.

	# Base API URL
	BASE_URL = "https://api.openalex.org/authors"
	# Filter expression sent as the `filter` query parameter.
	FILTER = "last_known_institutions.country_code:NO,x_concepts.id:C41008148"
	PER_PAGE = 200
	# Seconds to wait on the connection/response before abandoning a request,
	# so a stalled connection cannot hang the script forever.
	REQUEST_TIMEOUT = 30

	OUTPUT_DIR = "C41008148_authors"
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# Initialize cursor
	cursor = "*"
	page_count = 1  # Track page numbers for saving files
	finished = False  # Becomes True only when a page carries no next cursor

	while cursor:
	    filename = os.path.join(OUTPUT_DIR, f"{page_count:010}.json")
	    try:
	        # Resume support: a previously saved page already contains the
	        # cursor for the following page, so no request is needed for it.
	        data = None
	        if os.path.exists(filename):
	            try:
	                with open(filename, encoding="utf-8") as f:
	                    data = json.load(f)
	            except ValueError:
	                data = None
	            if not isinstance(data, dict):
	                # Truncated or foreign file: fall through and fetch again.
	                print(f"File {filename} is unreadable, re-downloading...")
	                data = None

	        fetched = data is None
	        if fetched:
	            print(f"Fetching page {page_count} with cursor...")
	            # `params` URL-encodes the values; cursors are opaque tokens
	            # that may contain characters unsafe in a raw query string.
	            response = requests.get(
	                BASE_URL,
	                params={
	                    "filter": FILTER,
	                    "per-page": PER_PAGE,
	                    "cursor": cursor,
	                },
	                timeout=REQUEST_TIMEOUT,
	            )
	            response.raise_for_status()
	            data = response.json()
	            if not isinstance(data, dict):
	                raise ValueError("unexpected response shape (not a JSON object)")

	            # Write to a temporary name and rename, so an interrupted run
	            # never leaves a half-written file that looks like a done page.
	            tmp_name = filename + ".tmp"
	            with open(tmp_name, 'w', encoding='utf-8') as f:
	                json.dump(data, f, ensure_ascii=False, indent=2)
	            os.replace(tmp_name, filename)
	        else:
	            print(f"File {filename} already exists, skipping...")
	    except (requests.RequestException, OSError, ValueError) as e:
	        print(f"Error on page {page_count}: {e}")
	        break

	    cursor = (data.get("meta") or {}).get("next_cursor")
	    if not cursor:
	        print("No more results.")
	        finished = True
	        break

	    page_count += 1
	    if fetched:
	        sleep(1)  # Rate-limiting; only needed after a real request

	if finished:
	    print("Download complete using cursor pagination.")
	else:
	    # Do not claim success when the loop was aborted by an error.
	    print(f"Download stopped early at page {page_count}; rerun to resume.")