Spaces:
Running
Running
import json

import requests
import yt_dlp
video_id = "2dGB9Fo4hnU"
url = f"https://www.youtube.com/watch?v={video_id}"

# Seconds to wait for the caption download; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 15


def pick_caption_track(tracks, preferred_ext="json3"):
    """Choose one caption track from a list of track dicts.

    Args:
        tracks: List of dicts carrying ``'ext'`` and ``'url'`` keys (the
            entries this script reads from ``info['subtitles']['en']`` and
            ``info['automatic_captions']['en']``). May be empty or ``None``.
        preferred_ext: Extension to pick when available.

    Returns:
        A ``(url, ext)`` tuple: the first track whose extension equals
        *preferred_ext*, otherwise the first track that has a URL,
        otherwise ``(None, None)``.
    """
    usable = [track for track in tracks or [] if track.get("url")]
    for track in usable:
        if track.get("ext") == preferred_ext:
            return track["url"], preferred_ext
    if usable:
        return usable[0]["url"], usable[0].get("ext")
    return None, None


def json3_to_text(data):
    """Flatten a json3 caption payload into plain text.

    Every event that has a ``'segs'`` key contributes the concatenation of
    its segments' ``'utf8'`` strings followed by a single space; events
    without ``'segs'`` are skipped.

    Args:
        data: Decoded JSON dict with an optional ``'events'`` list.

    Returns:
        The combined caption text (empty string when there are no events).
    """
    pieces = []
    for event in data.get("events") or []:
        if "segs" not in event:
            continue
        words = "".join(
            seg["utf8"] for seg in event["segs"] or [] if "utf8" in seg
        )
        pieces.append(words + " ")
    return "".join(pieces)


def main():
    """Look up English captions for ``url`` via yt-dlp and print a preview.

    Manual subtitles are preferred over automatic captions. Any failure is
    reported on stdout instead of being raised.
    """
    print("\n--- Method 2: yt-dlp ---")
    try:
        ydl_opts = {
            'skip_download': True,
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'quiet': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        # `or {}` also covers a key that is present but set to None.
        subtitles = info.get('subtitles') or {}
        auto_captions = info.get('automatic_captions') or {}

        sub_url = sub_ext = None
        # Truthiness (not `in`) so an empty manual list falls through to
        # the automatic captions instead of raising IndexError.
        if subtitles.get('en'):
            print("Found manual English subtitles")
            sub_url, sub_ext = pick_caption_track(subtitles['en'])
        elif auto_captions.get('en'):
            print("Found auto English captions")
            sub_url, sub_ext = pick_caption_track(auto_captions['en'])

        if not sub_url:
            print("No English subtitles found")
            return

        print(f"Fetching: {sub_url}")
        r = requests.get(sub_url, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors directly rather than as a JSON decode error.
        r.raise_for_status()

        if sub_ext == "json3":
            text = json3_to_text(r.json())
        else:
            # The fallback track is not json3, so it cannot be parsed as
            # such; report the raw payload instead.
            text = r.text

        print(f"Extracted text length: {len(text)}")
        print(f"Preview: {text[:100]}")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"Method 2 failed: {e}")


main()