import yt_dlp
import requests
import json

video_id = "2dGB9Fo4hnU"
url = f"https://www.youtube.com/watch?v={video_id}"

print("\n--- Method 2: yt-dlp ---")
try:
    ydl_opts = {
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'quiet': True
    }
    # Extract metadata only; no media download
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    subtitles = info.get('subtitles', {})
    auto_captions = info.get('automatic_captions', {})

    # Pick a subtitle URL: manual English subtitles first, then auto captions
    sub_url = None
    if 'en' in subtitles:
        print("Found manual English subtitles")
        # Prefer json3 (YouTube's JSON caption format), fall back to the first format listed
        for fmt in subtitles['en']:
            if fmt['ext'] == 'json3':
                sub_url = fmt['url']
                break
        if not sub_url:
            sub_url = subtitles['en'][0]['url']
    elif 'en' in auto_captions:
        print("Found auto English captions")
        for fmt in auto_captions['en']:
            if fmt['ext'] == 'json3':
                sub_url = fmt['url']
                break
        if not sub_url:
            sub_url = auto_captions['en'][0]['url']

    if sub_url:
        print(f"Fetching: {sub_url}")
        r = requests.get(sub_url)
        data = r.json()
        # print(json.dumps(data, indent=2)[:500])

        # Parse json3: a list of 'events', each holding 'segs' with 'utf8' text fragments
        events = data.get('events', [])
        text = ""
        for event in events:
            if 'segs' in event:
                for seg in event['segs']:
                    if 'utf8' in seg:
                        text += seg['utf8']
                text += " "
        print(f"Extracted text length: {len(text)}")
        print(f"Preview: {text[:100]}")
    else:
        print("No English subtitles found")
except Exception as e:
    print(f"Method 2 failed: {e}")