deep-research-ai / check_yt.py
jeevzz's picture
Upload 9 files
5263a14 verified
import yt_dlp
import requests
import json
video_id = "2dGB9Fo4hnU"
url = f"https://www.youtube.com/watch?v={video_id}"

# Upper bound (seconds) on the caption download so the script cannot hang
# forever on an unresponsive server.
REQUEST_TIMEOUT = 15


def pick_caption_track(formats, preferred_ext="json3"):
    """Choose which caption track to download from a list of format entries.

    Args:
        formats: List of dicts, each expected to carry an ``'ext'`` and a
            ``'url'`` key (the shape this script reads from
            ``info['subtitles'][lang]`` / ``info['automatic_captions'][lang]``).
            ``None`` or an empty list is accepted.
        preferred_ext: Extension to prefer when several tracks are offered.

    Returns:
        The first entry whose ``'ext'`` equals ``preferred_ext``; otherwise the
        first entry that has a URL; ``None`` when no entry has a URL.
    """
    usable = [fmt for fmt in formats or [] if fmt.get("url")]
    for fmt in usable:
        if fmt.get("ext") == preferred_ext:
            return fmt
    return usable[0] if usable else None


def json3_to_text(data):
    """Flatten a json3 caption payload into a single string.

    Args:
        data: Decoded json3 document: a dict with an ``'events'`` list whose
            items may contain a ``'segs'`` list of dicts with a ``'utf8'`` key.

    Returns:
        The ``'utf8'`` pieces concatenated in order, with one space appended
        after every event that has a ``'segs'`` key. Events without ``'segs'``
        and segments without ``'utf8'`` are skipped.
    """
    pieces = []
    for event in data.get("events") or []:
        if "segs" not in event:
            continue
        pieces.extend(seg["utf8"] for seg in event["segs"] or [] if "utf8" in seg)
        pieces.append(" ")
    # join() instead of repeated += keeps this linear in the transcript size.
    return "".join(pieces)


print("\n--- Method 2: yt-dlp ---")
try:
    ydl_opts = {
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # `or {}` also covers the key being present but set to None.
    subtitles = info.get('subtitles') or {}
    auto_captions = info.get('automatic_captions') or {}

    # Manual subtitles win over auto-generated captions; fall through to the
    # auto captions when the manual list has no usable entry.
    track = None
    for found_msg, tracks_by_lang in (
        ("Found manual English subtitles", subtitles),
        ("Found auto English captions", auto_captions),
    ):
        track = pick_caption_track(tracks_by_lang.get('en'))
        if track is not None:
            print(found_msg)
            break

    if track is not None:
        sub_url = track['url']
        print(f"Fetching: {sub_url}")
        r = requests.get(sub_url, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors directly instead of as a JSON decode failure.
        r.raise_for_status()
        if track.get('ext') == 'json3':
            text = json3_to_text(r.json())
        else:
            # Fallback tracks are not JSON, so r.json() would always raise;
            # show the raw payload instead.
            print(f"Track format is {track.get('ext')!r}, not json3; using raw payload")
            text = r.text
        print(f"Extracted text length: {len(text)}")
        print(f"Preview: {text[:100]}")
    else:
        print("No English subtitles found")
except Exception as e:
    # Top-level boundary of a diagnostic script: report the failure and exit
    # normally rather than dumping a traceback.
    print(f"Method 2 failed: {e}")