# File size: 2,003 Bytes
# 5263a14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

import yt_dlp
import requests
import json

video_id = "2dGB9Fo4hnU"
url = f"https://www.youtube.com/watch?v={video_id}"

# Seconds to wait for the caption download before giving up; without a
# timeout, requests will block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def pick_caption_url(tracks, lang='en', preferred_ext='json3'):
    """Return the URL of the best caption format for *lang*, or None.

    Args:
        tracks: Mapping of language code to a list of format dicts, as found
            under the 'subtitles' / 'automatic_captions' keys of the info
            dict used below. May be None or empty.
        lang: Language code to look up.
        preferred_ext: Format extension to prefer when several are offered.

    Returns:
        The URL of the first format whose 'ext' equals *preferred_ext*;
        otherwise the URL of the first format that has one; otherwise None
        (language missing, empty format list, or no format carries a URL).
    """
    candidates = [
        fmt for fmt in ((tracks or {}).get(lang) or []) if fmt.get('url')
    ]
    for fmt in candidates:
        if fmt.get('ext') == preferred_ext:
            return fmt['url']
    return candidates[0]['url'] if candidates else None


def json3_to_text(data):
    """Flatten a parsed json3 caption payload into a single string.

    Every event that has a 'segs' list contributes the concatenation of its
    segments' 'utf8' values followed by one space; events without 'segs' and
    segments without 'utf8' are skipped.

    Args:
        data: Dict decoded from the caption response; its 'events' key may be
            missing or None.

    Returns:
        The joined transcript text ("" when there are no usable events).
    """
    events = data.get('events') or []
    return "".join(
        "".join(seg['utf8'] for seg in event['segs'] if 'utf8' in seg) + " "
        for event in events
        if 'segs' in event
    )


print("\n--- Method 2: yt-dlp ---")
try:
    ydl_opts = {
        'skip_download': True,
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # Manually authored subtitles win over automatic captions; an empty
    # manual track falls through to the automatic one instead of failing.
    sub_url = pick_caption_url(info.get('subtitles'))
    if sub_url:
        print("Found manual English subtitles")
    else:
        sub_url = pick_caption_url(info.get('automatic_captions'))
        if sub_url:
            print("Found auto English captions")

    if sub_url:
        print(f"Fetching: {sub_url}")
        r = requests.get(sub_url, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors as such rather than as a JSON decode failure.
        r.raise_for_status()
        # NOTE: if no json3 format was offered, the fallback track may not be
        # JSON at all; r.json() then raises and is reported by the handler
        # below.
        text = json3_to_text(r.json())
        print(f"Extracted text length: {len(text)}")
        print(f"Preview: {text[:100]}")
    else:
        print("No English subtitles found")

except Exception as e:
    # Top-level boundary of a probe script: report the failure and move on.
    print(f"Method 2 failed: {e}")