#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.

Extracted from coptic-parser.py for integration with web interfaces.
Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

# Silence Stanza's noisy load-time warnings; note this filter applies process-wide
warnings.filterwarnings('ignore')

class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None  # unused in this module; carried over from coptic-parser.py
        self.prolog = None  # Prolog engine for grammatical validation
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ  Prolog validation not available: {e}")
            print("   Parser will continue with neural-only mode")
            self.prolog = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded

        print("Loading Coptic NLP models...")

        try:
            # Try to load Stanza with all processors
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")

        except Exception as e:
            # If models not found, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)

                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results (if enabled and available)
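
        Example (illustrative shape of the returned dict):
            {'sentences': [{'id': 1, 'text': '...', 'words': [...]}],
             'total_sentences': 1, 'total_tokens': 5, 'text': '...'}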
        """
        if not text or not text.strip():
            return None

        # Ensure parser is loaded
        self.load_parser()

        # Parse with Stanza (neural)
        doc = self.nlp(text)

        if not doc.sentences:
            return None

        # Extract structured data
        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []

            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head-1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

        # Add Prolog validation (symbolic) if available and requested
        prolog_ready = (
            include_prolog_validation
            and self.prolog is not None
            and getattr(self.prolog, 'prolog_initialized', False)
        )
        if prolog_ready:
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ  Prolog validation skipped: {e}")
                result['prolog_validation'] = None

        return result

    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results including patterns detected and warnings
        """
        if not self.prolog:
            return None

        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }

        for sentence in sentences:
            # Extract tokens, POS tags, heads, and dependency relations
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]

            # Validate with Prolog
            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)

                if sent_validation:
                    # Merge results
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])

                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True

            except Exception as e:
                print(f"ℹ  Prolog validation error for sentence: {e}")

        return validation_results

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)

            lines.append("")  # Blank line between sentences

        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""

        output = []

        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)