"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.

Extracted from coptic-parser.py for integration with web interfaces.

Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

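# Silence all Python warnings so web-facing output stays clean.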
warnings.filterwarnings('ignore')


class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
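        # Note: self.diaparser is initialized but never used in this module;
        # it appears to be reserved for an alternative parsing backend.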
        self.diaparser = None
        self.prolog = None
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
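        # The engine returned by create_prolog_engine() only needs to expose a
        # boolean `prolog_initialized` flag and a validate_parse_tree(tokens,
        # pos_tags, heads, deprels) method (see _validate_with_prolog below);
        # everything else about its implementation is opaque to this module.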
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print(" Parser will continue with neural-only mode")
            self.prolog = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return

        print("Loading Coptic NLP models...")

        try:
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")

        except Exception as e:
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
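                    # stanza.download() caches models locally (by default under
                    # ~/stanza_resources), so this branch only runs on first use.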
                    stanza.download('cop', verbose=False)

                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results (if enabled and available)
            Returns None if the text is empty or no sentences were found.
        """
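        # Illustrative shape of the returned dict (values are examples only):
        # {
        #     'sentences': [
        #         {'id': 1, 'text': '...', 'words': [
        #             {'id': 1, 'form': '...', 'lemma': '...', 'upos': 'NOUN',
        #              'xpos': '_', 'feats': '_', 'head': 0, 'deprel': 'root',
        #              'head_text': 'ROOT'},
        #         ]},
        #     ],
        #     'total_sentences': 1,
        #     'total_tokens': 1,
        #     'text': '...',
        #     'prolog_validation': {...},  # only present when validation ran
        # }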
        if not text or not text.strip():
            return None

        self.load_parser()

        doc = self.nlp(text)

        if not doc.sentences:
            return None

        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []

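            # Stanza word ids are 1-based and head == 0 marks the sentence root,
            # hence the 'ROOT' / words[head-1] lookup for head_text below.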
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

        if (include_prolog_validation and self.prolog
                and hasattr(self.prolog, 'prolog_initialized')
                and self.prolog.prolog_initialized):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None

        return result

    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results including patterns detected and warnings
        """
        if not self.prolog:
            return None

        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }

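        # Each sentence is flattened into parallel token / POS / head / deprel
        # lists for the Prolog rules; a failure on one sentence is reported but
        # does not abort validation of the rest.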
        for sentence in sentences:
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]

            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)

                if sent_validation:
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])

                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True

            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")

        return validation_results

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
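                # Ten tab-separated CoNLL-U columns:
                # ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
                # (DEPS and MISC are not populated here, hence the trailing '_').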
line = "\t".join([ |
|
|
str(word['id']), |
|
|
word['form'], |
|
|
word['lemma'], |
|
|
word['upos'], |
|
|
word['xpos'], |
|
|
word['feats'], |
|
|
str(word['head']), |
|
|
word['deprel'], |
|
|
'_', |
|
|
'_' |
|
|
]) |
|
|
lines.append(line) |
|
|
|
|
|
lines.append("") |
|
|
|
|
|
return "\n".join(lines) |
|
|
|
|
|
    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""

        output = []

        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)
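

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes the Stanza Coptic models are installed or downloadable, and that
# SAMPLE_TEXT is replaced with real Coptic input.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = CopticParserCore()

    SAMPLE_TEXT = "..."  # placeholder: substitute actual Coptic text here

    result = parser.parse_text(SAMPLE_TEXT)
    if result:
        print(parser.format_table(result))
        print()
        print(parser.format_conllu(result))
        if result.get('prolog_validation'):
            print(result['prolog_validation'])
    else:
        print("No parse produced (empty input or no sentences found).")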