#!/usr/bin/env python3
"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.
Extracted from coptic-parser.py for integration with web interfaces.

Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import warnings

import stanza

warnings.filterwarnings('ignore')


class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
        self.diaparser = None  # not used in this module
        self.prolog = None  # Prolog engine for grammatical validation
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print(" Parser will continue with neural-only mode")
            self.prolog = None
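
    # NOTE: the Prolog engine is duck-typed. coptic_prolog_rules is a
    # project-local module, so the shape below is inferred from the call
    # sites in this file, not from that module's documentation:
    #   engine.prolog_initialized -> bool
    #   engine.validate_parse_tree(tokens, pos_tags, heads, deprels) -> dict
    #       with optional 'patterns' and 'warnings' lists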

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return  # Already loaded
        print("Loading Coptic NLP models...")
        try:
            # Try to load Stanza with all processors
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")
        except Exception as e:
            # If models not found, download them
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
                    # Download Coptic models
                    stanza.download('cop', verbose=False)
                    # Try loading again
                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise
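
    # Note on download_method=None above: it asks stanza.Pipeline not to
    # fetch models on its own, which is why the missing-resources error is
    # caught and handled with an explicit stanza.download('cop') plus retry.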

    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical
                validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results
                  (if enabled and available)
        """
        if not text or not text.strip():
            return None

        # Ensure parser is loaded
        self.load_parser()

        # Parse with Stanza (neural)
        doc = self.nlp(text)
        if not doc.sentences:
            return None

        # Extract structured data
        sentences = []
        total_tokens = 0
        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1
            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

        # Add Prolog validation (symbolic) if available and requested
        if (include_prolog_validation and self.prolog
                and getattr(self.prolog, 'prolog_initialized', False)):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None
        return result
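
    # Illustrative shape of the dict returned by parse_text() (values are
    # examples, not real output):
    # {
    #     'sentences': [{'id': 1, 'text': '...', 'words': [...]}],
    #     'total_sentences': 1,
    #     'total_tokens': 4,
    #     'text': '...',
    #     'prolog_validation': {'patterns_detected': [], 'warnings': [],
    #                           'has_errors': False}
    # }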

    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results, including detected patterns and warnings
        """
        if not self.prolog:
            return None
        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }
        for sentence in sentences:
            # Extract tokens, POS tags, heads, and dependency relations
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]
            # Validate with Prolog
            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)
                if sent_validation:
                    # Merge results
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])
                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True
            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")
        return validation_results

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""
        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")
            for word in sentence['words']:
                line = "\t".join([
                    str(word['id']),
                    word['form'],
                    word['lemma'],
                    word['upos'],
                    word['xpos'],
                    word['feats'],
                    str(word['head']),
                    word['deprel'],
                    '_',  # deps
                    '_'   # misc
                ])
                lines.append(line)
            lines.append("")  # Blank line between sentences
        return "\n".join(lines)

    def format_table(self, parse_result):
        """Format parse result as a Markdown table"""
        if not parse_result:
            return ""
        output = []
        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")
            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )
        return "\n".join(output)