"""
Coptic Dependency Parser - Core Module (Web-Compatible)

Neural-Symbolic Hybrid Parser combining Stanza (neural) with Prolog (symbolic)
for enhanced grammatical validation and error detection.

Extracted from coptic-parser.py for integration with web interfaces.

Author: André Linden (2025)
License: CC BY-NC-SA 4.0
"""

import stanza
import warnings

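# Silence all Python warnings so web-facing output stays clean.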
warnings.filterwarnings('ignore')


class CopticParserCore:
    """Lightweight neural-symbolic Coptic parser for web applications"""

    def __init__(self):
        self.nlp = None
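        # Note: self.diaparser is initialized but never used in this module;
        # it appears to be reserved for an alternative parsing backend.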
        self.diaparser = None
        self.prolog = None
        self._init_prolog()

    def _init_prolog(self):
        """Initialize Prolog engine for grammatical validation (optional)"""
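        # The engine returned by create_prolog_engine() only needs to expose a
        # boolean `prolog_initialized` flag and a validate_parse_tree(tokens,
        # pos_tags, heads, deprels) method (see _validate_with_prolog below);
        # everything else about its implementation is opaque to this module.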
        try:
            from coptic_prolog_rules import create_prolog_engine
            self.prolog = create_prolog_engine()
            if self.prolog and self.prolog.prolog_initialized:
                print("✓ Prolog engine initialized successfully")
        except Exception as e:
            print(f"ℹ Prolog validation not available: {e}")
            print(" Parser will continue with neural-only mode")
            self.prolog = None

    def load_parser(self):
        """Initialize Stanza parser with Coptic models"""
        if self.nlp is not None:
            return

        print("Loading Coptic NLP models...")

        try:
            self.nlp = stanza.Pipeline(
                lang='cop',
                processors='tokenize,pos,lemma,depparse',
                download_method=None,
                verbose=False
            )
            print("✓ Coptic neural parser loaded successfully")

        except Exception as e:
            if "Resources file not found" in str(e) or "not found" in str(e).lower():
                print("📥 Coptic models not found. Downloading (this may take 2-3 minutes)...")
                try:
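                    # stanza.download() caches models locally (by default under
                    # ~/stanza_resources), so this branch only runs on first use.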
                    stanza.download('cop', verbose=False)

                    self.nlp = stanza.Pipeline(
                        lang='cop',
                        processors='tokenize,pos,lemma,depparse',
                        download_method=None,
                        verbose=False
                    )
                    print("✓ Coptic models downloaded and loaded successfully")
                except Exception as download_error:
                    print(f"❌ Failed to download Coptic models: {download_error}")
                    raise
            else:
                print(f"❌ Failed to load parser: {e}")
                raise

    def parse_text(self, text, include_prolog_validation=True):
        """
        Parse Coptic text and return structured results with Prolog validation

        Args:
            text: Coptic text to parse
            include_prolog_validation: Whether to run Prolog grammatical validation (default: True)

        Returns:
            dict with:
                - sentences: list of parsed sentence data
                - total_sentences: int
                - total_tokens: int
                - text: original text
                - prolog_validation: dict with validation results (if enabled and available)
            Returns None if the text is empty or no sentences were found.
        """
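        # Illustrative shape of the returned dict (values are examples only):
        # {
        #     'sentences': [
        #         {'id': 1, 'text': '...', 'words': [
        #             {'id': 1, 'form': '...', 'lemma': '...', 'upos': 'NOUN',
        #              'xpos': '_', 'feats': '_', 'head': 0, 'deprel': 'root',
        #              'head_text': 'ROOT'},
        #         ]},
        #     ],
        #     'total_sentences': 1,
        #     'total_tokens': 1,
        #     'text': '...',
        #     'prolog_validation': {...},  # only present when validation ran
        # }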
        if not text or not text.strip():
            return None

        self.load_parser()

        doc = self.nlp(text)

        if not doc.sentences:
            return None

        sentences = []
        total_tokens = 0

        for sent_idx, sentence in enumerate(doc.sentences, 1):
            words_data = []

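            # Stanza word ids are 1-based and head == 0 marks the sentence root,
            # hence the 'ROOT' / words[head-1] lookup for head_text below.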
            for word in sentence.words:
                word_data = {
                    'id': word.id,
                    'form': word.text,
                    'lemma': word.lemma or '_',
                    'upos': word.upos,
                    'xpos': word.xpos or '_',
                    'feats': word.feats or '_',
                    'head': word.head,
                    'deprel': word.deprel,
                    'head_text': 'ROOT' if word.head == 0 else sentence.words[word.head - 1].text
                }
                words_data.append(word_data)
                total_tokens += 1

            sentences.append({
                'id': sent_idx,
                'text': sentence.text,
                'words': words_data
            })

        result = {
            'sentences': sentences,
            'total_sentences': len(sentences),
            'total_tokens': total_tokens,
            'text': text
        }

        if (include_prolog_validation and self.prolog
                and hasattr(self.prolog, 'prolog_initialized')
                and self.prolog.prolog_initialized):
            try:
                validation = self._validate_with_prolog(sentences)
                result['prolog_validation'] = validation
            except Exception as e:
                print(f"ℹ Prolog validation skipped: {e}")
                result['prolog_validation'] = None

        return result

    def _validate_with_prolog(self, sentences):
        """
        Validate parsed sentences using Prolog grammatical rules

        Args:
            sentences: List of parsed sentence data

        Returns:
            dict with validation results including patterns detected and warnings
        """
        if not self.prolog:
            return None

        validation_results = {
            'patterns_detected': [],
            'warnings': [],
            'has_errors': False
        }

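        # Each sentence is flattened into parallel token / POS / head / deprel
        # lists for the Prolog rules; a failure on one sentence is reported but
        # does not abort validation of the rest.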
        for sentence in sentences:
            tokens = [word['form'] for word in sentence['words']]
            pos_tags = [word['upos'] for word in sentence['words']]
            heads = [word['head'] for word in sentence['words']]
            deprels = [word['deprel'] for word in sentence['words']]

            try:
                sent_validation = self.prolog.validate_parse_tree(tokens, pos_tags, heads, deprels)

                if sent_validation:
                    if sent_validation.get('patterns'):
                        validation_results['patterns_detected'].extend(sent_validation['patterns'])

                    if sent_validation.get('warnings'):
                        validation_results['warnings'].extend(sent_validation['warnings'])
                        validation_results['has_errors'] = True

            except Exception as e:
                print(f"ℹ Prolog validation error for sentence: {e}")

        return validation_results

    def format_conllu(self, parse_result):
        """Format parse result as CoNLL-U"""
        if not parse_result:
            return ""

        lines = []
        for sentence in parse_result['sentences']:
            lines.append(f"# sent_id = {sentence['id']}")
            lines.append(f"# text = {sentence['text']}")

            for word in sentence['words']:
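                # Ten tab-separated CoNLL-U columns:
                # ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
                # (DEPS and MISC are not populated here, hence the trailing '_').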
line = "\t".join([ |
|
|
str(word['id']), |
|
|
word['form'], |
|
|
word['lemma'], |
|
|
word['upos'], |
|
|
word['xpos'], |
|
|
word['feats'], |
|
|
str(word['head']), |
|
|
word['deprel'], |
|
|
'_', |
|
|
'_' |
|
|
]) |
|
|
lines.append(line) |
|
|
|
|
|
lines.append("") |
|
|
|
|
|
return "\n".join(lines) |
|
|
|
|
|
    def format_table(self, parse_result):
        """Format parse result as markdown table"""
        if not parse_result:
            return ""

        output = []

        for sentence in parse_result['sentences']:
            output.append(f"\n### Sentence {sentence['id']}: {sentence['text']}\n")
            output.append("| ID | Form | Lemma | UPOS | Head | DepRel |")
            output.append("|:---|:-----|:------|:-----|:-----|:-------|")

            for word in sentence['words']:
                output.append(
                    f"| {word['id']} | **{word['form']}** | {word['lemma']} | "
                    f"`{word['upos']}` | {word['head_text']} | `{word['deprel']}` |"
                )

        return "\n".join(output)
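

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes the Stanza Coptic models are installed or downloadable, and that
# SAMPLE_TEXT is replaced with real Coptic input.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = CopticParserCore()

    SAMPLE_TEXT = "..."  # placeholder: substitute actual Coptic text here

    result = parser.parse_text(SAMPLE_TEXT)
    if result:
        print(parser.format_table(result))
        print()
        print(parser.format_conllu(result))
        if result.get('prolog_validation'):
            print(result['prolog_validation'])
    else:
        print("No parse produced (empty input or no sentences found).")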