import re


class LiveFeatureExtractor:
    """Extracts disfluency statistics and a BERT-ready clean string from CHAT (*.cha) participant lines."""

    def __init__(self):
        # Compiled patterns for the CHAT annotation codes counted as features.
        self.patterns = {
            'fillers': re.compile(r'&-([a-z]+)', re.IGNORECASE),   # filled pauses, e.g. &-uh, &-um
            'repetition': re.compile(r'\[/+\]'),                   # repetition markers, e.g. [/], [//]
            'retracing': re.compile(r'\[//\]'),                    # retracing (self-correction)
            'incomplete': re.compile(r'\+[\./]+'),                 # incomplete / trailing-off utterances
            'errors': re.compile(r'\[\*.*?\]'),                    # error codes, e.g. [* ...]
            'pauses': re.compile(r'\(\.+\)')                       # unfilled pauses, e.g. (.), (..)
        }

    def clean_for_bert(self, raw_text):
        """Strip CHAT markup from a *PAR: line, keeping an explicit [PAUSE] token for unfilled pauses."""
        text = re.sub(r'^\*PAR:\s+', '', raw_text)         # drop the speaker tier label
        text = re.sub(r'\x15\d+_\d+\x15', '', text)        # drop media timing bullets
        text = re.sub(r'<|>', '', text)                    # drop scoping angle brackets
        text = re.sub(r'\[.*?\]', '', text)                # drop bracketed codes ([/], [//], [* ...], ...)
        text = re.sub(r'\(\.+\)', '[PAUSE]', text)         # keep pauses as a token
        text = text.replace('_', ' ')                      # split underscore-joined compounds
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _content_words(self, raw_text):
        """Tokenize a raw line for word counts: drop bracketed codes, fillers, and punctuation."""
        text = re.sub(r'\[.*?\]', '', raw_text)
        text = re.sub(r'&-([a-z]+)', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text.lower().split()

    def get_features(self, raw_text):
        """Return raw counts of each annotation pattern plus the word count."""
        stats = {k: len(p.findall(raw_text)) for k, p in self.patterns.items()}
        stats['word_count'] = len(self._content_words(raw_text))
        return stats

    def get_vector(self, raw_text, global_ttr_override=None):
        """Return [TTR, fillers, repetitions, retracings, errors, pauses], with counts normalized per word."""
        stats = self.get_features(raw_text)
        n = stats['word_count'] if stats['word_count'] > 0 else 1
        if global_ttr_override is not None:
            # Use a session-level type-token ratio computed over all participant lines.
            ttr = global_ttr_override
        else:
            # Fall back to a per-line type-token ratio.
            words = self._content_words(raw_text)
            ttr = len(set(words)) / n
        return [
            ttr,
            stats['fillers'] / n,
            stats['repetition'] / n,
            stats['retracing'] / n,
            stats['errors'] / n,
            stats['pauses'] / n
        ]


class ChaParser:
    """Parses CHAT transcript lines into clean sentences, feature vectors, and the original raw lines."""

    def __init__(self):
        self.extractor = LiveFeatureExtractor()

    def parse(self, file_content_lines):
        sentences = []
        features = []
        raw_lines = []
        all_words_in_session = []

        decoded_lines = [line.decode('utf-8') if isinstance(line, bytes) else line
                         for line in file_content_lines]

        # First pass: collect every participant word to compute a session-level TTR.
        for line in decoded_lines:
            if line.startswith('*PAR:'):
                clean_line = re.sub(r'[^\w\s]', '', line.replace('*PAR:', ''))
                all_words_in_session.extend(clean_line.lower().split())

        unique_words = len(set(all_words_in_session))
        total_words = len(all_words_in_session)
        global_ttr = unique_words / total_words if total_words > 0 else 0.0

        # Second pass: build the clean text and feature vector for each participant line.
        for line in decoded_lines:
            if line.startswith('*PAR:'):
                sentences.append(self.extractor.clean_for_bert(line))
                features.append(self.extractor.get_vector(line, global_ttr_override=global_ttr))
                raw_lines.append(line.strip())

        return sentences, features, raw_lines
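

if __name__ == '__main__':
    # Minimal usage sketch, assuming the parser is fed the lines of a CHAT (*.cha)
    # transcript. The sample lines below are illustrative placeholders, not real data;
    # in practice you would pass the lines read from an actual .cha file.
    sample_lines = [
        "*PAR:\t&-uh the boy <is climbing> [//] climbed the tree (.) .",
        "*INV:\tand then what happened ?",
        "*PAR:\tthe dog [/] the dog barked .",
    ]
    parser = ChaParser()
    sentences, features, raw_lines = parser.parse(sample_lines)
    for sentence, vector in zip(sentences, features):
        print(sentence, [round(v, 3) for v in vector])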