"""ARPAbet-to-IPA conversion and a Levenshtein-based similarity score."""

from typing import Iterable, List

try:
    import Levenshtein
except ImportError:  # optional C extension; a pure-Python fallback is used below
    Levenshtein = None


# ARPAbet phoneme (without stress digit) -> IPA symbol.
ARPABET_TO_IPA = {
    "AA": "ɑ",   # father
    "AE": "æ",   # cat
    "AH": "ʌ",   # strut
    "AO": "ɔ",   # thought
    "AW": "aʊ",  # now
    "AY": "aɪ",  # my
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "EH": "ɛ",   # bed
    "ER": "ɝ",   # bird (rhotic); could also map to ɜː in non-rhotic
    "EY": "eɪ",  # face
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "IH": "ɪ",   # sit
    "IY": "iː",  # seat
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "OW": "oʊ",  # goat
    "OY": "ɔɪ",  # boy
    "P": "p",
    "R": "ɹ",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "UH": "ʊ",   # foot
    "UW": "uː",  # goose
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}


def arpabet_to_ipa_seq(arpabet_seq: Iterable[str]) -> List[str]:
    """Convert a sequence of ARPAbet symbols to IPA symbols.

    Trailing stress digits (``0``, ``1``, ``2``) are stripped and the lookup is
    case-insensitive, so ``"UW1"`` and ``"uw"`` both map to ``"uː"``.  A symbol
    that is not in ``ARPABET_TO_IPA`` is returned unchanged (stress digit and
    original casing included).
    """
    return [
        ARPABET_TO_IPA.get(sym.rstrip("012").upper(), sym)
        for sym in arpabet_seq
    ]


def _edit_distance(left: str, right: str) -> int:
    """Return the unit-cost Levenshtein distance between two strings."""
    if Levenshtein is not None:
        return Levenshtein.distance(left, right)

    # Pure-Python fallback: classic two-row dynamic programme with
    # insert/delete/substitute all costing 1.
    if len(left) < len(right):
        left, right = right, left  # keep the rows as short as possible
    previous = list(range(len(right) + 1))
    for i, left_ch in enumerate(left, start=1):
        current = [i]
        for j, right_ch in enumerate(right, start=1):
            substitution = previous[j - 1] + (left_ch != right_ch)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def levenshtein_similarity_score(seq1: Iterable[str], seq2: Iterable[str]) -> int:
    """Return a 0-100 similarity score between two symbol sequences.

    Each sequence is concatenated into a single string and the Levenshtein
    distance between the two strings is normalised by the longer string's
    length.  The comparison is therefore per character: a multi-character
    symbol such as ``"aʊ"`` contributes one unit per character.

    Returns:
        ``100`` for identical (or both empty) inputs, ``0`` when every
        character differs; fractional percentages are rounded down.
    """
    str1 = "".join(seq1)
    str2 = "".join(seq2)

    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Two empty inputs are identical; also avoids dividing by zero.
        return 100

    distance = _edit_distance(str1, str2)
    # Integer arithmetic keeps the result exact; the float form
    # int((1 - 9 / 10) * 100) evaluates to 9 instead of 10.
    return (max_len - distance) * 100 // max_len


if __name__ == "__main__":
    # print(arpabet_to_ipa_seq(['ah', 'l', 'ow', 'ay', 'd', 'ow', 'n', 't', 'r', 'ih']))
    corrected_ipa = ["ð", "ɛ", "ɹ"]
    user_ipa = ["ʌ", "ð", "ɛ", "ɹ"]
    # One extra leading symbol out of four characters -> prints 75.
    print(levenshtein_similarity_score(corrected_ipa, user_ipa))