| | |
| | |
| | |
| | |
| |
|
| | use lazy_static::lazy_static; |
| | use std::collections::HashMap; |
| |
|
| | lazy_static! { |
| | |
| | static ref G2P_DICT: HashMap<&'static str, Vec<&'static str>> = { |
| | let mut m = HashMap::new(); |
| | |
| | m.insert("hello", vec!["HH", "AH0", "L", "OW1"]); |
| | m.insert("world", vec!["W", "ER1", "L", "D"]); |
| | m.insert("the", vec!["DH", "AH0"]); |
| | m.insert("a", vec!["AH0"]); |
| | m.insert("is", vec!["IH1", "Z"]); |
| | m.insert("to", vec!["T", "UW1"]); |
| | m.insert("and", vec!["AH0", "N", "D"]); |
| | m.insert("in", vec!["IH0", "N"]); |
| | m.insert("that", vec!["DH", "AE1", "T"]); |
| | m.insert("have", vec!["HH", "AE1", "V"]); |
| | m.insert("for", vec!["F", "AO1", "R"]); |
| | m.insert("not", vec!["N", "AA1", "T"]); |
| | m.insert("with", vec!["W", "IH1", "DH"]); |
| | m.insert("you", vec!["Y", "UW1"]); |
| | m.insert("this", vec!["DH", "IH1", "S"]); |
| | m.insert("but", vec!["B", "AH1", "T"]); |
| | m.insert("from", vec!["F", "R", "AH1", "M"]); |
| | m.insert("they", vec!["DH", "EY1"]); |
| | m.insert("we", vec!["W", "IY1"]); |
| | m.insert("say", vec!["S", "EY1"]); |
| | m.insert("she", vec!["SH", "IY1"]); |
| | m.insert("or", vec!["AO1", "R"]); |
| | m.insert("an", vec!["AE1", "N"]); |
| | m.insert("will", vec!["W", "IH1", "L"]); |
| | m.insert("my", vec!["M", "AY1"]); |
| | m.insert("one", vec!["W", "AH1", "N"]); |
| | m.insert("all", vec!["AO1", "L"]); |
| | m.insert("would", vec!["W", "UH1", "D"]); |
| | m.insert("there", vec!["DH", "EH1", "R"]); |
| | m.insert("their", vec!["DH", "EH1", "R"]); |
| | m |
| | }; |
| |
|
| | |
| | static ref PINYIN_MAP: HashMap<&'static str, (&'static str, &'static str)> = { |
| | let mut m = HashMap::new(); |
| | |
| | m.insert("ba", ("b", "a")); |
| | m.insert("pa", ("p", "a")); |
| | m.insert("ma", ("m", "a")); |
| | m.insert("fa", ("f", "a")); |
| | m.insert("da", ("d", "a")); |
| | m.insert("ta", ("t", "a")); |
| | m.insert("na", ("n", "a")); |
| | m.insert("la", ("l", "a")); |
| | m.insert("ga", ("g", "a")); |
| | m.insert("ka", ("k", "a")); |
| | m.insert("ha", ("h", "a")); |
| | m.insert("zha", ("zh", "a")); |
| | m.insert("cha", ("ch", "a")); |
| | m.insert("sha", ("sh", "a")); |
| | m.insert("za", ("z", "a")); |
| | m.insert("ca", ("c", "a")); |
| | m.insert("sa", ("s", "a")); |
| | m.insert("ni", ("n", "i")); |
| | m.insert("hao", ("h", "ao")); |
| | m.insert("shi", ("sh", "i")); |
| | m.insert("jie", ("j", "ie")); |
| | m.insert("zhong", ("zh", "ong")); |
| | m.insert("guo", ("g", "uo")); |
| | m.insert("ren", ("r", "en")); |
| | m.insert("ming", ("m", "ing")); |
| | m.insert("de", ("d", "e")); |
| | m.insert("yi", ("", "i")); |
| | m.insert("er", ("", "er")); |
| | m.insert("san", ("s", "an")); |
| | m.insert("si", ("s", "i")); |
| | m.insert("wu", ("", "u")); |
| | m.insert("liu", ("l", "iu")); |
| | m.insert("qi", ("q", "i")); |
| | m.insert("jiu", ("j", "iu")); |
| | m |
| | }; |
| | } |
| |
|
| | |
| | pub fn g2p_english(word: &str) -> Vec<String> { |
| | let lower = word.to_lowercase(); |
| |
|
| | if let Some(phones) = G2P_DICT.get(lower.as_str()) { |
| | phones.iter().map(|s| s.to_string()).collect() |
| | } else { |
| | |
| | word.chars() |
| | .map(|c| c.to_uppercase().to_string()) |
| | .collect() |
| | } |
| | } |
| |
|
| | |
| | pub fn text_to_phonemes(text: &str) -> Vec<String> { |
| | let mut phonemes = Vec::new(); |
| |
|
| | let words: Vec<&str> = text.split_whitespace().collect(); |
| |
|
| | for (i, word) in words.iter().enumerate() { |
| | let clean_word: String = word |
| | .chars() |
| | .filter(|c| c.is_alphabetic()) |
| | .collect(); |
| |
|
| | if !clean_word.is_empty() { |
| | phonemes.extend(g2p_english(&clean_word)); |
| | } |
| |
|
| | |
| | if i < words.len() - 1 { |
| | phonemes.push(" ".to_string()); |
| | } |
| | } |
| |
|
| | phonemes |
| | } |
| |
|
| | |
/// Tone category attached to a pinyin syllable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Tone {
    /// Tone number 1.
    First,
    /// Tone number 2.
    Second,
    /// Tone number 3.
    Third,
    /// Tone number 4.
    Fourth,
    /// No explicit tone (rendered as `5` when converted to phones).
    Neutral,
}
| |
|
| | |
| | pub fn extract_tone(pinyin: &str) -> (String, Tone) { |
| | let tone_marks = [ |
| | ('ā', 'a', Tone::First), |
| | ('á', 'a', Tone::Second), |
| | ('ǎ', 'a', Tone::Third), |
| | ('à', 'a', Tone::Fourth), |
| | ('ē', 'e', Tone::First), |
| | ('é', 'e', Tone::Second), |
| | ('ě', 'e', Tone::Third), |
| | ('è', 'e', Tone::Fourth), |
| | ('ī', 'i', Tone::First), |
| | ('í', 'i', Tone::Second), |
| | ('ǐ', 'i', Tone::Third), |
| | ('ì', 'i', Tone::Fourth), |
| | ('ō', 'o', Tone::First), |
| | ('ó', 'o', Tone::Second), |
| | ('ǒ', 'o', Tone::Third), |
| | ('ò', 'o', Tone::Fourth), |
| | ('ū', 'u', Tone::First), |
| | ('ú', 'u', Tone::Second), |
| | ('ǔ', 'u', Tone::Third), |
| | ('ù', 'u', Tone::Fourth), |
| | ('ǖ', 'ü', Tone::First), |
| | ('ǘ', 'ü', Tone::Second), |
| | ('ǚ', 'ü', Tone::Third), |
| | ('ǜ', 'ü', Tone::Fourth), |
| | ]; |
| |
|
| | let mut result = pinyin.to_string(); |
| | let mut tone = Tone::Neutral; |
| |
|
| | for (marked, plain, t) in tone_marks.iter() { |
| | if result.contains(*marked) { |
| | result = result.replace(*marked, &plain.to_string()); |
| | tone = *t; |
| | break; |
| | } |
| | } |
| |
|
| | |
| | if let Some(last_char) = result.chars().last() { |
| | if last_char.is_ascii_digit() { |
| | let tone_num = last_char.to_digit(10).unwrap_or(5); |
| | tone = match tone_num { |
| | 1 => Tone::First, |
| | 2 => Tone::Second, |
| | 3 => Tone::Third, |
| | 4 => Tone::Fourth, |
| | _ => Tone::Neutral, |
| | }; |
| | result.pop(); |
| | } |
| | } |
| |
|
| | (result, tone) |
| | } |
| |
|
| | |
| | pub fn pinyin_to_phones(pinyin: &str) -> Vec<String> { |
| | let (base, tone) = extract_tone(pinyin); |
| | let lower = base.to_lowercase(); |
| |
|
| | let mut phones = Vec::new(); |
| |
|
| | if let Some(&(initial, final_part)) = PINYIN_MAP.get(lower.as_str()) { |
| | if !initial.is_empty() { |
| | phones.push(initial.to_string()); |
| | } |
| | phones.push(final_part.to_string()); |
| | } else { |
| | |
| | phones.push(lower); |
| | } |
| |
|
| | |
| | let tone_str = match tone { |
| | Tone::First => "1", |
| | Tone::Second => "2", |
| | Tone::Third => "3", |
| | Tone::Fourth => "4", |
| | Tone::Neutral => "5", |
| | }; |
| | phones.push(tone_str.to_string()); |
| |
|
| | phones |
| | } |
| |
|
| | |
/// Looks up the numbered-pinyin reading of a single Chinese character.
///
/// Only the small built-in table below is consulted; any character that is
/// not listed yields `None`. Readings carry the tone as a trailing digit
/// (`5` marks the neutral tone), e.g. `'你'` -> `"ni3"`.
pub fn char_to_pinyin(ch: char) -> Option<String> {
    // A `match` keeps the table in static code instead of allocating and
    // hashing a fresh map on every call (this runs once per input character).
    let reading = match ch {
        '你' => "ni3",
        '好' => "hao3",
        '世' => "shi4",
        '界' => "jie4",
        '中' => "zhong1",
        '国' => "guo2",
        '人' => "ren2",
        '我' => "wo3",
        '是' => "shi4",
        '的' => "de5",
        '了' => "le5",
        '在' => "zai4",
        '有' => "you3",
        '个' => "ge4",
        '这' => "zhe4",
        '他' => "ta1",
        '说' => "shuo1",
        '来' => "lai2",
        '要' => "yao4",
        '就' => "jiu4",
        '出' => "chu1",
        '会' => "hui4",
        '可' => "ke3",
        '以' => "yi3",
        '时' => "shi2",
        '大' => "da4",
        '看' => "kan4",
        '地' => "di4",
        '不' => "bu4",
        '对' => "dui4",
        _ => return None,
    };

    Some(reading.to_string())
}
| |
|
| | |
| | pub fn segment_chinese(text: &str) -> Vec<String> { |
| | use jieba_rs::Jieba; |
| |
|
| | let jieba = Jieba::new(); |
| | let words = jieba.cut(text, false); |
| | words.into_iter().map(|s| s.to_string()).collect() |
| | } |
| |
|
| | |
| | pub fn chinese_to_pinyin(text: &str) -> Vec<String> { |
| | let mut pinyin_seq = Vec::new(); |
| |
|
| | for ch in text.chars() { |
| | if super::is_chinese_char(ch) { |
| | if let Some(py) = char_to_pinyin(ch) { |
| | pinyin_seq.push(py); |
| | } else { |
| | |
| | pinyin_seq.push(format!("_{}_", ch)); |
| | } |
| | } else if !ch.is_whitespace() { |
| | pinyin_seq.push(ch.to_string()); |
| | } |
| | } |
| |
|
| | pinyin_seq |
| | } |
| |
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A dictionary word resolves to its stored phoneme sequence.
    #[test]
    fn test_g2p_english() {
        assert_eq!(g2p_english("hello"), ["HH", "AH0", "L", "OW1"]);
    }

    /// A word missing from the dictionary is spelled out in upper case.
    #[test]
    fn test_g2p_unknown() {
        let spelled = g2p_english("xyz");
        assert_eq!(spelled, ["X", "Y", "Z"]);
    }

    /// Both the diacritic and the trailing-digit notation are understood.
    #[test]
    fn test_extract_tone() {
        let (from_diacritic, diacritic_tone) = extract_tone("nǐ");
        assert_eq!(from_diacritic, "ni");
        assert_eq!(diacritic_tone, Tone::Third);

        let (from_digit, digit_tone) = extract_tone("hao3");
        assert_eq!(from_digit, "hao");
        assert_eq!(digit_tone, Tone::Third);
    }

    /// Initial, final and tone digit all appear in the phone list.
    #[test]
    fn test_pinyin_to_phones() {
        let phones = pinyin_to_phones("hao3");
        for expected in &["h", "ao", "3"] {
            assert!(phones.iter().any(|phone| phone.as_str() == *expected));
        }
    }

    /// Characters from the built-in table map to numbered pinyin.
    #[test]
    fn test_char_to_pinyin() {
        let cases = [('你', "ni3"), ('好', "hao3")];
        for &(hanzi, reading) in cases.iter() {
            assert_eq!(char_to_pinyin(hanzi), Some(reading.to_string()));
        }
    }

    /// A four-character phrase is split into at least two segments.
    #[test]
    fn test_segment_chinese() {
        let segment_count = segment_chinese("你好世界").len();
        assert!(segment_count >= 2);
    }
}
| |
|