nshmyrevgmail committed
Commit 870988c · Parent(s): 0b81d49

vosk-model-ru-0.54

am-onnx/decoder.int8.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5b83b80d557538517f010f64e1961ba57fd9de2b93bf73b77b8d27f0ac28160f
-size 1307950
+oid sha256:6380fc4c6dd867b3d263aef71abe8de5a5785b2fcf0e5d619b4ccc2df2119d4f
+size 540689
am-onnx/decoder.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3977101b53640e50dff4274988ef754cd7ea539ae480baed0d18fc4966ad6917
-size 2093079
+oid sha256:dcbe1ffa0211e77ca6d3a80164df13fbda3ec00e47d12b9f449f89572df12136
+size 2093080
am-onnx/encoder.int8.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69a347ddf7e5d39452d17f059dfedcd8c9471ca1953c320cb04ca19626149fdb
-size 68187180
+oid sha256:eb6c12fbad810d5bc3e427802e604604c69b5943a91feebc43424dd09d9ec407
+size 70876638
am-onnx/encoder.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a19ffacdc7f67e803ece039c4dc31c15060961823bd16401e2adba4b147b7d1
-size 259207382
+oid sha256:8bca034acab837e4b30625f4101b27385c8553ea44abfa5bd89c4581667f250c
+size 261058126
am-onnx/joiner.int8.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6f550b56375326ee4a67dcac2cac027035e2dc137c08fff3557f4ffdc5cbc3e
-size 259572
+oid sha256:93f2e1d12b78d53e7802f1606488c14bb3d764b15fadf5ef6c022f6ba1fa40f7
+size 259417
am-onnx/joiner.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7134eb50ea3d57bdabc22f2c3b9c7fd761d51b51a4534dc5694297e02ed735ed
-size 1026461
+oid sha256:6d94a1c4273ad750d98cbe89320a5b1860143059162fb8407cc22706bcfe5835
+size 1026462
am/jit_script.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5b64964bad8c24fe48f5d9c0ffe98c4787495ea991a186f00b059fc3fa549c9
-size 264940286
+oid sha256:91323267f4a017096429d16783ccfd9366bc005b3447b0a78d4865ded08652fc
+size 265975361
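
Every weight file above is a Git LFS pointer: a three-line text file whose sha256 oid and byte size are all that change in this commit. If the blobs are fetched some way other than `git lfs pull`, a small check like the following sketch can confirm a download matches its pointer (the helper and the file paths are hypothetical, not part of this repository):

#!/usr/bin/env python3
# Hypothetical check: confirm a downloaded blob matches its LFS pointer.
# Pointer format (per the diffs above): version / oid sha256:<hex> / size <bytes>
import hashlib
import os
import sys


def parse_pointer(pointer_path):
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields["oid"].split(":", 1)[1], int(fields["size"])


def verify(blob_path, pointer_path):
    oid, size = parse_pointer(pointer_path)
    if os.path.getsize(blob_path) != size:
        return False
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid


if __name__ == "__main__":
    # e.g. verify.py am-onnx/encoder.onnx encoder.onnx.pointer
    print(verify(sys.argv[1], sys.argv[2]))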
decode-onnx.py ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+import wave
+from pathlib import Path
+from typing import Tuple
+import sys
+import numpy as np
+import sherpa_onnx
+
+
+def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+    with wave.open(wave_filename) as f:
+        assert f.getnchannels() == 1, f.getnchannels()
+        assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
+        num_samples = f.getnframes()
+        samples = f.readframes(num_samples)
+        samples_int16 = np.frombuffer(samples, dtype=np.int16)
+        samples_float32 = samples_int16.astype(np.float32)
+        samples_float32 = samples_float32 / 32768
+        return samples_float32, f.getframerate()
+
+
+def main():
+    recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+        encoder="am-onnx/encoder.onnx",
+        decoder="am-onnx/decoder.onnx",
+        joiner="am-onnx/joiner.onnx",
+        tokens="lang/tokens.txt",
+        num_threads=0,
+        provider='cpu',
+        sample_rate=16000,
+        dither=3e-5,
+        max_active_paths=10,
+        decoding_method="modified_beam_search")
+
+    samples, sample_rate = read_wave(sys.argv[1])
+    s = recognizer.create_stream()
+    s.accept_waveform(sample_rate, samples)
+    recognizer.decode_stream(s)
+    print("Text:", s.result.text)
+    print("Tokens:", s.result.tokens)
+    print("Timestamps:", s.result.timestamps)
+
+
+if __name__ == "__main__":
+    main()
+
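
read_wave() in the script above asserts a mono, 16-bit wav, and the recognizer is configured for 16 kHz. A sketch for getting arbitrary audio into that shape with torchaudio (already a dependency of decode.py below); the input filename is hypothetical:

# Hypothetical preprocessing for decode-onnx.py: produce the 16 kHz,
# mono, 16-bit PCM wav that read_wave() expects.
import torchaudio

wav, sr = torchaudio.load("input.mp3")                # any format torchaudio reads
wav = wav.mean(dim=0, keepdim=True)                   # downmix to a single channel
wav = torchaudio.functional.resample(wav, sr, 16000)  # match the model sample rate
torchaudio.save("input16k.wav", wav, 16000, encoding="PCM_S", bits_per_sample=16)

After which `python3 decode-onnx.py input16k.wav` prints the decoded text, tokens, and token timestamps.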
decode.py CHANGED
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 # Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, Zengwei Yao)
 #
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,11 +14,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+This script loads torchscript models, exported by `torch.jit.script()`
+and uses them to decode waves.
+You can use the following command to get the exported models:
+
+./zipformer/export.py \
+  --exp-dir ./zipformer/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+
+Usage of this script:
+
+./zipformer/jit_pretrained.py \
+  --nn-model-filename ./zipformer/exp/cpu_jit.pt \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+"""
 
-import sys
 import argparse
 import logging
 import math
+import random
+import os
+import sys
 import warnings
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple, Union
@@ -25,8 +49,8 @@ import kaldifeat
 import sentencepiece as spm
 import torch
 import torchaudio
-
 from torch.nn.utils.rnn import pad_sequence
+from timeit import default_timer as timer
 
 from icefall import NgramLm, NgramLmStateCost
 from icefall.decode import Nbest, one_best_decoding
@@ -38,6 +62,37 @@ from icefall.lexicon import Lexicon
 
 import k2
 
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--nn-model-filename",
+        default='am/jit_script.pt',
+        type=str,
+        help="Path to the torchscript model cpu_jit.pt",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        default='lang/bpe.model',
+        type=str,
+        help="""Path to bpe.model.""",
+    )
+
+    parser.add_argument(
+        "sound_files",
+        type=str,
+        nargs="+",
+        help="The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz.",
+    )
+
+    return parser
+
 
 def read_sound_files(
     filenames: List[str], expected_sample_rate: float = 16000
@@ -59,6 +114,8 @@ def read_sound_files(
         ans.append(wav)
     return ans
 
+
+
 @dataclass
 class Hypothesis:
     # The predicted tokens so far.
@@ -299,7 +356,7 @@ def modified_beam_search_LODR(
     for i in range(N):
         B[i].add(
             Hypothesis(
-                ys=[blank_id] * context_size,
+                ys=([-1] * (context_size - 1) + [blank_id]),
                 log_prob=torch.zeros(1, dtype=torch.float32, device=device),
                 state=init_states,  # state of the NN LM
                 lm_score=init_score.reshape(-1),
@@ -501,132 +558,41 @@ def modified_beam_search_LODR(
     return ans
 
 
-def greedy_search(
-    model: torch.jit.ScriptModule,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-) -> List[List[int]]:
-    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        A 3-D tensor of shape (N, T, C)
-      encoder_out_lens:
-        A 1-D tensor of shape (N,).
-    Returns:
-      Return the decoded results for each utterance.
-    """
-    assert encoder_out.ndim == 3
-    assert encoder_out.size(0) >= 1, encoder_out.size(0)
-
-    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
-        input=encoder_out,
-        lengths=encoder_out_lens.cpu(),
-        batch_first=True,
-        enforce_sorted=False,
-    )
-
-    device = encoder_out.device
-    blank_id = 0  # hard-code to 0
-
-    batch_size_list = packed_encoder_out.batch_sizes.tolist()
-    N = encoder_out.size(0)
-
-    assert torch.all(encoder_out_lens > 0), encoder_out_lens
-    assert N == batch_size_list[0], (N, batch_size_list)
-
-    context_size = model.decoder.context_size
-    hyps = [[blank_id] * context_size for _ in range(N)]
-
-    decoder_input = torch.tensor(
-        hyps,
-        device=device,
-        dtype=torch.int64,
-    )  # (N, context_size)
-
-    decoder_out = model.decoder(
-        decoder_input,
-        need_pad=torch.tensor([False]),
-    ).squeeze(1)
-
-    offset = 0
-    for batch_size in batch_size_list:
-        start = offset
-        end = offset + batch_size
-        current_encoder_out = packed_encoder_out.data[start:end]
-        current_encoder_out = current_encoder_out
-        # current_encoder_out's shape: (batch_size, encoder_out_dim)
-        offset = end
-
-        decoder_out = decoder_out[:batch_size]
-
-        logits = model.joiner(
-            current_encoder_out,
-            decoder_out,
-        )
-        # logits'shape (batch_size, vocab_size)
-
-        assert logits.ndim == 2, logits.shape
-        y = logits.argmax(dim=1).tolist()
-        emitted = False
-        for i, v in enumerate(y):
-            if v != blank_id:
-                hyps[i].append(v)
-                emitted = True
-        if emitted:
-            # update decoder output
-            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
-            decoder_input = torch.tensor(
-                decoder_input,
-                device=device,
-                dtype=torch.int64,
-            )
-            decoder_out = model.decoder(
-                decoder_input,
-                need_pad=torch.tensor([False]),
-            )
-            decoder_out = decoder_out.squeeze(1)
-
-    sorted_ans = [h[context_size:] for h in hyps]
-    ans = []
-    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
-    for i in range(N):
-        ans.append(sorted_ans[unsorted_indices[i]])
-
-    return ans
-
-
 @torch.no_grad()
 def main():
-    torch.set_num_threads(8)
+
+    torch.set_num_threads(4)
+
+    parser = get_parser()
+    args = parser.parse_args()
 
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
 
-    model = torch.jit.load("am/jit_script.pt")
-
+    model = torch.jit.load(args.nn_model_filename)
     model.eval()
-
     model.to(device)
 
     sp = spm.SentencePieceProcessor()
-    sp.load("lang/bpe.model")
+    sp.load(args.bpe_model)
+
+    random.seed(17)
 
     opts = kaldifeat.FbankOptions()
     opts.device = device
-    opts.frame_opts.dither = 0
+    opts.frame_opts.dither = 3e-5
     opts.frame_opts.snip_edges = False
     opts.frame_opts.samp_freq = 16000
     opts.mel_opts.num_bins = 80
+    opts.mel_opts.high_freq = -400
 
     fbank = kaldifeat.Fbank(opts)
 
     all_filenames = sys.argv[1:]
 
     params = AttributeDict()
-    params.vocab_size = 500
+    params.lm_vocab_size = 500
     params.rnn_lm_embedding_dim = 2048
    params.rnn_lm_hidden_dim = 2048
    params.rnn_lm_num_layers = 3
@@ -651,8 +617,15 @@ def main():
     )
     ngram_lm_scale = -0.1
 
-    for i in range(0, len(all_filenames), 16):
-        filenames = all_filenames[i:i+16]
+    start_time = timer()
+    samples = 0
+
+    for f in all_filenames:
+        samples = samples + os.path.getsize(f) / 2
+
+    batch_size = 8
+    for i in range(0, len(all_filenames), batch_size):
+        filenames = all_filenames[i:i+batch_size]
         waves = read_sound_files(
             filenames=filenames,
         )
@@ -684,9 +657,19 @@ def main():
             LM=LM,
         )
 
+
         for f, hyp in zip(filenames, hyps):
             words = sp.decode(hyp)
-            print(f"{f.split('/')[-1][0:-4]} {words}")
+            print(f"{f.split('/')[-1][0:-4]} {words}", flush=True)
+
+    end_time = timer()
+
+    print("Processed %.3f seconds of audio in %.3f seconds (%.3f xRT)"
+          % (samples / 16000.0,
+             end_time - start_time,
+             (end_time - start_time) / (samples / 16000.0)),
+          file=sys.stderr)
+
 
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
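
The timing block added to decode.py estimates audio duration from file size alone: 16-bit PCM at 16 kHz is 2 bytes per sample, so seconds = bytes / 2 / 16000 (the small wav header is ignored, which is negligible for long recordings). A worked example of the xRT figure with made-up numbers:

# Illustration of the xRT estimate in decode.py (all numbers hypothetical).
file_size = 10_240_044              # bytes of wav on disk
samples = file_size / 2             # 16-bit PCM: 2 bytes per sample
audio_seconds = samples / 16000.0   # ~320 s of audio at 16 kHz
elapsed = 32.0                      # wall-clock decoding time in seconds
print("%.3f xRT" % (elapsed / audio_seconds))  # 0.100 xRT: 10x faster than real time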