// forked from Qortal/Brooklyn
//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include <algorithm>
#include <cmath>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

#include "CmdArgsParser.hpp"
#include "ArmnnNetworkExecutor.hpp"
#include "AudioCapture.hpp"
#include "SpeechRecognitionPipeline.hpp"
#include "Wav2LetterMFCC.hpp"
// Raw int8 (quantized) output of one model output tensor.
using InferenceResult = std::vector<int8_t>;
// Outputs of all model output tensors for a single inference pass.
using InferenceResults = std::vector<InferenceResult>;
// Command-line option names accepted by this executable.
// NOTE(review): LABEL_PATH and HELP are declared but not registered in
// CMD_OPTIONS below — presumably HELP is handled by the parser itself and
// labels are hard-coded in this file; confirm against CmdArgsParser.hpp.
const std::string AUDIO_FILE_PATH    = "--audio-file-path";
const std::string MODEL_FILE_PATH    = "--model-file-path";
const std::string LABEL_PATH         = "--label-path";
const std::string PREFERRED_BACKENDS = "--preferred-backends";
const std::string HELP               = "--help";
// Maps the Wav2Letter model's output class indices to characters:
// indices 0-25 are 'a'-'z', 26 is an apostrophe, 27 is a space and
// 28 is "$" (presumably the CTC blank symbol — confirm against the
// Wav2Letter model definition).
std::map<int, std::string> labels =
{
    {0, "a"},  {1, "b"},  {2, "c"},  {3, "d"},  {4, "e"},
    {5, "f"},  {6, "g"},  {7, "h"},  {8, "i"},  {9, "j"},
    {10, "k"}, {11, "l"}, {12, "m"}, {13, "n"}, {14, "o"},
    {15, "p"}, {16, "q"}, {17, "r"}, {18, "s"}, {19, "t"},
    {20, "u"}, {21, "v"}, {22, "w"}, {23, "x"}, {24, "y"},
    {25, "z"},
    {26, "\'"},
    {27, " "},
    {28, "$"}
};
/*
|
|
* The accepted options for this Speech Recognition executable
|
|
*/
|
|
static std::map<std::string, std::string> CMD_OPTIONS =
|
|
{
|
|
{AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
|
|
{MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
|
|
{PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
|
|
" For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
|
|
" Defaults to CpuAcc,CpuRef"}
|
|
};
|
|
|
|
/*
|
|
* Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
|
|
*/
|
|
std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
|
|
{
|
|
std::vector<armnn::BackendId> backends;
|
|
std::stringstream ss(preferredBackends);
|
|
|
|
while (ss.good())
|
|
{
|
|
std::string backend;
|
|
std::getline(ss, backend, ',');
|
|
backends.emplace_back(backend);
|
|
}
|
|
return backends;
|
|
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
bool isFirstWindow = true;
|
|
std::string currentRContext = "";
|
|
|
|
std::map<std::string, std::string> options;
|
|
|
|
int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
|
|
if (result != 0)
|
|
{
|
|
return result;
|
|
}
|
|
|
|
// Create the network options
|
|
common::PipelineOptions pipelineOptions;
|
|
pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
|
|
pipelineOptions.m_ModelName = "Wav2Letter";
|
|
if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
|
|
{
|
|
pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
|
|
}
|
|
else
|
|
{
|
|
pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
|
|
}
|
|
|
|
asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
|
|
|
|
audio::AudioCapture capture;
|
|
std::vector<float> audioData = audio::AudioCapture::LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
|
|
capture.InitSlidingWindow(audioData.data(), audioData.size(), asrPipeline->getInputSamplesSize(),
|
|
asrPipeline->getSlidingWindowOffset());
|
|
|
|
while (capture.HasNext())
|
|
{
|
|
std::vector<float> audioBlock = capture.Next();
|
|
InferenceResults results;
|
|
|
|
std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing(audioBlock);
|
|
asrPipeline->Inference<int8_t>(preprocessedData, results);
|
|
asrPipeline->PostProcessing<int8_t>(results, isFirstWindow, !capture.HasNext(), currentRContext);
|
|
}
|
|
|
|
return 0;
|
|
} |