Introduction

Voice interfaces have moved from novelty to expectation. Users now assume they can talk to apps—dictating messages, searching by voice, and having conversations translated in real-time. The underlying technology has matured to the point where adding these features is straightforward rather than a research project.

This guide covers practical implementations of voice recognition and translation features in mobile apps, with code examples you can adapt for your projects.

Voice Recognition Options

On-Device vs Cloud

The first decision is where speech processing happens.

On-device processing:

  • Fast response (no network latency)
  • Works offline
  • Complete privacy (audio never leaves device)
  • Limited by device capabilities
  • Models can be large (40MB-300MB)

Cloud processing:

  • Better accuracy, especially for edge cases
  • Supports more languages
  • Smaller app size
  • Requires network connectivity
  • Audio is sent to third-party servers

Most production apps use a hybrid: on-device for common cases with cloud fallback for accuracy-critical scenarios.
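
As a rough sketch of that hybrid shape (in TypeScript for a React Native-style app; transcribeOnDevice and transcribeInCloud are stand-ins for whatever clients you wire up, such as the Whisper service and cloud endpoint shown later in this guide):

// Hybrid sketch: prefer on-device transcription, fall back to the cloud
type Transcriber = (audioPath: string, language: string) => Promise<string>;

async function transcribeHybrid(
  audioPath: string,
  language: string,
  transcribeOnDevice: Transcriber,
  transcribeInCloud: Transcriber
): Promise<string> {
  try {
    // Fast, offline, and private, but assumes a local model is installed
    return await transcribeOnDevice(audioPath, language);
  } catch {
    // Better accuracy and language coverage, but needs connectivity
    return await transcribeInCloud(audioPath, language);
  }
}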

Technology Options

| Option | Type | Pros | Cons |
| --- | --- | --- | --- |
| Apple Speech | On-device | Free, native iOS | iOS only, limited customisation |
| Google ML Kit | On-device | Free, cross-platform | Fewer languages than cloud |
| Whisper (local) | On-device | Excellent accuracy | Large model size |
| Google Cloud STT | Cloud | Most languages, best accuracy | Per-minute cost |
| OpenAI Whisper API | Cloud | Great accuracy, simple API | Per-minute cost |
| AWS Transcribe | Cloud | Good AWS integration | More complex pricing |

Implementing Voice Recognition

iOS with Apple Speech Framework

Apple’s built-in speech recognition is free and works on-device for supported languages.

import Speech
import AVFoundation

class VoiceRecognitionService: ObservableObject {
    @Published var transcript = ""
    @Published var isListening = false
    @Published var error: String?

    private let speechRecognizer: SFSpeechRecognizer?
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private let audioEngine = AVAudioEngine()

    init(locale: Locale = .current) {
        speechRecognizer = SFSpeechRecognizer(locale: locale)
    }

    func requestAuthorization() async -> Bool {
        await withCheckedContinuation { continuation in
            SFSpeechRecognizer.requestAuthorization { status in
                continuation.resume(returning: status == .authorized)
            }
        }
    }

    func startListening() async throws {
        guard let speechRecognizer = speechRecognizer,
              speechRecognizer.isAvailable else {
            throw VoiceError.recognizerUnavailable
        }

        // Configure audio session
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)

        // Create recognition request
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else {
            throw VoiceError.requestCreationFailed
        }

        // Enable on-device recognition when available
        if speechRecognizer.supportsOnDeviceRecognition {
            recognitionRequest.requiresOnDeviceRecognition = true
        }

        recognitionRequest.shouldReportPartialResults = true
        recognitionRequest.taskHint = .dictation

        // Start recognition task
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { [weak self] result, error in
            guard let self = self else { return }

            if let result = result {
                DispatchQueue.main.async {
                    self.transcript = result.bestTranscription.formattedString
                }
            }

            if error != nil || result?.isFinal == true {
                self.stopListening()
            }
        }

        // Configure audio input
        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)

        // Stream microphone buffers into the recognition request
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { [weak self] buffer, _ in
            self?.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        try audioEngine.start()

        DispatchQueue.main.async {
            self.isListening = true
        }
    }

    func stopListening() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        recognitionRequest?.endAudio()
        recognitionTask?.cancel()
        recognitionRequest = nil
        recognitionTask = nil

        DispatchQueue.main.async {
            self.isListening = false
        }
    }
}

enum VoiceError: Error {
    case recognizerUnavailable
    case requestCreationFailed
    case notAuthorized
}
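
One practical note: the class above assumes the app is allowed to record and transcribe. Add NSSpeechRecognitionUsageDescription and NSMicrophoneUsageDescription entries to Info.plist; iOS requires both before the permission prompts triggered by requestAuthorization() can be shown.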

Android with Google Speech Recognition

import android.content.Context
import android.content.Intent
import android.os.Bundle
import android.speech.RecognitionListener
import android.speech.RecognizerIntent
import android.speech.SpeechRecognizer
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.StateFlow

class VoiceRecognitionService(private val context: Context) {

    private var speechRecognizer: SpeechRecognizer? = null

    private val _transcript = MutableStateFlow("")
    val transcript: StateFlow<String> = _transcript

    private val _isListening = MutableStateFlow(false)
    val isListening: StateFlow<Boolean> = _isListening

    private val _error = MutableStateFlow<String?>(null)
    val error: StateFlow<String?> = _error

    fun initialize() {
        if (!SpeechRecognizer.isRecognitionAvailable(context)) {
            _error.value = "Speech recognition not available"
            return
        }

        speechRecognizer = SpeechRecognizer.createSpeechRecognizer(context)
        speechRecognizer?.setRecognitionListener(createListener())
    }

    fun startListening(languageCode: String = "en-AU") {
        val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply {
            putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL,
                    RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
            putExtra(RecognizerIntent.EXTRA_LANGUAGE, languageCode)
            putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
            putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1)
        }

        speechRecognizer?.startListening(intent)
        _isListening.value = true
    }

    fun stopListening() {
        speechRecognizer?.stopListening()
        _isListening.value = false
    }

    fun destroy() {
        speechRecognizer?.destroy()
        speechRecognizer = null
    }

    private fun createListener() = object : RecognitionListener {
        override fun onReadyForSpeech(params: Bundle?) {
            _error.value = null
        }

        override fun onBeginningOfSpeech() {}

        override fun onRmsChanged(rmsdB: Float) {}

        override fun onBufferReceived(buffer: ByteArray?) {}

        override fun onEndOfSpeech() {
            _isListening.value = false
        }

        override fun onError(error: Int) {
            _isListening.value = false
            _error.value = when (error) {
                SpeechRecognizer.ERROR_AUDIO -> "Audio recording error"
                SpeechRecognizer.ERROR_CLIENT -> "Client error"
                SpeechRecognizer.ERROR_NETWORK -> "Network error"
                SpeechRecognizer.ERROR_NO_MATCH -> "No speech detected"
                SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Speech timeout"
                else -> "Recognition error: $error"
            }
        }

        override fun onResults(results: Bundle?) {
            val matches = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
            _transcript.value = matches?.firstOrNull() ?: ""
        }

        override fun onPartialResults(partialResults: Bundle?) {
            val matches = partialResults?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
            matches?.firstOrNull()?.let { _transcript.value = it }
        }

        override fun onEvent(eventType: Int, params: Bundle?) {}
    }
}
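
On Android, the service above also assumes the RECORD_AUDIO permission is declared in the manifest and granted at runtime before startListening() is called.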

Using Whisper for Higher Accuracy

OpenAI’s Whisper model offers excellent accuracy. You can run it on-device or use the API.

On-device with whisper.cpp (React Native):

// Using react-native-whisper library
import { initWhisper, transcribe } from 'react-native-whisper';

class WhisperService {
  constructor() {
    this.whisperContext = null;
  }

  async initialize(modelPath) {
    // Download model from your server or bundle it
    // Models: tiny (39MB), base (74MB), small (244MB)
    this.whisperContext = await initWhisper({
      filePath: modelPath,
    });
  }

  async transcribe(audioPath, language = 'en') {
    if (!this.whisperContext) {
      throw new Error('Whisper not initialized');
    }

    const result = await transcribe(this.whisperContext, audioPath, {
      language,
      translate: false,
      maxLen: 0, // No max length
      tokenTimestamps: false,
    });

    return result.text;
  }

  destroy() {
    if (this.whisperContext) {
      this.whisperContext.release();
      this.whisperContext = null;
    }
  }
}

Cloud API approach:

// Backend service
import express from 'express';
import multer from 'multer';
import OpenAI from 'openai';
import fs from 'fs';

const app = express();
const upload = multer({ dest: 'uploads/' }); // Temporary storage for uploaded audio
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

async function transcribeAudio(audioFilePath: string, language?: string) {
  const audioFile = fs.createReadStream(audioFilePath);

  const transcription = await openai.audio.transcriptions.create({
    file: audioFile,
    model: 'whisper-1',
    language, // Optional: ISO-639-1 code
    response_format: 'json',
  });

  return transcription.text;
}

// Express endpoint
app.post('/api/transcribe', upload.single('audio'), async (req, res) => {
  try {
    const text = await transcribeAudio(req.file.path, req.body.language);
    res.json({ text });
  } catch (error) {
    res.status(500).json({ error: error.message });
  } finally {
    // Clean up uploaded file
    fs.unlinkSync(req.file.path);
  }
});
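
On the client side, a minimal sketch of how a React Native app might call this endpoint (the baseUrl parameter and the file name/MIME type are assumptions to adapt to your recorder):

// React Native client sketch: upload a recorded audio file to the backend
async function transcribeOnServer(
  baseUrl: string,
  audioPath: string,
  language?: string
): Promise<string> {
  const formData = new FormData();
  // React Native's FormData accepts { uri, name, type } descriptors for files
  formData.append('audio', {
    uri: audioPath,
    name: 'recording.m4a',
    type: 'audio/m4a',
  } as any);
  if (language) {
    formData.append('language', language);
  }

  const response = await fetch(`${baseUrl}/api/transcribe`, {
    method: 'POST',
    body: formData,
  });

  if (!response.ok) {
    throw new Error(`Transcription failed with status ${response.status}`);
  }

  const { text } = await response.json();
  return text;
}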

Adding Real-Time Translation

Once you have text, translation is the next step.

Google Cloud Translation

// Backend translation service
import { TranslationServiceClient } from '@google-cloud/translate';

const translationClient = new TranslationServiceClient();
const projectId = process.env.GOOGLE_CLOUD_PROJECT;

async function translateText(
  text: string,
  targetLanguage: string,
  sourceLanguage?: string
): Promise<TranslationResult> {
  const request = {
    parent: `projects/${projectId}/locations/global`,
    contents: [text],
    mimeType: 'text/plain',
    targetLanguageCode: targetLanguage,
    sourceLanguageCode: sourceLanguage, // Optional: auto-detect if not provided
  };

  const [response] = await translationClient.translateText(request);
  const translation = response.translations?.[0];

  return {
    translatedText: translation?.translatedText || '',
    detectedSourceLanguage: translation?.detectedLanguageCode,
  };
}

interface TranslationResult {
  translatedText: string;
  detectedSourceLanguage?: string;
}

// API endpoint
app.post('/api/translate', async (req, res) => {
  const { text, targetLanguage, sourceLanguage } = req.body;

  try {
    const result = await translateText(text, targetLanguage, sourceLanguage);
    res.json(result);
  } catch (error) {
    res.status(500).json({ error: 'Translation failed' });
  }
});

On-Device Translation with ML Kit

Google’s ML Kit offers on-device translation for common language pairs.

import com.google.mlkit.common.model.DownloadConditions
import com.google.mlkit.nl.translate.TranslateLanguage
import com.google.mlkit.nl.translate.Translation
import com.google.mlkit.nl.translate.TranslatorOptions
import kotlinx.coroutines.tasks.await

class TranslationService {

    private val translators = mutableMapOf<String, com.google.mlkit.nl.translate.Translator>()

    suspend fun downloadModel(languageCode: String): Boolean {
        val translator = getOrCreateTranslator(TranslateLanguage.ENGLISH, languageCode)

        val conditions = DownloadConditions.Builder()
            .requireWifi()
            .build()

        return try {
            translator.downloadModelIfNeeded(conditions).await()
            true
        } catch (e: Exception) {
            false
        }
    }

    suspend fun translate(
        text: String,
        sourceLanguage: String,
        targetLanguage: String
    ): String {
        val translator = getOrCreateTranslator(sourceLanguage, targetLanguage)
        return translator.translate(text).await()
    }

    private fun getOrCreateTranslator(
        sourceLanguage: String,
        targetLanguage: String
    ): com.google.mlkit.nl.translate.Translator {
        val key = "$sourceLanguage-$targetLanguage"

        return translators.getOrPut(key) {
            val options = TranslatorOptions.Builder()
                .setSourceLanguage(sourceLanguage)
                .setTargetLanguage(targetLanguage)
                .build()
            Translation.getClient(options)
        }
    }

    fun close() {
        translators.values.forEach { it.close() }
        translators.clear()
    }
}

Building a Voice-to-Voice Translation Feature

Combining speech recognition, translation, and text-to-speech creates a voice translation feature.

// iOS implementation
import AVFoundation

class VoiceTranslationService: ObservableObject {
    private let speechRecognition = VoiceRecognitionService()
    // Placeholder for your translation client (e.g. a wrapper around the /api/translate endpoint)
    private let translator = TranslationService()
    private let synthesizer = AVSpeechSynthesizer()

    @Published var sourceText = ""
    @Published var translatedText = ""
    @Published var state: TranslationState = .idle

    enum TranslationState {
        case idle
        case listening
        case translating
        case speaking
    }

    func translate(from sourceLanguage: String, to targetLanguage: String) async {
        state = .listening

        do {
            // Step 1: Listen for speech
            try await speechRecognition.startListening()

            // Wait for speech to complete (simplified - real app would use voice activity detection)
            try await Task.sleep(nanoseconds: 5_000_000_000)
            speechRecognition.stopListening()

            sourceText = speechRecognition.transcript

            if sourceText.isEmpty {
                state = .idle
                return
            }

            // Step 2: Translate
            state = .translating
            translatedText = try await translator.translate(
                text: sourceText,
                from: sourceLanguage,
                to: targetLanguage
            )

            // Step 3: Speak translation
            state = .speaking
            await speakText(translatedText, language: targetLanguage)

            state = .idle

        } catch {
            print("Translation error: \(error)")
            state = .idle
        }
    }

    // AVSpeechSynthesizer holds its delegate weakly, so keep a strong reference
    private var speechDelegate: SpeechDelegate?

    private func speakText(_ text: String, language: String) async {
        let utterance = AVSpeechUtterance(string: text)
        utterance.voice = AVSpeechSynthesisVoice(language: language)
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate

        await withCheckedContinuation { continuation in
            let delegate = SpeechDelegate {
                continuation.resume()
            }
            self.speechDelegate = delegate
            synthesizer.delegate = delegate
            synthesizer.speak(utterance)
        }
    }
}

class SpeechDelegate: NSObject, AVSpeechSynthesizerDelegate {
    let completion: () -> Void

    init(completion: @escaping () -> Void) {
        self.completion = completion
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        completion()
    }
}

Handling Edge Cases

Background Audio

iOS requires specific audio session configuration to work in the background:

func configureAudioSession() throws {
    let session = AVAudioSession.sharedInstance()

    try session.setCategory(
        .playAndRecord,
        mode: .default,
        options: [
            .defaultToSpeaker,
            .allowBluetooth,
            .mixWithOthers
        ]
    )

    try session.setActive(true)
}
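
This only keeps recording alive in the background if the app also enables the audio background mode (UIBackgroundModes) in its capabilities.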

Handling Noise

Provide feedback when audio quality is poor:

// Android RecognitionListener callback
override fun onRmsChanged(rmsdB: Float) {
    // rmsdB values vary by device; persistently low readings indicate very
    // quiet input, so tune the threshold against real recordings
    if (rmsdB < QUIET_THRESHOLD_DB) {
        showFeedback("Speak louder or move closer to the microphone")
    }
}

Language Detection

When the source language is unknown:

import { LanguageServiceClient } from '@google-cloud/language';

const languageClient = new LanguageServiceClient();

async function detectLanguage(text: string): Promise<string> {
  const [result] = await languageClient.analyzeEntities({
    document: {
      content: text,
      type: 'PLAIN_TEXT',
    },
  });

  return result.language || 'en';
}
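
Alternatively, the Cloud Translation call shown earlier auto-detects the source language when sourceLanguageCode is omitted and returns it as detectedLanguageCode, which saves a separate detection request.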

Cost Optimisation

Voice and translation APIs charge per usage. Here’s how to manage costs:

Cache Translations

import { createClient } from 'redis';
import { createHash } from 'crypto';

const redis = createClient();
// Connect once at startup (node-redis v4 requires an explicit connect call)
await redis.connect();

// Stable cache key for a piece of source text
function hashText(text: string): string {
  return createHash('sha256').update(text).digest('hex');
}

async function translateWithCache(
  text: string,
  from: string,
  to: string
): Promise<string> {
  const cacheKey = `translation:${from}:${to}:${hashText(text)}`;

  // Check cache first
  const cached = await redis.get(cacheKey);
  if (cached) {
    return cached;
  }

  // Translate and cache
  const { translatedText } = await translateText(text, to, from);
  await redis.setEx(cacheKey, 86400, translatedText); // Cache for 24 hours

  return translatedText;
}

Batch Requests

// Instead of translating one phrase at a time
const phrases = ['Hello', 'Goodbye', 'Thank you'];

// Batch them
const translations = await translateBatch(phrases, 'en', 'ja');
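
translateBatch isn't defined elsewhere in this guide; a minimal sketch using the same Cloud Translation v3 client as in the earlier example (the API accepts multiple strings per request) could look like this:

import { TranslationServiceClient } from '@google-cloud/translate';

const translationClient = new TranslationServiceClient();
const projectId = process.env.GOOGLE_CLOUD_PROJECT;

// One request for many strings; translations come back in input order
async function translateBatch(
  texts: string[],
  sourceLanguage: string,
  targetLanguage: string
): Promise<string[]> {
  const [response] = await translationClient.translateText({
    parent: `projects/${projectId}/locations/global`,
    contents: texts,
    mimeType: 'text/plain',
    sourceLanguageCode: sourceLanguage,
    targetLanguageCode: targetLanguage,
  });

  return (response.translations ?? []).map((t) => t.translatedText ?? '');
}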

Use On-Device When Possible

// onDeviceTranslator / cloudTranslator stand in for your own wrappers (e.g. ML Kit and a cloud endpoint)
func translate(text: String, from: String, to: String) async throws -> String {
    // Try on-device first
    if let onDeviceResult = try? await onDeviceTranslator.translate(text, from: from, to: to) {
        return onDeviceResult
    }

    // Fall back to cloud
    return try await cloudTranslator.translate(text, from: from, to: to)
}

Conclusion

Adding voice recognition and translation to your mobile app is more accessible than ever. The key decisions are:

  1. On-device vs cloud — balance privacy, speed, and accuracy for your use case
  2. Which APIs — Apple/Google native for free basics, Whisper for accuracy, cloud for edge cases
  3. User experience — provide feedback during listening, handle errors gracefully, show translation progress

Start with the simplest implementation that meets your needs. On-device speech recognition with cloud translation covers most use cases at reasonable cost. Add complexity like on-device translation models only when you have specific requirements for offline use or cost reduction.

Voice interfaces are no longer a premium feature—users expect them. The technology is mature, the APIs are straightforward, and the cost is manageable. Time to add voice to your app.