/**
 * capture-processor.js — AudioWorkletProcessor for microphone capture.
 *
 * Runs on the dedicated audio worklet thread (off the main JS thread).
 *
 * Responsibilities:
 *   - Accumulate incoming 128-sample Web Audio blocks into a staging buffer.
 *   - Drain the staging buffer in 20 ms chunks at the AudioContext sample rate
 *     and resample each chunk to a 960-sample Opus frame (20 ms @ 48 kHz).
 *   - Compute adaptive RMS VAD per frame.
 *   - Post each complete frame + VAD flag to either the main thread or, when
 *     installed, a dedicated encode worker MessagePort.
 *   - Accept { type: 'mute', muted: boolean } messages to silence output.
 */

const OPUS_OUTPUT_SAMPLE_RATE = 48000;
const OPUS_FRAME_DURATION_MS = 20;
// 960 samples = 20 ms @ 48 kHz; derived so the three constants cannot drift apart.
const OPUS_FRAME_SAMPLES =
  (OPUS_OUTPUT_SAMPLE_RATE * OPUS_FRAME_DURATION_MS) / 1000;

/**
 * Resample a mono block to exactly `outputLength` samples.
 *
 * - Equal lengths: plain copy.
 * - Downsampling: each output sample is the overlap-weighted average of the
 *   input samples covering its span (box filter).
 * - Upsampling: linear interpolation with both endpoints pinned to the first
 *   and last input samples.
 *
 * The result is always a freshly allocated array; `input` is never modified
 * or retained, so callers may pass a view into a buffer they reuse.
 *
 * @param {Float32Array|ArrayLike<number>} input - Source samples.
 * @param {number} outputLength - Number of samples to produce.
 * @returns {Float32Array} New array of `outputLength` samples.
 */
function resampleLinear(input, outputLength) {
  const inputLength = input.length;
  const out = new Float32Array(outputLength);

  if (inputLength === outputLength) {
    out.set(input);
    return out;
  }

  // Zero or one input sample: nothing to interpolate between, so hold the
  // value (0 for empty input) across the whole output instead of leaving every
  // sample after the first at zero.
  if (inputLength <= 1) {
    out.fill(input[0] || 0);
    return out;
  }

  // Zero or one output sample: take the first input sample. This also keeps
  // the interpolation step below from dividing by (outputLength - 1) === 0.
  if (outputLength <= 1) {
    out[0] = input[0] || 0;
    return out;
  }

  if (inputLength > outputLength) {
    // Downsample: output sample i covers input positions [i*step, (i+1)*step).
    const step = inputLength / outputLength;
    for (let i = 0; i < outputLength; i++) {
      const spanStart = i * step;
      const spanEnd = (i + 1) * step;
      const first = Math.floor(spanStart);
      const last = Math.min(inputLength - 1, Math.ceil(spanEnd) - 1);
      let weightedSum = 0;
      let totalWeight = 0;
      for (let j = first; j <= last; j++) {
        // Weight = how much of input sample j's unit interval lies in the span.
        const overlap = Math.max(
          0,
          Math.min(spanEnd, j + 1) - Math.max(spanStart, j)
        );
        weightedSum += input[j] * overlap;
        totalWeight += overlap;
      }
      out[i] = totalWeight > 0 ? weightedSum / totalWeight : input[first] || 0;
    }
    return out;
  }

  // Upsample: map output index i onto a fractional input position and blend
  // the two neighbouring samples.
  const step = (inputLength - 1) / (outputLength - 1);
  for (let i = 0; i < outputLength; i++) {
    const pos = i * step;
    const left = Math.floor(pos);
    const right = Math.min(inputLength - 1, left + 1);
    const frac = pos - left;
    out[i] = input[left] * (1 - frac) + input[right] * frac;
  }
  return out;
}

// Adaptive RMS VAD constants (noise-tracking worklet only).
// Weight kept from the previous noise-floor estimate on each update
// (exponential moving average; the new RMS contributes 1 - VAD_ALPHA).
const VAD_ALPHA = 0.99;
// Slightly sensitive so quiet mics still set vad=true (forwarder gates on vad).
const VAD_MULTIPLIER = 2.3;
// Lower bound for the speech threshold regardless of the noise floor.
const VAD_MIN_THRESHOLD = 0.01;
// Noise-floor estimate used before any frame has been observed.
const VAD_INITIAL_FLOOR = 0.005;

/**
 * Microphone capture processor.
 *
 * Stages incoming render blocks, cuts them into 20 ms frames at the context
 * sample rate, resamples each frame to OPUS_FRAME_SAMPLES, tags it with a VAD
 * flag and posts it either to the node's own port (main thread) or to an
 * installed frame port.
 *
 * Messages accepted on `this.port`:
 *   - { type: 'mute', muted }
 *   - { type: 'set-frame-port', generation, sharedSamples?, sharedState?,
 *       sharedSlotCount?, sharedFrameSamples? } with the MessagePort in ports[0]
 *   - { type: 'clear-frame-port' }
 */
class CaptureProcessor extends AudioWorkletProcessor {
  constructor() {
    super();

    // Samples per 20 ms frame at the AudioContext's own rate (>= 1).
    this._inputFrameSamples = Math.max(
      1,
      Math.round((sampleRate * OPUS_FRAME_DURATION_MS) / 1000)
    );

    // Staging buffer: grows on demand, pre-sized for two input-rate frames.
    this._buf = new Float32Array(this._inputFrameSamples * 2);
    this._offset = 0;

    // Adaptive VAD state
    this._noiseFloor = VAD_INITIAL_FLOOR;

    // Mute gate
    this._muted = false;

    // Optional direct route to an encode worker, plus optional shared ring.
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
    this._sharedFrameSamples = OPUS_FRAME_SAMPLES;

    this.port.onmessage = (e) => this._handleMessage(e);
  }

  /** Dispatch a control message received on the node port. */
  _handleMessage(e) {
    const data = e.data;
    switch (data?.type) {
      case 'mute':
        this._muted = !!data.muted;
        if (this._muted) {
          // Drop the partially staged frame so pre-mute samples are not
          // spliced onto the first frame produced after unmuting.
          this._offset = 0;
        }
        break;
      case 'set-frame-port':
        this._installFramePort(data, e.ports?.[0] || null);
        break;
      case 'clear-frame-port':
        this._clearFramePort();
        break;
      default:
        break;
    }
  }

  /**
   * Install the worker port and, when provided, the shared-memory slot ring.
   *
   * @param {object} data - The 'set-frame-port' message payload.
   * @param {MessagePort|null} port - Port transferred with the message.
   */
  _installFramePort(data, port) {
    // The SharedArrayBuffer global is not exposed in every context; guard the
    // instanceof checks so a missing global cannot throw here.
    const sharedMemoryAvailable = typeof SharedArrayBuffer !== 'undefined';
    const samples =
      sharedMemoryAvailable && data.sharedSamples instanceof SharedArrayBuffer
        ? new Float32Array(data.sharedSamples)
        : null;
    const state =
      sharedMemoryAvailable && data.sharedState instanceof SharedArrayBuffer
        ? new Int32Array(data.sharedState)
        : null;

    // Missing or zero falls back to the Opus frame size. (The previous
    // Math.max(1, …) || fallback form could never reach its fallback.)
    const frameSamples = (data.sharedFrameSamples >>> 0) || OPUS_FRAME_SAMPLES;

    // Never advertise more slots than both shared buffers can actually hold;
    // an out-of-range slot would throw from inside process().
    const usableSlots =
      samples && state
        ? Math.min(
            data.sharedSlotCount >>> 0,
            state.length,
            Math.floor(samples.length / frameSamples)
          )
        : 0;

    this._framePort = port;
    this._framePortGeneration = (data.generation >>> 0) || 0;
    this._sharedSamples = samples;
    this._sharedState = state;
    this._sharedSlotCount = usableSlots;
    this._sharedFrameSamples = frameSamples;
  }

  /** Remove the worker route; frames go back to the node's own port. */
  _clearFramePort() {
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
  }

  /** Fields attached to every posted frame message, in a fixed order. */
  _frameInfo(vad, workletPostAudioClockMs) {
    return {
      vad,
      workletPostAudioClockMs,
      inputSampleRate: sampleRate,
      outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
      inputFrameSamples: this._inputFrameSamples,
    };
  }

  /**
   * Send one frame to the installed frame port. Callers must ensure
   * `this._framePort` is set.
   *
   * Prefers a free shared-memory slot. Slot states written here: a slot is
   * claimed by swapping 0 -> 1, then marked 2 once the samples are in place.
   * (Presumably the consumer resets the slot to 0 after reading — that side is
   * not visible in this file.) If no slot is free, or no ring is installed,
   * the frame's buffer is transferred in the message instead.
   *
   * @param {Float32Array} frame - Resampled frame.
   * @param {boolean} vad - Voice-activity flag for the frame.
   * @param {number} workletPostAudioClockMs - Audio render clock in ms.
   */
  _postFrameToWorker(frame, vad, workletPostAudioClockMs) {
    const info = this._frameInfo(vad, workletPostAudioClockMs);
    const ringUsable =
      this._sharedSamples &&
      this._sharedState &&
      this._sharedSlotCount > 0 &&
      this._sharedFrameSamples === frame.length;

    if (ringUsable) {
      for (let slot = 0; slot < this._sharedSlotCount; slot++) {
        if (Atomics.compareExchange(this._sharedState, slot, 0, 1) !== 0) {
          continue; // slot is not free
        }
        this._sharedSamples.set(frame, slot * this._sharedFrameSamples);
        Atomics.store(this._sharedState, slot, 2);
        this._framePort.postMessage({
          type: 'encodeFrame',
          generation: this._framePortGeneration,
          sharedSlot: slot,
          ...info,
        });
        return;
      }
    }

    this._framePort.postMessage(
      {
        type: 'encodeFrame',
        generation: this._framePortGeneration,
        frame: frame.buffer,
        byteOffset: frame.byteOffset,
        byteLength: frame.byteLength,
        ...info,
      },
      [frame.buffer]
    );
  }

  /** Append a render block to the staging buffer, growing it if required. */
  _stage(input) {
    const needed = this._offset + input.length;
    if (needed > this._buf.length) {
      const bigger = new Float32Array(needed * 2);
      bigger.set(this._buf.subarray(0, this._offset));
      this._buf = bigger;
    }
    this._buf.set(input, this._offset);
    this._offset = needed;
  }

  /**
   * Adaptive RMS voice-activity decision for one resampled frame.
   *
   * Threshold comes from the *previous* noise estimate only. We must not blend
   * every frame's RMS into the noise floor: sustained vowels have stable RMS,
   * and updating the floor toward speech makes the threshold drift up until
   * vad flips false (chopped "aaaaaa…"). Ambient noise is tracked only on
   * sub-threshold frames.
   *
   * @param {Float32Array} frame - Frame of OPUS_FRAME_SAMPLES samples.
   * @returns {boolean} True when the frame's RMS exceeds the threshold.
   */
  _detectVoice(frame) {
    let energy = 0;
    for (let i = 0; i < OPUS_FRAME_SAMPLES; i++) {
      energy += frame[i] * frame[i];
    }
    const rms = Math.sqrt(energy / OPUS_FRAME_SAMPLES);

    const threshold = Math.max(
      VAD_MIN_THRESHOLD,
      this._noiseFloor * VAD_MULTIPLIER
    );
    if (rms < threshold) {
      this._noiseFloor = VAD_ALPHA * this._noiseFloor + (1 - VAD_ALPHA) * rms;
    }
    return rms > threshold;
  }

  /**
   * Render callback: pass audio through, stage it, and emit complete frames.
   *
   * @returns {boolean} Always true, to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0]?.[0]; // mono channel
    const output = outputs[0]?.[0];

    // Always pass audio through to the output channel.
    // This keeps the node "active" in Chrome/Electron's Web Audio rendering
    // graph: Chrome only delivers real microphone samples to inputs[0][0] when
    // it can observe non-zero audio flowing through the node's output. Without
    // this passthrough, Chrome's silence optimiser detects that captureNode
    // always outputs zeros and stops populating inputs[0][0] — the same bug
    // that caused the ScriptProcessorNode to work (it passes through) while
    // AudioWorkletNode produced silent frames.
    // The downstream keepAliveGain(0.0001) attenuates the passthrough to
    // -100 dB at the destination so no echo is audible.
    if (input && output) output.set(input);

    if (!input || input.length === 0) return true;

    // When muted, still pass audio through above (keeps the graph active),
    // but do not accumulate or post frames.
    if (this._muted) return true;

    this._stage(input);

    // Drain complete 20 ms frames at the actual AudioContext sample rate, then
    // resample to the canonical 48 kHz/960-sample Opus input expected by sender.
    const frameLength = this._inputFrameSamples;
    while (this._offset >= frameLength) {
      // resampleLinear always returns a freshly allocated array, so a view is
      // enough here; it must run before the staging buffer is shifted below.
      const frame = resampleLinear(
        this._buf.subarray(0, frameLength),
        OPUS_FRAME_SAMPLES
      );

      // Slide remaining samples to front
      this._buf.copyWithin(0, frameLength, this._offset);
      this._offset -= frameLength;

      const vad = this._detectVoice(frame);

      // Use the audio render clock here; comparing this against the window's
      // `performance.now()` produced bogus multi-minute deltas in exports.
      const workletPostAudioClockMs = currentTime * 1000;

      if (this._framePort) {
        this._postFrameToWorker(frame, vad, workletPostAudioClockMs);
      } else {
        // Transfer the frame buffer to avoid copying
        this.port.postMessage(
          { frame, ...this._frameInfo(vad, workletPostAudioClockMs) },
          [frame.buffer]
        );
      }
    }

    return true; // keep processor alive
  }
}

registerProcessor('capture-processor', CaptureProcessor);