/**
 * capture-processor.js — AudioWorkletProcessor for microphone capture.
 *
 * Runs on the dedicated audio worklet thread (off the main JS thread).
 *
 * Responsibilities:
 *  - Accumulate incoming 128-sample Web Audio blocks into a staging buffer.
 *  - Drain the staging buffer in 20 ms chunks at the AudioContext sample rate
 *    and resample each chunk to 960 samples (20 ms Opus frames @ 48 kHz).
 *  - Compute adaptive RMS VAD per frame.
 *  - Post each complete frame + VAD flag to either the main thread or, when
 *    installed, a dedicated encode worker MessagePort.
 *  - Accept { type: 'mute', muted: boolean } messages to stop posting frames
 *    (the audio passthrough to the node's output keeps running while muted).
 *  - Accept { type: 'set-frame-port' } / { type: 'clear-frame-port' } messages
 *    to install or remove the encode worker MessagePort.
 */

// Canonical Opus input format produced by this worklet.
const OPUS_OUTPUT_SAMPLE_RATE = 48000; // Hz
const OPUS_FRAME_DURATION_MS = 20;
// Derived rather than hard-coded so the three values cannot drift apart:
// 48000 Hz * 20 ms / 1000 = 960 samples per frame.
const OPUS_FRAME_SAMPLES =
  (OPUS_OUTPUT_SAMPLE_RATE * OPUS_FRAME_DURATION_MS) / 1000;

/**
 * Resample a mono block of samples to exactly `outputLength` samples.
 *
 * Strategy depends on the direction of the conversion:
 *  - equal lengths: plain copy;
 *  - downsampling: each output sample is the overlap-weighted average of the
 *    input samples it spans (box filter);
 *  - upsampling: linear interpolation with first and last samples aligned.
 *
 * The input is never modified and the result is always a newly allocated
 * array, so callers may transfer or mutate it freely.
 *
 * @param {ArrayLike<number>} input - Source samples.
 * @param {number} outputLength - Number of samples to produce.
 * @returns {Float32Array} Resampled copy of `input` with `outputLength` samples.
 */
function resampleLinear(input, outputLength) {
  const out = new Float32Array(outputLength);
  const inputLength = input.length;

  // Nothing to write, or nothing to read: the zero-filled buffer is the answer.
  if (outputLength === 0 || inputLength === 0) {
    return out;
  }

  if (inputLength === outputLength) {
    out.set(input);
    return out;
  }

  // A single input sample describes a constant signal; replicate it instead of
  // leaving every sample after the first at zero.
  if (inputLength === 1) {
    out.fill(input[0] || 0);
    return out;
  }

  if (inputLength > outputLength) {
    // Downsample. This branch also covers outputLength === 1, which then
    // yields the mean of the whole input.
    const scale = inputLength / outputLength;
    for (let i = 0; i < outputLength; i++) {
      const start = i * scale;
      const end = (i + 1) * scale;
      const first = Math.floor(start);
      const last = Math.min(inputLength - 1, Math.ceil(end) - 1);
      let sum = 0;
      let weight = 0;
      for (let j = first; j <= last; j++) {
        // Length of the overlap between input sample j's unit interval
        // [j, j + 1) and this output sample's window [start, end).
        const segmentStart = Math.max(start, j);
        const segmentEnd = Math.min(end, j + 1);
        const w = Math.max(0, segmentEnd - segmentStart);
        sum += input[j] * w;
        weight += w;
      }
      out[i] = weight > 0 ? sum / weight : input[first] || 0;
    }
    return out;
  }

  // Upsample. Here 2 <= inputLength < outputLength, so the divisor is >= 2.
  const scale = (inputLength - 1) / (outputLength - 1);
  for (let i = 0; i < outputLength; i++) {
    const pos = i * scale;
    const left = Math.floor(pos);
    const right = Math.min(inputLength - 1, left + 1);
    const frac = pos - left;
    out[i] = input[left] * (1 - frac) + input[right] * frac;
  }
  return out;
}

// Adaptive RMS VAD constants (noise-tracking worklet only).

// Smoothing factor of the noise-floor moving average:
// floor = VAD_ALPHA * floor + (1 - VAD_ALPHA) * rms.
const VAD_ALPHA = 0.99;
// Speech threshold is the noise floor times this multiplier.
// Slightly sensitive so quiet mics still set vad=true (forwarder gates on vad).
const VAD_MULTIPLIER = 2.3;
// Lower bound for the speech threshold, whatever the noise floor is.
const VAD_MIN_THRESHOLD = 0.01;
// Noise-floor estimate a freshly constructed processor starts from.
const VAD_INITIAL_FLOOR = 0.005;

/**
 * Microphone capture processor.
 *
 * Buffers the render-quantum blocks handed to `process()`, cuts them into
 * 20 ms frames at the AudioContext rate, resamples each frame to
 * OPUS_FRAME_SAMPLES samples, tags it with a voice-activity flag and posts it
 * either to the node's own port (main thread) or to an installed encode-worker
 * MessagePort.
 *
 * Control messages accepted on `this.port`:
 *  - { type: 'mute', muted }      stop / resume posting frames.
 *  - { type: 'set-frame-port', generation, sharedSamples?, sharedState?,
 *      sharedSlotCount?, sharedFrameSamples? } with the MessagePort in
 *      `ports[0]`; the SharedArrayBuffer fields enable the shared-slot path.
 *  - { type: 'clear-frame-port' } go back to posting on `this.port`.
 */
class CaptureProcessor extends AudioWorkletProcessor {
  constructor() {
    super();

    // Samples in one 20 ms frame at the AudioContext's actual sample rate.
    this._inputFrameSamples = Math.max(
      1,
      Math.round((sampleRate * OPUS_FRAME_DURATION_MS) / 1000)
    );

    // Staging buffer: grows on demand, pre-sized for two input-rate frames.
    this._buf = new Float32Array(this._inputFrameSamples * 2);
    this._offset = 0;

    // Adaptive VAD state
    this._noiseFloor = VAD_INITIAL_FLOOR;

    // Mute gate
    this._muted = false;

    // Optional encode-worker transport (see 'set-frame-port').
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
    this._sharedFrameSamples = OPUS_FRAME_SAMPLES;

    this.port.onmessage = (e) => this._handleMessage(e);
  }

  /** Dispatch one control message received on the node's port. */
  _handleMessage(e) {
    switch (e.data?.type) {
      case 'mute':
        this._muted = !!e.data.muted;
        break;
      case 'set-frame-port':
        this._installFramePort(e);
        break;
      case 'clear-frame-port':
        this._clearFramePort();
        break;
      default:
        break;
    }
  }

  /** Install the encode-worker port and, when provided, the shared slot ring. */
  _installFramePort(e) {
    const data = e.data;

    // SharedArrayBuffer is not exposed as a global in every context, so test
    // for it with `typeof` before `instanceof` to avoid a ReferenceError.
    const viewIfShared = (buffer, View) =>
      typeof SharedArrayBuffer !== 'undefined' &&
      buffer instanceof SharedArrayBuffer
        ? new View(buffer)
        : null;

    const samples = viewIfShared(data.sharedSamples, Float32Array);
    const state = viewIfShared(data.sharedState, Int32Array);

    // Fall back to the canonical frame size when the field is absent or 0.
    // (`Math.max(1, x) || fallback` can never reach the fallback.)
    const frameSamples = (data.sharedFrameSamples >>> 0) || OPUS_FRAME_SAMPLES;

    // Never address more slots than both buffers can actually hold; an
    // out-of-range slot would throw inside process().
    const capacity =
      samples && state
        ? Math.min(state.length, Math.floor(samples.length / frameSamples))
        : 0;

    this._framePort = e.ports?.[0] || null;
    this._framePortGeneration = data.generation >>> 0;
    this._sharedSamples = samples;
    this._sharedState = state;
    this._sharedSlotCount = Math.min(data.sharedSlotCount >>> 0, capacity);
    this._sharedFrameSamples = frameSamples;
  }

  /** Forget the encode-worker port; frames go to `this.port` again. */
  _clearFramePort() {
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
  }

  /**
   * Post one frame to the installed encode-worker port.
   *
   * Prefers the shared slot ring: a slot whose state is 0 is claimed by
   * swapping it to 1, filled with the samples, then marked 2 and announced by
   * slot index. (Presumably the worker resets the slot to 0 once consumed —
   * confirm against the worker code.) If the ring is not configured, the frame
   * size does not match, or no slot is free, the frame's buffer is transferred
   * in the message instead.
   *
   * @param {Float32Array} frame - Resampled frame; its buffer may be transferred.
   * @param {boolean} vad - Voice-activity flag for this frame.
   * @param {number} workletPostAudioClockMs - Audio render clock, in ms.
   */
  _postFrameToWorker(frame, vad, workletPostAudioClockMs) {
    const meta = {
      vad,
      workletPostAudioClockMs,
      inputSampleRate: sampleRate,
      outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
      inputFrameSamples: this._inputFrameSamples,
    };

    const ringUsable =
      this._sharedSamples &&
      this._sharedState &&
      this._sharedSlotCount > 0 &&
      this._sharedFrameSamples === frame.length;

    if (ringUsable) {
      for (let slot = 0; slot < this._sharedSlotCount; slot++) {
        if (Atomics.compareExchange(this._sharedState, slot, 0, 1) !== 0) {
          continue; // slot is not free
        }
        this._sharedSamples.set(frame, slot * this._sharedFrameSamples);
        Atomics.store(this._sharedState, slot, 2);
        this._framePort.postMessage({
          type: 'encodeFrame',
          generation: this._framePortGeneration,
          sharedSlot: slot,
          ...meta,
        });
        return;
      }
    }

    this._framePort.postMessage(
      {
        type: 'encodeFrame',
        generation: this._framePortGeneration,
        frame: frame.buffer,
        byteOffset: frame.byteOffset,
        byteLength: frame.byteLength,
        ...meta,
      },
      [frame.buffer]
    );
  }

  /** Append one input block to the staging buffer, growing it if needed. */
  _stage(input) {
    const needed = this._offset + input.length;
    if (needed > this._buf.length) {
      const bigger = new Float32Array(needed * 2);
      bigger.set(this._buf.subarray(0, this._offset));
      this._buf = bigger;
    }
    this._buf.set(input, this._offset);
    this._offset = needed;
  }

  /**
   * Adaptive RMS voice-activity decision for one frame.
   *
   * @param {Float32Array} frame - Frame of OPUS_FRAME_SAMPLES samples.
   * @returns {boolean} True when the frame's RMS exceeds the threshold.
   */
  _detectVoice(frame) {
    let sum = 0;
    for (let i = 0; i < OPUS_FRAME_SAMPLES; i++) {
      sum += frame[i] * frame[i];
    }
    const rms = Math.sqrt(sum / OPUS_FRAME_SAMPLES);

    // Threshold from the *previous* noise estimate only. We must not blend every
    // frame's RMS into the noise floor: sustained vowels have stable RMS, and
    // updating the floor toward speech makes threshold drift up until vad flips
    // false (chopped "aaaaaa…"). Track ambient noise only on sub-threshold frames.
    const threshold = Math.max(
      VAD_MIN_THRESHOLD,
      this._noiseFloor * VAD_MULTIPLIER
    );
    if (rms < threshold) {
      this._noiseFloor = VAD_ALPHA * this._noiseFloor + (1 - VAD_ALPHA) * rms;
    }
    return rms > threshold;
  }

  /**
   * Audio render callback.
   *
   * @param {Float32Array[][]} inputs - Only channel 0 of input 0 is read.
   * @param {Float32Array[][]} outputs - Channel 0 of output 0 gets a copy of the input.
   * @returns {boolean} Always true, to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0]?.[0]; // mono channel
    const output = outputs[0]?.[0];

    // Always pass audio through to the output channel.
    // This keeps the node "active" in Chrome/Electron's Web Audio rendering graph:
    // Chrome only delivers real microphone samples to inputs[0][0] when it can
    // observe non-zero audio flowing through the node's output. Without this
    // passthrough, Chrome's silence optimiser detects that captureNode always
    // outputs zeros and stops populating inputs[0][0] — the same bug that caused
    // the ScriptProcessorNode to work (it passes through) while AudioWorkletNode
    // produced silent frames.
    // The downstream keepAliveGain(0.0001) attenuates the passthrough to -100 dB
    // at the destination so no echo is audible.
    if (input && output) output.set(input);

    if (!input || input.length === 0) return true;

    // When muted, still pass audio through above (keeps the graph active),
    // but do not accumulate or post frames. Any partial frame staged before
    // the mute is dropped so it is not spliced onto post-unmute audio.
    if (this._muted) {
      this._offset = 0;
      return true;
    }

    this._stage(input);

    // Drain complete 20 ms frames at the actual AudioContext sample rate, then
    // resample to the canonical 48 kHz/960-sample Opus input expected by sender.
    const frameLen = this._inputFrameSamples;
    while (this._offset >= frameLen) {
      // resampleLinear always returns a freshly allocated array, so a view of
      // the staging buffer is enough here; no intermediate copy is needed.
      const frame = resampleLinear(
        this._buf.subarray(0, frameLen),
        OPUS_FRAME_SAMPLES
      );

      // Slide remaining samples to front
      this._buf.copyWithin(0, frameLen, this._offset);
      this._offset -= frameLen;

      const vad = this._detectVoice(frame);

      // Use the audio render clock here; comparing this against the window's
      // `performance.now()` produced bogus multi-minute deltas in exports.
      const workletPostAudioClockMs = currentTime * 1000;

      if (this._framePort) {
        this._postFrameToWorker(frame, vad, workletPostAudioClockMs);
      } else {
        // Transfer the frame buffer to avoid copying
        this.port.postMessage(
          {
            frame,
            vad,
            workletPostAudioClockMs,
            inputSampleRate: sampleRate,
            outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
            inputFrameSamples: this._inputFrameSamples,
          },
          [frame.buffer]
        );
      }
    }

    return true; // keep processor alive
  }
}

// Register under the name that AudioWorkletNode constructors refer to.
registerProcessor('capture-processor', CaptureProcessor);