Files
Qortal-Hub/public/worklets/capture-processor.js
2026-05-14 18:19:37 +03:00

254 lines
8.8 KiB
JavaScript

/**
* capture-processor.js — AudioWorkletProcessor for microphone capture.
*
* Runs on the dedicated audio worklet thread (off the main JS thread).
*
* Responsibilities:
* - Accumulate incoming 128-sample Web Audio blocks into a staging buffer.
* - Drain the staging buffer in 20ms chunks at the AudioContext sample rate and
* resample each chunk to 960 samples (one 20ms Opus frame @ 48kHz).
* - Compute adaptive RMS VAD per frame.
* - Post each complete frame + VAD flag to either the main thread or, when
* installed, a dedicated encode worker MessagePort.
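* - Accept { type: 'mute', muted: boolean } messages to stop capturing and posting
* frames (the passthrough to the node's output keeps running while muted).
* - Accept 'set-frame-port' / 'clear-frame-port' messages to install or remove the
* encode worker MessagePort (and optional shared-memory frame slots).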
*/
// Canonical Opus input format emitted by this worklet.
// Sample rate (Hz) every posted frame is resampled to.
const OPUS_OUTPUT_SAMPLE_RATE = 48000;
// Duration of a single frame, in milliseconds.
const OPUS_FRAME_DURATION_MS = 20;
// Samples per frame at the output rate: 48000 * 20 / 1000 = 960.
const OPUS_FRAME_SAMPLES =
  (OPUS_OUTPUT_SAMPLE_RATE * OPUS_FRAME_DURATION_MS) / 1000;
/**
 * Resample a mono block of samples to exactly `outputLength` samples.
 *
 * - Equal lengths: returns a copy of the input.
 * - Downsampling (input longer than output): each output sample is the
 *   area-weighted average of the input span it covers (box filter).
 * - Upsampling (input shorter than output): linear interpolation with the
 *   first and last samples aligned to the first and last output samples.
 * - Empty input yields silence; a single-sample input yields a constant block.
 *
 * The result is always a freshly allocated Float32Array that never aliases
 * `input`, so callers may transfer its buffer without affecting the input.
 *
 * @param {ArrayLike<number>} input - Source samples.
 * @param {number} outputLength - Number of samples to produce.
 * @returns {Float32Array} Resampled block of length `outputLength`.
 */
function resampleLinear(input, outputLength) {
  const inputLength = input.length;
  const out = new Float32Array(outputLength);

  // Nothing to write, or nothing to read: the zero-filled block is the answer.
  if (out.length === 0 || inputLength === 0) {
    return out;
  }

  if (inputLength === outputLength) {
    out.set(input);
    return out;
  }

  // A single sample carries no slope; hold its value across the whole block
  // instead of emitting one sample followed by zeros.
  if (inputLength === 1) {
    out.fill(input[0] || 0);
    return out;
  }

  if (inputLength > outputLength) {
    // Downsample: average the fractional input window behind each output
    // sample. This also covers outputLength === 1 (average of everything).
    const scale = inputLength / outputLength;
    for (let i = 0; i < outputLength; i++) {
      const start = i * scale;
      const end = (i + 1) * scale;
      const first = Math.floor(start);
      const last = Math.min(inputLength - 1, Math.ceil(end) - 1);
      let sum = 0;
      let weight = 0;
      for (let j = first; j <= last; j++) {
        // Overlap of input sample j's unit interval with [start, end).
        const overlap = Math.max(
          0,
          Math.min(end, j + 1) - Math.max(start, j)
        );
        sum += input[j] * overlap;
        weight += overlap;
      }
      out[i] = weight > 0 ? sum / weight : input[first] || 0;
    }
    return out;
  }

  // Upsample: here outputLength > inputLength >= 2, so the divisor is >= 2.
  const scale = (inputLength - 1) / (outputLength - 1);
  for (let i = 0; i < outputLength; i++) {
    const pos = i * scale;
    const left = Math.floor(pos);
    const right = Math.min(inputLength - 1, left + 1);
    const frac = pos - left;
    out[i] = input[left] * (1 - frac) + input[right] * frac;
  }
  return out;
}
// Adaptive RMS VAD constants (noise-tracking worklet only).
// Weight kept on the previous noise-floor estimate when it is blended with a
// new frame's RMS; (1 - VAD_ALPHA) is the weight given to the new RMS.
const VAD_ALPHA = 0.99;
// Factor applied to the noise floor to obtain the speech threshold.
// Slightly sensitive so quiet mics still set vad=true (forwarder gates on vad).
const VAD_MULTIPLIER = 2.3;
// Lower bound for the speech threshold, whatever the noise floor is.
const VAD_MIN_THRESHOLD = 0.01;
// Noise-floor estimate a new processor instance starts from.
const VAD_INITIAL_FLOOR = 0.005;
/**
 * AudioWorkletProcessor that turns the microphone stream into fixed-size
 * frames for the Opus sender.
 *
 * Incoming render blocks are staged until 20 ms of audio at the context sample
 * rate is available. Each such chunk is resampled to OPUS_FRAME_SAMPLES, tagged
 * with an adaptive RMS VAD flag and the audio-clock timestamp, and posted either
 * to the installed encode-worker port or, when none is installed, to the node's
 * own port.
 *
 * Messages accepted on `this.port`:
 * - { type: 'mute', muted }    stop / resume frame capture.
 * - { type: 'set-frame-port', generation, sharedSamples?, sharedState?,
 *     sharedSlotCount?, sharedFrameSamples? } with the MessagePort in ports[0].
 * - { type: 'clear-frame-port' }    revert to posting on `this.port`.
 */
class CaptureProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    // Samples in one 20 ms frame at the AudioContext's own sample rate.
    this._inputFrameSamples = Math.max(
      1,
      Math.round((sampleRate * OPUS_FRAME_DURATION_MS) / 1000)
    );
    // Staging buffer: grows on demand, pre-sized for two input-rate frames.
    this._buf = new Float32Array(this._inputFrameSamples * 2);
    this._offset = 0;
    // Adaptive VAD state
    this._noiseFloor = VAD_INITIAL_FLOOR;
    // Mute gate
    this._muted = false;
    // Encode-worker delivery state (see _installFramePort).
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
    this._sharedFrameSamples = OPUS_FRAME_SAMPLES;
    this.port.onmessage = (e) => this._handleMessage(e);
  }

  /** Dispatch a control message received on the node's port. */
  _handleMessage(e) {
    const data = e.data;
    if (data?.type === 'mute') {
      this._muted = !!data.muted;
      // Drop the partial frame staged before the mute so it is not spliced
      // onto unrelated audio when capture resumes.
      if (this._muted) this._offset = 0;
    } else if (data?.type === 'set-frame-port') {
      this._installFramePort(data, e.ports?.[0] || null);
    } else if (data?.type === 'clear-frame-port') {
      this._clearFramePort();
    }
  }

  /**
   * Install the encode-worker port and the optional shared-memory frame slots.
   * All values are computed before any field is assigned, so a malformed
   * message that throws here leaves the previous configuration intact.
   */
  _installFramePort(data, port) {
    // The SharedArrayBuffer constructor is not exposed in every context;
    // referencing it unguarded would throw a ReferenceError there.
    const sabAvailable = typeof SharedArrayBuffer !== 'undefined';
    const samples =
      sabAvailable && data.sharedSamples instanceof SharedArrayBuffer
        ? new Float32Array(data.sharedSamples)
        : null;
    const state =
      sabAvailable && data.sharedState instanceof SharedArrayBuffer
        ? new Int32Array(data.sharedState)
        : null;
    // A missing or zero size falls back to the canonical Opus frame size.
    const frameSamples = (data.sharedFrameSamples >>> 0) || OPUS_FRAME_SAMPLES;
    // Never advertise more slots than both buffers can actually hold, so the
    // Atomics / set() calls in _postFrameToWorker cannot go out of range.
    const slotCount =
      samples && state
        ? Math.min(
            data.sharedSlotCount >>> 0,
            state.length,
            Math.floor(samples.length / frameSamples)
          )
        : 0;

    this._framePort = port;
    this._framePortGeneration = data.generation >>> 0;
    this._sharedSamples = samples;
    this._sharedState = state;
    this._sharedSlotCount = slotCount;
    this._sharedFrameSamples = frameSamples;
  }

  /** Forget the encode-worker port and any shared-memory slots. */
  _clearFramePort() {
    this._framePort = null;
    this._framePortGeneration = 0;
    this._sharedSamples = null;
    this._sharedState = null;
    this._sharedSlotCount = 0;
    this._sharedFrameSamples = OPUS_FRAME_SAMPLES;
  }

  /**
   * Deliver one frame to the encode worker.
   *
   * Prefers a free shared-memory slot (slot state 0 -> 1 while writing -> 2
   * when ready) and posts only the slot index. When shared memory is not
   * configured, the frame size does not match the slot size, or every slot is
   * busy, the frame's buffer is transferred in the message instead.
   *
   * @param {Float32Array} frame - Resampled frame; its buffer may be transferred.
   * @param {boolean} vad - Voice-activity flag for this frame.
   * @param {number} workletPostAudioClockMs - Audio render clock at post time, ms.
   */
  _postFrameToWorker(frame, vad, workletPostAudioClockMs) {
    const canUseSharedSlots =
      this._sharedSamples &&
      this._sharedState &&
      this._sharedSlotCount > 0 &&
      this._sharedFrameSamples === frame.length;
    if (canUseSharedSlots) {
      for (let slot = 0; slot < this._sharedSlotCount; slot++) {
        // Claim the slot only if it is currently free (0).
        if (Atomics.compareExchange(this._sharedState, slot, 0, 1) !== 0) {
          continue;
        }
        this._sharedSamples.set(frame, slot * this._sharedFrameSamples);
        // Publish the slot as filled (2) before announcing it.
        Atomics.store(this._sharedState, slot, 2);
        this._framePort.postMessage({
          type: 'encodeFrame',
          generation: this._framePortGeneration,
          sharedSlot: slot,
          vad,
          workletPostAudioClockMs,
          inputSampleRate: sampleRate,
          outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
          inputFrameSamples: this._inputFrameSamples,
        });
        return;
      }
    }
    this._framePort.postMessage(
      {
        type: 'encodeFrame',
        generation: this._framePortGeneration,
        frame: frame.buffer,
        byteOffset: frame.byteOffset,
        byteLength: frame.byteLength,
        vad,
        workletPostAudioClockMs,
        inputSampleRate: sampleRate,
        outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
        inputFrameSamples: this._inputFrameSamples,
      },
      [frame.buffer]
    );
  }

  /** Append a render block to the staging buffer, growing it when needed. */
  _appendInput(input) {
    const needed = this._offset + input.length;
    if (needed > this._buf.length) {
      const bigger = new Float32Array(needed * 2);
      bigger.set(this._buf.subarray(0, this._offset));
      this._buf = bigger;
    }
    this._buf.set(input, this._offset);
    this._offset = needed;
  }

  /**
   * Adaptive RMS voice-activity decision for one frame.
   *
   * The threshold comes from the *previous* noise estimate only. The frame's
   * RMS must not be blended into the noise floor on every frame: sustained
   * vowels have stable RMS, and moving the floor toward speech makes the
   * threshold drift up until vad flips false (chopped "aaaaaa…"). Ambient
   * noise is therefore tracked only on sub-threshold frames.
   *
   * @param {Float32Array} frame
   * @returns {boolean} true when the frame's RMS is above the threshold.
   */
  _detectVoice(frame) {
    let energy = 0;
    for (let i = 0; i < frame.length; i++) {
      energy += frame[i] * frame[i];
    }
    const rms = Math.sqrt(energy / frame.length);
    const threshold = Math.max(
      VAD_MIN_THRESHOLD,
      this._noiseFloor * VAD_MULTIPLIER
    );
    if (rms < threshold) {
      this._noiseFloor = VAD_ALPHA * this._noiseFloor + (1 - VAD_ALPHA) * rms;
    }
    return rms > threshold;
  }

  /**
   * Render callback: pass audio through, stage it, and emit complete frames.
   *
   * @returns {boolean} Always true, to keep the processor alive.
   */
  process(inputs, outputs) {
    const input = inputs[0]?.[0]; // mono channel
    const output = outputs[0]?.[0];
    // Always pass audio through to the output channel.
    // This keeps the node "active" in Chrome/Electron's Web Audio rendering graph:
    // Chrome only delivers real microphone samples to inputs[0][0] when it can
    // observe non-zero audio flowing through the node's output. Without this
    // passthrough, Chrome's silence optimiser detects that captureNode always
    // outputs zeros and stops populating inputs[0][0] — the same bug that caused
    // the ScriptProcessorNode to work (it passes through) while AudioWorkletNode
    // produced silent frames.
    // Per the original note, a downstream keepAliveGain(0.0001) attenuates the
    // passthrough so no echo is audible (a linear gain of 0.0001 is -80 dB).
    if (input && output) output.set(input);
    if (!input || input.length === 0) return true;
    // When muted, still pass audio through above (keeps the graph active),
    // but do not accumulate or post frames.
    if (this._muted) return true;

    this._appendInput(input);

    // Drain complete 20 ms frames at the actual AudioContext sample rate, then
    // resample to the canonical 48 kHz/960-sample Opus input expected by sender.
    while (this._offset >= this._inputFrameSamples) {
      // slice() copies, so the frame handed on never aliases the staging buffer
      // (its buffer may be transferred away below).
      const inputFrame = this._buf.slice(0, this._inputFrameSamples);
      const frame = resampleLinear(inputFrame, OPUS_FRAME_SAMPLES);
      // Slide remaining samples to front
      this._buf.copyWithin(0, this._inputFrameSamples, this._offset);
      this._offset -= this._inputFrameSamples;

      const vad = this._detectVoice(frame);
      // Use the audio render clock here; comparing this against the window's
      // `performance.now()` produced bogus multi-minute deltas in exports.
      const workletPostAudioClockMs = currentTime * 1000;
      if (this._framePort) {
        this._postFrameToWorker(frame, vad, workletPostAudioClockMs);
      } else {
        // Transfer the frame buffer to avoid copying
        this.port.postMessage(
          {
            frame,
            vad,
            workletPostAudioClockMs,
            inputSampleRate: sampleRate,
            outputSampleRate: OPUS_OUTPUT_SAMPLE_RATE,
            inputFrameSamples: this._inputFrameSamples,
          },
          [frame.buffer]
        );
      }
    }
    return true; // keep processor alive
  }
}
// Register the class under the name 'capture-processor'; an AudioWorkletNode
// must be created with this exact name to use this processor.
registerProcessor('capture-processor', CaptureProcessor);