// hearbit-ai-app/src-tauri/src/audio_processor.rs
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Emitter};
use crate::emit_log;
use cpal::Sample;
use hound::WavWriter;
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
use voice_activity_detector::VoiceActivityDetector;
/// VAD-gated audio recorder: mixes microphone input with queued system
/// audio, runs voice-activity detection on a resampled mono mixdown, and
/// writes 16-bit PCM to a shared WAV writer, with a pre-roll ring buffer
/// flushed when recording auto-starts.
pub struct AudioProcessor {
// VAD (runs on 16 kHz mono audio in fixed-size chunks)
vad: VoiceActivityDetector,
vad_chunk_size: usize,
vad_buffer: Vec<f32>,
// Audio Config: interleaved channel count of the microphone stream
channel_count: u16,
// Resampler: input rate -> 16 kHz mono, feeding the VAD
resampler: FastFixedIn<f32>,
resample_input_buffer: Vec<f32>,
// NOTE(review): never read or written in this file — candidate for removal.
resample_output_buffer: Vec<f32>,
// State
is_speech_active: bool,
last_speech_time: u64, // In interleaved elements (frames * channels) — same unit as total_processed_samples
hangover_samples: u64, // Keep writing this long after the last detected activity (same unit)
// Waiting Mode: while true, nothing is written until a trigger fires
waiting_for_speech: bool,
// Ring Buffer (pre-roll, stores interleaved samples; ring_pos is the next write index)
ring_buffer: Vec<f32>,
ring_pos: usize,
ring_size: usize,
// Output: shared 16-bit PCM WAV sink
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
sample_rate: u32,
total_processed_samples: u64,
// Event Emission: last_event_time throttles "vad-event" emits
app_handle: Option<AppHandle>,
last_event_time: std::time::Instant,
// System Audio Queue for Mixing — pushed as interleaved stereo (L, R, ...) by sc_audio.rs
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
// Recording Mode ("voice" or "meeting")
recording_mode: String,
}
impl AudioProcessor {
    /// Builds a processor that records VAD-gated audio to `writer`.
    ///
    /// * `sample_rate` / `channel_count` — format of the microphone stream.
    /// * `writer` — shared 16-bit PCM WAV sink.
    /// * `app_handle` — used to emit "vad-event" / "auto-recording-triggered".
    /// * `wait_for_speech` — when true, nothing is written until a
    ///   mode-specific trigger fires (pre-roll is then flushed).
    /// * `recording_mode` — "voice" (trigger on mic speech) or "meeting"
    ///   (trigger on system-audio energy).
    ///
    /// Returns `Err` with a message if the VAD or resampler fails to build.
    pub fn new(
        sample_rate: u32,
        channel_count: u16,
        writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
        app_handle: AppHandle,
        wait_for_speech: bool,
        recording_mode: String,
    ) -> Result<Self, String> {
        // The VAD expects 16 kHz mono audio in 512-sample chunks.
        let vad_sample_rate = 16000;
        let vad_chunk_size = 512;
        let vad = VoiceActivityDetector::builder()
            .sample_rate(vad_sample_rate as u32)
            .chunk_size(vad_chunk_size)
            .build()
            .map_err(|e| format!("Failed to init VAD: {:?}", e))?;
        // Resampler: input rate -> 16 kHz, mono, fixed 1024-frame input blocks.
        let resampler = FastFixedIn::<f32>::new(
            16000.0 / sample_rate as f64,
            1.0,
            PolynomialDegree::Septic,
            1024,
            1,
        )
        .map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
        // Pre-roll buffer (3.0 seconds). WavWriter writes interleaved samples,
        // so the ring buffer stores interleaved elements (frames * channels).
        let ring_curr_seconds = 3.0;
        let ring_size =
            (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
        Ok(Self {
            vad,
            vad_chunk_size,
            vad_buffer: Vec::new(),
            channel_count,
            resampler,
            resample_input_buffer: Vec::new(),
            resample_output_buffer: Vec::new(),
            is_speech_active: false,
            last_speech_time: 0,
            // `total_processed_samples` counts interleaved elements
            // (frames * channels), so the 1.5 s hangover is expressed in the
            // same unit to keep the comparison in `process` consistent.
            hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
            waiting_for_speech: wait_for_speech,
            ring_buffer: vec![0.0; ring_size],
            ring_pos: 0,
            ring_size,
            writer,
            sample_rate,
            total_processed_samples: 0,
            app_handle: Some(app_handle),
            last_event_time: std::time::Instant::now(),
            system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
            recording_mode,
        })
    }

    /// Processes one microphone callback buffer (interleaved f32 samples):
    /// mixes queued system audio in, feeds a mono mixdown through the
    /// resampler into the VAD, and writes gated audio to the WAV file.
    pub fn process(&mut self, input_data: &[f32]) {
        // MIXING:
        // `input_data` is the microphone. `system_queue` carries system audio,
        // pushed as interleaved stereo (L, R, L, R, ...) by sc_audio.rs.
        // The microphone itself may be mono or multi-channel.
        let mic_channels = self.channel_count as usize;
        let mut mixed_data = input_data.to_vec();
        let mut max_system_energy = 0.0;
        let gain_mic = 1.0;
        let gain_sys = 0.8; // Slightly lower system audio to prioritize speaker
        if let Ok(mut queue) = self.system_queue.lock() {
            let frames = mixed_data.len() / mic_channels;
            for f in 0..frames {
                // system_queue is always stereo (L, R, L, R...)
                if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
                    // Track the loudest system sample seen in this buffer; it
                    // drives the "system is active" gates below.
                    let current_sys_max = l.abs().max(r.abs());
                    if current_sys_max > max_system_energy {
                        max_system_energy = current_sys_max;
                    }
                    if mic_channels == 1 {
                        // Mic is mono: mix system L+R down to mono.
                        let sys_mono = (l + r) / 2.0;
                        let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
                        mixed_data[f] = mixed.clamp(-1.0, 1.0);
                    } else {
                        // Mic has >= 2 channels: mix system L/R into the first
                        // two channels of the frame. BUGFIX: was `f * 2`,
                        // which writes the wrong interleaved positions for any
                        // mic with more than two channels.
                        let f_start = f * mic_channels;
                        let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
                        let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
                        mixed_data[f_start] = mixed_l.clamp(-1.0, 1.0);
                        mixed_data[f_start + 1] = mixed_r.clamp(-1.0, 1.0);
                    }
                }
            }
        }
        let data = &mixed_data;
        // 1. Append everything (speech or not) to the pre-roll ring buffer.
        for &sample in data {
            self.ring_buffer[self.ring_pos] = sample;
            self.ring_pos = (self.ring_pos + 1) % self.ring_size;
        }
        // 2. Mono mixdown for the VAD (average of the first two channels).
        let channels = self.channel_count as usize;
        let frame_count = data.len() / channels;
        let mut vad_input_chunk = Vec::with_capacity(frame_count);
        for i in 0..frame_count {
            let frame_start = i * channels;
            let mix_sample = if channels >= 2 {
                // Stereo (or more) -> average L + R
                (data[frame_start] + data[frame_start + 1]) / 2.0
            } else {
                // Mono -> take as is
                data[frame_start]
            };
            vad_input_chunk.push(mix_sample);
        }
        // 3. Resample the mixdown to 16 kHz for the VAD.
        self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
        let needed = self.resampler.input_frames_next();
        while self.resample_input_buffer.len() >= needed {
            let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
            let waves_in = vec![chunk];
            // Output capacity from the resampling ratio, padded for rounding.
            let out_capacity =
                (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10;
            let mut waves_out = vec![vec![0.0; out_capacity]; 1];
            if let Ok((_in_len, out_len)) =
                self.resampler.process_into_buffer(&waves_in, &mut waves_out, None)
            {
                if out_len > 0 {
                    self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
                }
            }
        }
        // 4. Run the VAD over every complete chunk buffered so far.
        while self.vad_buffer.len() >= self.vad_chunk_size {
            let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
            // `predict` takes the samples by value; no clone needed.
            let probability = self.vad.predict(vad_chunk);
            let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
            let is_speech = probability > 0.9;
            if is_speech || system_is_active {
                self.is_speech_active = true;
                self.last_speech_time = self.total_processed_samples;
            }
            // Emit a VAD event at most every 500 ms. Emitting also clears
            // `is_speech_active`, so the flag decays unless re-asserted above.
            if self.last_event_time.elapsed().as_millis() > 500 {
                if let Some(app) = &self.app_handle {
                    #[derive(Clone, serde::Serialize)]
                    struct VadEvent {
                        is_speech: bool,
                        probability: f32,
                    }
                    let _ = app.emit("vad-event", VadEvent {
                        probability,
                        is_speech: self.is_speech_active,
                    });
                }
                self.last_event_time = std::time::Instant::now();
                self.is_speech_active = false;
            }
        }
        // 5. Auto-start gate: while waiting, nothing is written until a
        // mode-specific trigger fires; the pre-roll buffer is then flushed.
        if self.waiting_for_speech {
            // TRIGGER CONDITION:
            // "voice"   -> VAD detected speech (someone is talking).
            // "meeting" -> system audio energy above threshold (call started).
            // Threshold 0.005 (~-46 dB) covers ringtones/speech while
            // ignoring silence and hiss. (Comment previously said 0.01,
            // which did not match the code.)
            let system_active = max_system_energy > 0.005;
            // Periodically log energy to help debug why meeting mode might not start.
            // NOTE(review): `last_event_time` is reset every ~500 ms by the VAD
            // emit above, so this 2000 ms gate may rarely fire — consider a
            // dedicated timer field if this log is needed.
            if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
                if let Some(app) = &self.app_handle {
                    emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
                }
            }
            let trigger = if self.recording_mode == "voice" {
                self.is_speech_active
            } else {
                system_active
            };
            if trigger {
                println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
                self.waiting_for_speech = false;
                // Flush the ring buffer in chronological order: oldest part
                // (ring_pos..end) first, then the wrapped part (0..ring_pos).
                let mut guard = self.writer.lock().unwrap();
                let amplitude = i16::MAX as f32;
                for i in (self.ring_pos..self.ring_size).chain(0..self.ring_pos) {
                    guard.write_sample((self.ring_buffer[i] * amplitude) as i16).ok();
                }
                // Notify the frontend that "real" recording started.
                if let Some(app) = &self.app_handle {
                    let _ = app.emit("auto-recording-triggered", ());
                }
            } else {
                // Still waiting; do not write to file.
                return;
            }
        }
        // 6. Standard recording: write while speech or system audio is active,
        // or while still inside the hangover window after the last activity.
        let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
        let system_is_active = max_system_energy > 0.005;
        if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
            let mut guard = self.writer.lock().unwrap();
            // Hoisted out of the loop: f32 -> 16-bit PCM conversion scale.
            let amplitude = i16::MAX as f32;
            for &sample in data {
                guard.write_sample((sample * amplitude) as i16).ok();
            }
        }
        self.total_processed_samples += data.len() as u64;
    }
}