use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Emitter};
use crate::emit_log;
use cpal::Sample;
use hound::WavWriter;
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
use voice_activity_detector::VoiceActivityDetector;

pub struct AudioProcessor {
    // VAD
    vad: VoiceActivityDetector,
    vad_chunk_size: usize,
    vad_buffer: Vec<f32>,

    // Audio Config
    channel_count: u16,

    // Resampler
    resampler: FastFixedIn<f32>,
    resample_input_buffer: Vec<f32>,
    resample_output_buffer: Vec<f32>,

    // State
    is_speech_active: bool,
    last_speech_time: u64, // In interleaved samples (same unit as total_processed_samples)
    hangover_samples: u64,

    // Waiting Mode
    waiting_for_speech: bool,

    // Ring Buffer (for pre-roll)
    ring_buffer: Vec<f32>,
    ring_pos: usize,
    ring_size: usize,

    // Output
    writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
    sample_rate: u32,
    total_processed_samples: u64,

    // Event Emission
    app_handle: Option<AppHandle>,
    last_event_time: std::time::Instant,

    // System Audio Queue for Mixing
    pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,

    // Recording Mode (voice or meeting)
    recording_mode: String,
}

impl AudioProcessor {
    pub fn new(
        sample_rate: u32,
        channel_count: u16,
        writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
        app_handle: AppHandle,
        wait_for_speech: bool,
        recording_mode: String,
    ) -> Result<Self, String> {
        let vad_sample_rate = 16000;
        let vad_chunk_size = 512;

        // Initialize VAD
        let vad = VoiceActivityDetector::builder()
            .sample_rate(vad_sample_rate as u32)
            .chunk_size(vad_chunk_size)
            .build()
            .map_err(|e| format!("Failed to init VAD: {:?}", e))?;

        // Initialize Resampler (Input Rate -> 16000)
        let resampler = FastFixedIn::<f32>::new(
            16000.0 / sample_rate as f64,
            1.0,
            PolynomialDegree::Septic,
            1024,
            1,
        )
        .map_err(|e| format!("Failed to init Resampler: {:?}", e))?;

        // Pre-roll buffer (3.0 seconds) * Channels (interleaved store).
        // WavWriter writes interleaved, so we store interleaved.
        let ring_curr_seconds = 3.0;
        let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;

        Ok(Self {
            vad,
            vad_chunk_size,
            vad_buffer: Vec::new(),
            channel_count,
            resampler,
            resample_input_buffer: Vec::new(),
            resample_output_buffer: Vec::new(),
            is_speech_active: false,
            last_speech_time: 0,
            // Hangover is measured in interleaved elements (frames * channels) so it can be
            // compared directly against total_processed_samples, which also counts elements.
            hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
            waiting_for_speech: wait_for_speech,
            ring_buffer: vec![0.0; ring_size],
            ring_pos: 0,
            ring_size,
            writer,
            sample_rate,
            total_processed_samples: 0,
            app_handle: Some(app_handle),
            last_event_time: std::time::Instant::now(),
            system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
            recording_mode,
        })
    }

    pub fn process(&mut self, input_data: &[f32]) {
        // MIXING LOGIC:
        // We have `input_data` (Microphone). We check `system_queue` for System Audio.
        // System Audio is hardcoded to 2 channels (Stereo) in sc_audio.rs.
        // Microphone `self.channel_count` can be 1 (Mono) or 2 (Stereo).
        let mic_channels = self.channel_count as usize;
        let mut mixed_data = input_data.to_vec();
        let mut max_system_energy = 0.0;

        let gain_mic = 1.0;
        let gain_sys = 0.8; // Slightly lower system audio to prioritize the speaker

        if let Ok(mut queue) = self.system_queue.lock() {
            let frames = mixed_data.len() / mic_channels;
            for f in 0..frames {
                // system_queue is always stereo (L, R, L, R...)
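                // Worked example of the mix-and-clamp below (illustrative values only):
                //   mic = 0.5, sys_l = 0.9  ->  0.5 * 1.0 + 0.9 * 0.8 = 1.22  ->  clamped to 1.0
                // The clamp keeps mixed samples inside [-1.0, 1.0] before the i16 scaling later on.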
                if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
                    let abs_l = l.abs();
                    let abs_r = r.abs();
                    let current_sys_max = if abs_l > abs_r { abs_l } else { abs_r };
                    if current_sys_max > max_system_energy {
                        max_system_energy = current_sys_max;
                    }

                    if mic_channels == 1 {
                        // Mic is Mono: Mix System L+R down to Mono
                        let sys_mono = (l + r) / 2.0;
                        let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
                        mixed_data[f] = mixed.max(-1.0).min(1.0);
                    } else {
                        // Mic is Stereo: Mix L-to-L and R-to-R
                        let f_start = f * 2;
                        let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
                        let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
                        mixed_data[f_start] = mixed_l.max(-1.0).min(1.0);
                        mixed_data[f_start + 1] = mixed_r.max(-1.0).min(1.0);
                    }
                }
            }
        }

        let data = &mixed_data;

        // 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
        for &sample in data {
            self.ring_buffer[self.ring_pos] = sample;
            self.ring_pos = (self.ring_pos + 1) % self.ring_size;
        }

        // 2. Prepare VAD Signal (Mono Mixdown)
        // FRESH START LOGIC (v0.2.0):
        // We expect standard Stereo Input.
        let channels = self.channel_count as usize;
        let frame_count = data.len() / channels;
        let mut vad_input_chunk = Vec::with_capacity(frame_count);

        for i in 0..frame_count {
            let frame_start = i * channels;
            let mix_sample = if channels >= 2 {
                // Stereo -> Average L + R
                (data[frame_start] + data[frame_start + 1]) / 2.0
            } else {
                // Mono -> Take as is
                data[frame_start]
            };
            vad_input_chunk.push(mix_sample);
        }

        // 3. Resample for VAD
        self.resample_input_buffer.extend_from_slice(&vad_input_chunk);

        let needed = self.resampler.input_frames_next();
        while self.resample_input_buffer.len() >= needed {
            let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();

            // Resample (mono)
            let waves_in = vec![chunk];
            // Allocate output (approx, +10 samples of padding)
            let mut waves_out =
                vec![vec![0.0; (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10]; 1];

            if let Ok((_in_len, out_len)) =
                self.resampler.process_into_buffer(&waves_in, &mut waves_out, None)
            {
                if out_len > 0 {
                    self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
                }
            }
        }

        // 4. Process VAD
        while self.vad_buffer.len() >= self.vad_chunk_size {
            let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();

            // Run Detection
            let probability = self.vad.predict(vad_chunk.clone());
            let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
            let is_speech = probability > 0.9;

            if is_speech || system_is_active {
                self.is_speech_active = true;
                self.last_speech_time = self.total_processed_samples;
            }

            // Emit VAD event periodically
            if self.last_event_time.elapsed().as_millis() > 500 {
                if let Some(app) = &self.app_handle {
                    #[derive(Clone, serde::Serialize)]
                    struct VadEvent {
                        is_speech: bool,
                        probability: f32,
                    }
                    let _ = app.emit("vad-event", VadEvent {
                        probability,
                        is_speech: self.is_speech_active,
                    });
                }
                self.last_event_time = std::time::Instant::now();
                self.is_speech_active = false;
            }
        }

        // 5. Update Hangover and Check Write condition
        if self.waiting_for_speech {
            // TRIGGER CONDITION (mode-specific, see below):
            // - VAD says speech (someone is talking), and/or
            // - System Audio has energy (audio is coming from the PC, i.e. a call started).
            // Threshold 0.005 is roughly -46 dB, which should cover ringtones/speech easily but ignore silence/hiss.
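            // (For reference, dBFS = 20 * log10(amplitude): 20 * log10(0.005) ≈ -46 dB,
            //  while the earlier 0.01 threshold sat at roughly -40 dB.)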
            let system_active = max_system_energy > 0.005;

            // Periodically log energy to help debug why meeting mode might not start
            if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
                if let Some(app) = &self.app_handle {
                    emit_log(
                        app,
                        "DEBUG",
                        &format!(
                            "Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)",
                            max_system_energy
                        ),
                    );
                }
            }

            // MODE-SPECIFIC TRIGGER LOGIC:
            // "voice"   -> Trigger if user speaks (is_speech_active)
            // "meeting" -> Trigger ONLY if system audio energy detected (Call starting)
            let trigger = if self.recording_mode == "voice" {
                self.is_speech_active
            } else {
                system_active
            };

            if trigger {
                // Trigger Detected!
                println!(
                    "Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...",
                    max_system_energy
                );
                self.waiting_for_speech = false;

                // Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
                let mut guard = self.writer.lock().unwrap();
                let amplitude = i16::MAX as f32;

                // Part 1: ring_pos to end
                for i in self.ring_pos..self.ring_size {
                    let sample = self.ring_buffer[i];
                    guard.write_sample((sample * amplitude) as i16).ok();
                }
                // Part 2: 0 to ring_pos
                for i in 0..self.ring_pos {
                    let sample = self.ring_buffer[i];
                    guard.write_sample((sample * amplitude) as i16).ok();
                }

                // Emit event to notify frontend that "real" recording started
                if let Some(app) = &self.app_handle {
                    let _ = app.emit("auto-recording-triggered", ());
                }
            } else {
                // Still waiting, do not write to file.
                return;
            }
        }

        // Standard Recording Logic (Active or Hangover)
        let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);

        // We write to file if:
        // 1. VAD thinks someone is speaking (Mic or System)
        // 2. OR System audio energy is currently above threshold (Ensures calls are captured)
        // 3. OR we are within the hangover period
        let system_is_active = max_system_energy > 0.005;

        if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
            let mut guard = self.writer.lock().unwrap();
            for &sample in data {
                let amplitude = i16::MAX as f32;
                guard.write_sample((sample * amplitude) as i16).ok();
            }
        }

        self.total_processed_samples += data.len() as u64;
    }
}
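
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, kept commented out so it does not affect
// the build). It assumes a cpal f32 input stream and that `device`, `config`,
// `writer`, and `app_handle` already exist in the capture code that owns this
// processor; adjust names to match the actual call site.
//
// let processor = Arc::new(Mutex::new(AudioProcessor::new(
//     config.sample_rate.0,          // e.g. 48_000
//     config.channels,               // 1 (mono) or 2 (stereo) microphone
//     writer.clone(),                // shared hound::WavWriter
//     app_handle.clone(),
//     true,                          // wait_for_speech: arm auto-start
//     "meeting".to_string(),         // or "voice"
// )?));
//
// let proc_cb = processor.clone();
// let stream = device.build_input_stream(
//     &config,
//     move |data: &[f32], _| {
//         if let Ok(mut p) = proc_cb.lock() {
//             p.process(data); // feed interleaved mic samples
//         }
//     },
//     |err| eprintln!("input stream error: {err}"),
//     None,
// )?;
// stream.play()?;
// ---------------------------------------------------------------------------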