//! Audio processor: VAD-gated WAV recording with system-audio mixing.
use std::sync::{Arc, Mutex};
|
|
use tauri::{AppHandle, Emitter};
|
|
use crate::emit_log;
|
|
use cpal::Sample;
|
|
use hound::WavWriter;
|
|
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
|
|
use voice_activity_detector::VoiceActivityDetector;
|
|
|
|
/// Gated audio recorder: mixes microphone and system audio, runs voice
/// activity detection on a 16 kHz mono mixdown, and writes interleaved
/// i16 samples to a shared `WavWriter` only while activity (plus a
/// hangover window) is present.
pub struct AudioProcessor {
    // VAD
    // Detector fed 16 kHz mono audio in fixed-size chunks.
    vad: VoiceActivityDetector,
    // Samples per VAD inference (512 at 16 kHz).
    vad_chunk_size: usize,
    // Accumulates resampled mono samples until a full VAD chunk is ready.
    vad_buffer: Vec<f32>,

    // Audio Config
    // Channel count of the incoming microphone stream (1 = mono, 2 = stereo).
    channel_count: u16,

    // Resampler
    // Converts the mono mixdown from the input rate to 16 kHz for the VAD.
    resampler: FastFixedIn<f32>,
    // Pending mono frames not yet consumed by the resampler.
    resample_input_buffer: Vec<f32>,
    // NOTE(review): not read or written by the visible processing path — confirm
    // whether this is dead state before removing.
    resample_output_buffer: Vec<f32>,

    // State
    // True once speech (or system-audio energy) was seen; periodically reset
    // when the VAD status event is emitted.
    is_speech_active: bool,
    // Value of `total_processed_samples` at the last detected activity,
    // counted in interleaved elements (frames * channels).
    last_speech_time: u64, // In samples or frames
    // How long to keep writing after activity stops, in the same units.
    hangover_samples: u64,

    // Waiting Mode
    // While true, nothing is written to the file until the trigger fires.
    waiting_for_speech: bool,

    // Ring Buffer (for pre-roll)
    // ~3 s of interleaved samples, flushed to the writer when triggered.
    ring_buffer: Vec<f32>,
    // Next write position (the oldest sample lives at this index).
    ring_pos: usize,
    ring_size: usize,

    // Output
    // Shared WAV sink; samples are scaled to i16 before writing.
    writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
    sample_rate: u32,
    // Running count of interleaved elements processed (frames * channels).
    total_processed_samples: u64,

    // Event Emission
    app_handle: Option<AppHandle>,
    // Throttles "vad-event" emission (and the waiting-mode debug log).
    last_event_time: std::time::Instant,

    // System Audio Queue for Mixing
    // Interleaved stereo (L, R, L, R...) samples pushed by the system-audio
    // capture; drained here and mixed into the mic stream.
    pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,

    // Recording Mode (voice or meeting)
    recording_mode: String,
}
impl AudioProcessor {
|
|
pub fn new(
|
|
sample_rate: u32,
|
|
channel_count: u16,
|
|
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
|
|
app_handle: AppHandle,
|
|
wait_for_speech: bool,
|
|
recording_mode: String,
|
|
) -> Result<Self, String> {
|
|
let vad_sample_rate = 16000;
|
|
let vad_chunk_size = 512;
|
|
|
|
// Initialize VAD
|
|
let vad = VoiceActivityDetector::builder()
|
|
.sample_rate(vad_sample_rate as u32)
|
|
.chunk_size(vad_chunk_size)
|
|
.build()
|
|
.map_err(|e| format!("Failed to init VAD: {:?}", e))?;
|
|
|
|
// Initialize Resampler (Input Rate -> 16000)
|
|
let resampler = FastFixedIn::<f32>::new(
|
|
16000.0 / sample_rate as f64,
|
|
1.0,
|
|
PolynomialDegree::Septic,
|
|
1024,
|
|
1
|
|
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
|
|
|
|
// Pre-roll buffer (3.0 seconds) * Channels (interleaved store)
|
|
let ring_curr_seconds = 3.0;
|
|
// WavWriter writes interleaved, so we store interleaved.
|
|
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
|
|
|
|
Ok(Self {
|
|
vad,
|
|
vad_chunk_size,
|
|
vad_buffer: Vec::new(),
|
|
channel_count,
|
|
resampler,
|
|
resample_input_buffer: Vec::new(),
|
|
resample_output_buffer: Vec::new(),
|
|
is_speech_active: false,
|
|
last_speech_time: 0,
|
|
// Hangover counts "processed samples" which are actually frames * channels in current logic?
|
|
// Actually total_processed_samples usually counts FRAMES in audio terminology, but here we count elements.
|
|
// Let's stick to elements to match existing logic logic.
|
|
hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
|
|
waiting_for_speech: wait_for_speech,
|
|
ring_buffer: vec![0.0; ring_size],
|
|
ring_pos: 0,
|
|
ring_size,
|
|
writer,
|
|
sample_rate,
|
|
total_processed_samples: 0,
|
|
app_handle: Some(app_handle),
|
|
last_event_time: std::time::Instant::now(),
|
|
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
|
|
recording_mode,
|
|
})
|
|
}
|
|
|
|
pub fn process(&mut self, input_data: &[f32]) {
|
|
// MIXING LOGIC:
|
|
// We have `input_data` (Microphone). We check `system_queue` for System Audio.
|
|
// System Audio is hardcoded to 2 channels (Stereo) in sc_audio.rs.
|
|
// Microphone `self.channel_count` can be 1 (Mono) or 2 (Stereo).
|
|
|
|
let mic_channels = self.channel_count as usize;
|
|
let mut mixed_data = input_data.to_vec();
|
|
let mut max_system_energy = 0.0;
|
|
|
|
let gain_mic = 1.0;
|
|
let gain_sys = 0.8; // Slightly lower system audio to prioritize speaker
|
|
|
|
if let Ok(mut queue) = self.system_queue.lock() {
|
|
let frames = mixed_data.len() / mic_channels;
|
|
|
|
for f in 0..frames {
|
|
// system_queue is always stereo (L, R, L, R...)
|
|
if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
|
|
let abs_l = l.abs();
|
|
let abs_r = r.abs();
|
|
let current_sys_max = if abs_l > abs_r { abs_l } else { abs_r };
|
|
if current_sys_max > max_system_energy {
|
|
max_system_energy = current_sys_max;
|
|
}
|
|
|
|
if mic_channels == 1 {
|
|
// Mic is Mono: Mix System L+R down to Mono
|
|
let sys_mono = (l + r) / 2.0;
|
|
let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
|
|
mixed_data[f] = mixed.max(-1.0).min(1.0);
|
|
} else {
|
|
// Mic is Stereo: Mix L-to-L and R-to-R
|
|
let f_start = f * 2;
|
|
let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
|
|
let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
|
|
mixed_data[f_start] = mixed_l.max(-1.0).min(1.0);
|
|
mixed_data[f_start + 1] = mixed_r.max(-1.0).min(1.0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let data = &mixed_data;
|
|
|
|
// 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
|
|
for &sample in data {
|
|
self.ring_buffer[self.ring_pos] = sample;
|
|
self.ring_pos = (self.ring_pos + 1) % self.ring_size;
|
|
}
|
|
|
|
// 2. Prepare VAD Signal (Mono Mixdown)
|
|
// FRESH START LOGIC (v0.2.0):
|
|
// We expect standard Stereo Input.
|
|
|
|
let channels = self.channel_count as usize;
|
|
let frame_count = data.len() / channels;
|
|
let mut vad_input_chunk = Vec::with_capacity(frame_count);
|
|
|
|
for i in 0..frame_count {
|
|
let frame_start = i * channels;
|
|
|
|
let mix_sample = if channels >= 2 {
|
|
// Stereo -> Average L + R
|
|
(data[frame_start] + data[frame_start + 1]) / 2.0
|
|
} else {
|
|
// Mono -> Take as is
|
|
data[frame_start]
|
|
};
|
|
|
|
vad_input_chunk.push(mix_sample);
|
|
}
|
|
|
|
|
|
// 3. Resample for VAD
|
|
self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
|
|
|
|
let needed = self.resampler.input_frames_next();
|
|
while self.resample_input_buffer.len() >= needed {
|
|
let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
|
|
// Resample (mono)
|
|
let waves_in = vec![chunk];
|
|
// Allocate output (approx)
|
|
let mut waves_out = vec![vec![0.0; (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10]; 1]; // +10 padding
|
|
|
|
if let Ok((_in_len, out_len)) = self.resampler.process_into_buffer(&waves_in, &mut waves_out, None) {
|
|
if out_len > 0 {
|
|
self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// 4. Process VAD
|
|
while self.vad_buffer.len() >= self.vad_chunk_size {
|
|
let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
|
|
// Run Detection
|
|
let probability = self.vad.predict(vad_chunk.clone());
|
|
|
|
let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
|
|
let is_speech = probability > 0.9;
|
|
|
|
if is_speech || system_is_active {
|
|
self.is_speech_active = true;
|
|
self.last_speech_time = self.total_processed_samples;
|
|
}
|
|
|
|
// Emit VAD event periodically
|
|
if self.last_event_time.elapsed().as_millis() > 500 {
|
|
if let Some(app) = &self.app_handle {
|
|
#[derive(Clone, serde::Serialize)]
|
|
struct VadEvent {
|
|
is_speech: bool,
|
|
probability: f32,
|
|
}
|
|
let _ = app.emit("vad-event", VadEvent {
|
|
probability,
|
|
is_speech: self.is_speech_active,
|
|
});
|
|
}
|
|
self.last_event_time = std::time::Instant::now();
|
|
self.is_speech_active = false;
|
|
}
|
|
}
|
|
|
|
|
|
// 4. Update Hangover and Check Write condition
|
|
if self.waiting_for_speech {
|
|
// TRIGGER CONDITION:
|
|
// 1. VAD says speech (Someone is talking)
|
|
// 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
|
|
// Threshold 0.01 is roughly -40dB, should cover ringtones/speech easily but ignore silence/hiss.
|
|
|
|
let system_active = max_system_energy > 0.005;
|
|
|
|
// Periodically log energy to help debug why meeting mode might not start
|
|
if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
|
|
if let Some(app) = &self.app_handle {
|
|
emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
|
|
}
|
|
}
|
|
|
|
// MODE-SPECIFIC TRIGGER LOGIC:
|
|
// "voice" -> Trigger if user speaks (is_speech_active)
|
|
// "meeting" -> Trigger ONLY if system audio energy detected (Call starting)
|
|
let trigger = if self.recording_mode == "voice" {
|
|
self.is_speech_active
|
|
} else {
|
|
system_active
|
|
};
|
|
|
|
if trigger {
|
|
// Trigger Detected!
|
|
println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
|
|
self.waiting_for_speech = false;
|
|
|
|
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
|
|
let mut guard = self.writer.lock().unwrap();
|
|
let amplitude = i16::MAX as f32;
|
|
|
|
// Part 1: ring_pos to end
|
|
for i in self.ring_pos..self.ring_size {
|
|
let sample = self.ring_buffer[i];
|
|
guard.write_sample((sample * amplitude) as i16).ok();
|
|
}
|
|
// Part 2: 0 to ring_pos
|
|
for i in 0..self.ring_pos {
|
|
let sample = self.ring_buffer[i];
|
|
guard.write_sample((sample * amplitude) as i16).ok();
|
|
}
|
|
|
|
// Emit event to notify frontend that "real" recording started
|
|
if let Some(app) = &self.app_handle {
|
|
let _ = app.emit("auto-recording-triggered", ());
|
|
}
|
|
|
|
} else {
|
|
// Still waiting, do not write to file.
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Standard Recording Logic (Active or Hangover)
|
|
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
|
|
|
|
// We write to file if:
|
|
// 1. VAD thinks someone is speaking (Mic or System)
|
|
// 2. OR System audio energy is currently above threshold (Ensures calls are captured)
|
|
// 3. OR we are within the hangover period
|
|
let system_is_active = max_system_energy > 0.005;
|
|
|
|
if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
|
|
let mut guard = self.writer.lock().unwrap();
|
|
for &sample in data {
|
|
let amplitude = i16::MAX as f32;
|
|
guard.write_sample((sample * amplitude) as i16).ok();
|
|
}
|
|
}
|
|
|
|
self.total_processed_samples += data.len() as u64;
|
|
}
|
|
}
|