feat(v1.2.0): Final Release - Native Audio, Smart VAD, Auto-Loop & Quality Fixes
- Implemented standard 48kHz audio pipeline to fix sample rate mismatch/distortion
- Added Native System Audio (ScreenCaptureKit) support
- Implemented Smart VAD (Voice Activity Detection) with Auto-Start on valid audio
- Added Auto-Loop: Automatically re-arms recording after stop
- Added Empty Guard: Prevents transcribing silent recordings (< 20s empty)
- Increased Pre-Roll buffer to 3.0s to prevent cut-off speech
- Fixed clipping with clamped audio mixing
This commit is contained in:
@@ -39,6 +39,9 @@ pub struct AudioProcessor {
|
||||
// Event Emission
|
||||
app_handle: Option<AppHandle>,
|
||||
last_event_time: std::time::Instant,
|
||||
|
||||
// System Audio Queue for Mixing
|
||||
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
@@ -68,8 +71,8 @@ impl AudioProcessor {
|
||||
1
|
||||
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
|
||||
|
||||
// Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
|
||||
let ring_curr_seconds = 1.0;
|
||||
// Pre-roll buffer (3.0 seconds) * Channels (interleaved store)
|
||||
let ring_curr_seconds = 3.0;
|
||||
// WavWriter writes interleaved, so we store interleaved.
|
||||
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
|
||||
|
||||
@@ -96,10 +99,35 @@ impl AudioProcessor {
|
||||
total_processed_samples: 0,
|
||||
app_handle: Some(app_handle),
|
||||
last_event_time: std::time::Instant::now(),
|
||||
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn process(&mut self, data: &[f32]) {
|
||||
pub fn process(&mut self, input_data: &[f32]) {
|
||||
// MIXING LOGIC:
|
||||
// We have `input_data` (Microphone). We check `system_queue` for System Audio.
|
||||
// We mix them: Out = Mic + System.
|
||||
let mut mixed_data = input_data.to_vec();
|
||||
let mut max_system_energy = 0.0;
|
||||
|
||||
if let Ok(mut queue) = self.system_queue.lock() {
|
||||
for i in 0..mixed_data.len() {
|
||||
if let Some(sys_sample) = queue.pop_front() {
|
||||
// Track system energy for trigger logic
|
||||
let abs_sample = sys_sample.abs();
|
||||
if abs_sample > max_system_energy {
|
||||
max_system_energy = abs_sample;
|
||||
}
|
||||
|
||||
// Simple addition mixing with clamping to avoid clipping
|
||||
let mixed = mixed_data[i] + sys_sample;
|
||||
mixed_data[i] = mixed.max(-1.0).min(1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data = &mixed_data;
|
||||
|
||||
// 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
|
||||
for &sample in data {
|
||||
self.ring_buffer[self.ring_pos] = sample;
|
||||
@@ -108,8 +136,7 @@ impl AudioProcessor {
|
||||
|
||||
// 2. Prepare VAD Signal (Mono Mixdown)
|
||||
// FRESH START LOGIC (v0.2.0):
|
||||
// We expect standard Stereo Input (BlackHole 2ch).
|
||||
// No magic 3-channel aggregate.
|
||||
// We expect standard Stereo Input.
|
||||
|
||||
let channels = self.channel_count as usize;
|
||||
let frame_count = data.len() / channels;
|
||||
@@ -146,7 +173,6 @@ impl AudioProcessor {
|
||||
self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
|
||||
}
|
||||
}
|
||||
// Update output buffer usage... logic is tricky with drain.
|
||||
}
|
||||
|
||||
// 4. Process VAD
|
||||
@@ -155,21 +181,19 @@ impl AudioProcessor {
|
||||
// Run Detection
|
||||
let probability = self.vad.predict(vad_chunk.clone());
|
||||
|
||||
// Calculate RMS for this chunk to use as fallback/hybrid detection
|
||||
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
|
||||
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
|
||||
// Hybrid VAD: Probability > 0.9 OR System Audio Active
|
||||
// We want to keep recording if there is meaningful audio from the system (Call in progress),
|
||||
// even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
|
||||
|
||||
let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
|
||||
let is_speech = probability > 0.9;
|
||||
|
||||
// Hybrid VAD: Probability > 0.9 OR RMS > 0.025
|
||||
// INCREASED THRESHOLDS (v1.1.1):
|
||||
// Reduced sensitivity to avoid background noise triggering recording.
|
||||
let is_speech = probability > 0.9 || rms > 0.025;
|
||||
|
||||
if is_speech {
|
||||
if is_speech || system_is_active {
|
||||
self.is_speech_active = true;
|
||||
self.last_speech_time = self.total_processed_samples;
|
||||
}
|
||||
|
||||
// Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
|
||||
// Emit VAD event periodically
|
||||
if self.last_event_time.elapsed().as_millis() > 500 {
|
||||
if let Some(app) = &self.app_handle {
|
||||
#[derive(Clone, serde::Serialize)]
|
||||
@@ -183,11 +207,6 @@ impl AudioProcessor {
|
||||
});
|
||||
}
|
||||
self.last_event_time = std::time::Instant::now();
|
||||
|
||||
// IMPORTANT: We reset is_speech_active after emitting,
|
||||
// so we don't latch it forever if the user stops talking.
|
||||
// However, the main loop sets it to true if current chunk is speech.
|
||||
// This logic is a bit of a "latch for X ms".
|
||||
self.is_speech_active = false;
|
||||
}
|
||||
}
|
||||
@@ -195,9 +214,32 @@ impl AudioProcessor {
|
||||
|
||||
// 4. Update Hangover and Check Write condition
|
||||
if self.waiting_for_speech {
|
||||
if self.is_speech_active {
|
||||
// TRIGGER CONDITION:
|
||||
// 1. VAD says speech (Someone is talking)
|
||||
// 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
|
||||
// Threshold 0.01 is roughly -40dB, should cover ringtones/speech easily but ignore silence/hiss.
|
||||
|
||||
let system_active = max_system_energy > 0.01;
|
||||
|
||||
// Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
|
||||
// We trust VAD for speech. But we also trust "Loud System Sound" = Call.
|
||||
// If system is consistently loud, it's likely a call.
|
||||
|
||||
// For now, Strict Mode:
|
||||
// Trigger if: (Speech Detected) AND (System Audio Present)
|
||||
// This prevents "User talking alone" -> No trigger (System silent).
|
||||
// This allows "Partner talking" -> Trigger (Speech + System).
|
||||
|
||||
// What about Ringtone? Ringtone has energy but maybe no speech.
|
||||
// If we want to record the ringtone, we should trigger on `system_active` alone?
|
||||
// "erst wenn der Call startet" (German: "only once the call starts") -> usually ringing.
|
||||
// Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
|
||||
|
||||
let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
|
||||
|
||||
if trigger {
|
||||
// Trigger Detected!
|
||||
println!("Auto-Start: Speech detected. Flushing pre-roll...");
|
||||
println!("Auto-Start: Call detected (SysEnergy: {}). Flushing pre-roll...", max_system_energy);
|
||||
self.waiting_for_speech = false;
|
||||
|
||||
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
|
||||
|
||||
Reference in New Issue
Block a user