feat: refine meeting auto-start, silence timeout (25s) and improve transcription logging

This commit is contained in:
michael.borak
2026-01-24 14:16:55 +01:00
parent a3e4fa4ec7
commit 9a65f42f51
5 changed files with 135 additions and 112 deletions

View File

@@ -1,5 +1,6 @@
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Emitter};
use crate::emit_log;
use cpal::Sample;
use hound::WavWriter;
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
@@ -42,6 +43,9 @@ pub struct AudioProcessor {
// System Audio Queue for Mixing
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
// Recording Mode (voice or meeting)
recording_mode: String,
}
impl AudioProcessor {
@@ -50,7 +54,8 @@ impl AudioProcessor {
channel_count: u16,
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
app_handle: AppHandle,
wait_for_speech: bool
wait_for_speech: bool,
recording_mode: String,
) -> Result<Self, String> {
let vad_sample_rate = 16000;
let vad_chunk_size = 512;
@@ -100,30 +105,51 @@ impl AudioProcessor {
app_handle: Some(app_handle),
last_event_time: std::time::Instant::now(),
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
recording_mode,
})
}
pub fn process(&mut self, input_data: &[f32]) {
// MIXING LOGIC:
// We have `input_data` (Microphone). We check `system_queue` for System Audio.
// We mix them: Out = Mic + System.
// System Audio is hardcoded to 2 channels (Stereo) in sc_audio.rs.
// Microphone `self.channel_count` can be 1 (Mono) or 2 (Stereo).
let mic_channels = self.channel_count as usize;
let mut mixed_data = input_data.to_vec();
let mut max_system_energy = 0.0;
let gain_mic = 1.0;
let gain_sys = 0.8; // Slightly lower system audio to prioritize speaker
if let Ok(mut queue) = self.system_queue.lock() {
for i in 0..mixed_data.len() {
if let Some(sys_sample) = queue.pop_front() {
// Track system energy for trigger logic
let abs_sample = sys_sample.abs();
if abs_sample > max_system_energy {
max_system_energy = abs_sample;
}
// Simple addition mixing with clamping to avoid clipping
let mixed = mixed_data[i] + sys_sample;
mixed_data[i] = mixed.max(-1.0).min(1.0);
}
}
let frames = mixed_data.len() / mic_channels;
for f in 0..frames {
// system_queue is always stereo (L, R, L, R...)
if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
let abs_l = l.abs();
let abs_r = r.abs();
let current_sys_max = if abs_l > abs_r { abs_l } else { abs_r };
if current_sys_max > max_system_energy {
max_system_energy = current_sys_max;
}
if mic_channels == 1 {
// Mic is Mono: Mix System L+R down to Mono
let sys_mono = (l + r) / 2.0;
let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
mixed_data[f] = mixed.max(-1.0).min(1.0);
} else {
// Mic is Stereo: Mix L-to-L and R-to-R
let f_start = f * 2;
let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
mixed_data[f_start] = mixed_l.max(-1.0).min(1.0);
mixed_data[f_start + 1] = mixed_r.max(-1.0).min(1.0);
}
}
}
}
let data = &mixed_data;
@@ -181,11 +207,7 @@ impl AudioProcessor {
// Run Detection
let probability = self.vad.predict(vad_chunk.clone());
// Hybrid VAD: Probability > 0.9 OR System Audio Active
// We want to keep recording if there is meaningful audio from the system (Call in progress),
// even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
let is_speech = probability > 0.9;
if is_speech || system_is_active {
@@ -219,23 +241,23 @@ impl AudioProcessor {
// 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
// Threshold 0.005 is roughly -46dB, should cover ringtones/speech easily but ignore silence/hiss.
let system_active = max_system_energy > 0.01;
let system_active = max_system_energy > 0.005;
// Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
// We trust VAD for speech. But we also trust "Loud System Sound" = Call.
// If system is consistently loud, it's likely a call.
// For now, Strict Mode:
// Trigger if: (Speech Detected) AND (System Audio Present)
// This prevents "User talking alone" -> No trigger (System silent).
// This allows "Partner talking" -> Trigger (Speech + System).
// What about Ringtone? Ringtone has energy but maybe no speech.
// If we want to record the ringtone, we should trigger on `system_active` alone?
// Requirement (German): "erst wenn der Call startet" — i.e. record only once the call starts, which usually means ringing.
// Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
// Periodically log energy to help debug why meeting mode might not start
if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
if let Some(app) = &self.app_handle {
emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
}
}
// MODE-SPECIFIC TRIGGER LOGIC:
// "voice" -> Trigger if user speaks (is_speech_active)
// "meeting" -> Trigger ONLY if system audio energy detected (Call starting)
let trigger = if self.recording_mode == "voice" {
self.is_speech_active
} else {
system_active
};
if trigger {
// Trigger Detected!
@@ -271,7 +293,13 @@ impl AudioProcessor {
// Standard Recording Logic (Active or Hangover)
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
if self.is_speech_active || time_since_speech < self.hangover_samples {
// We write to file if:
// 1. VAD thinks someone is speaking (Mic or System)
// 2. OR System audio energy is currently above threshold (Ensures calls are captured)
// 3. OR we are within the hangover period
let system_is_active = max_system_energy > 0.005;
if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
let mut guard = self.writer.lock().unwrap();
for &sample in data {
let amplitude = i16::MAX as f32;