feat: complete history, attendees list, and smart templates

This commit is contained in:
michael.borak
2026-01-20 15:00:56 +01:00
parent d266de942a
commit 52ccd7ee03
18 changed files with 2222 additions and 480 deletions

View File

@@ -0,0 +1,183 @@
use std::sync::{Arc, Mutex};
use tauri::{AppHandle, Emitter};
use cpal::Sample;
use hound::WavWriter;
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
use voice_activity_detector::VoiceActivityDetector;
pub struct AudioProcessor {
// VAD
vad: VoiceActivityDetector,
vad_chunk_size: usize,
vad_buffer: Vec<f32>,
// Resampler
resampler: FastFixedIn<f32>,
resample_input_buffer: Vec<f32>,
resample_output_buffer: Vec<f32>,
// State
is_speech_active: bool,
last_speech_time: u64, // In samples or frames
hangover_samples: u64,
// Ring Buffer (for pre-roll)
ring_buffer: Vec<f32>,
ring_pos: usize,
ring_size: usize,
// Output
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
sample_rate: u32,
total_processed_samples: u64,
// Event Emission
app_handle: Option<AppHandle>,
last_event_time: std::time::Instant,
}
impl AudioProcessor {
pub fn new(
sample_rate: u32,
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
app_handle: AppHandle
) -> Result<Self, String> {
let vad_sample_rate = 16000;
let vad_chunk_size = 512; // Silero usually likes ~30ms which is 512 at 16k? No 16000 * 0.032 = 512.
// Initialize VAD
let vad = VoiceActivityDetector::builder()
.sample_rate(vad_sample_rate as u32)
.chunk_size(vad_chunk_size)
.build()
.map_err(|e| format!("Failed to init VAD: {:?}", e))?;
// Initialize Resampler (Input Rate -> 16000) using FastFixedIn for speed/simplicity
// new(f_ratio, max_resample_ratio_relative, polyn_deg, chunk_size, channels)
let resampler = FastFixedIn::<f32>::new(
16000.0 / sample_rate as f64,
1.0,
PolynomialDegree::Septic,
1024,
1
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
// Pre-roll buffer (e.g. 0.5 seconds of high quality audio)
let ring_curr_seconds = 1.0;
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize;
Ok(Self {
vad,
vad_chunk_size,
vad_buffer: Vec::new(),
resampler,
resample_input_buffer: Vec::new(),
resample_output_buffer: Vec::new(),
is_speech_active: false,
last_speech_time: 0,
hangover_samples: (sample_rate as f32 * 1.5) as u64, // 1.5s hangover
ring_buffer: vec![0.0; ring_size],
ring_pos: 0,
ring_size,
writer,
sample_rate,
total_processed_samples: 0,
app_handle: Some(app_handle),
last_event_time: std::time::Instant::now(),
})
}
pub fn process(&mut self, data: &[f32]) {
// 1. Add to Ring Buffer (always, for pre-roll)
for &sample in data {
self.ring_buffer[self.ring_pos] = sample;
self.ring_pos = (self.ring_pos + 1) % self.ring_size;
}
// 2. Resample for VAD
// We append new data to input buffer for resampler
self.resample_input_buffer.extend_from_slice(data);
// Process in chunks compatible with resampler
// Actually rubato process_into_buffer needs waves of input.
// Simplified: SincFixedIn wants a fixed number of input frames?
// Docs: "retrieve result... input buffer must contain needed number of frames"
// SincFixedIn: "input buffer used for resampling... must receive a fixed number of frames"
// Wait, SincFixedIn is fixed INPUT size. SincFixedOut is fixed OUTPUT size.
// We want to feed whatever we get.
// For simplicity, let's use a simpler resampling strategy or accept rubato's constraints.
// Rubato SincFixedIn: we must provide `input_frames_next` frames.
// Let's defer strict resampling and just use decimation if sample rate is multiple?
// No, user devices vary.
// Handling Resampling properly:
let needed = self.resampler.input_frames_next();
while self.resample_input_buffer.len() >= needed {
let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
// Resample (mono)
let waves_in = vec![chunk];
// Allocate output (approx)
let mut waves_out = vec![vec![0.0; (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10]; 1]; // +10 padding
if let Ok((_in_len, out_len)) = self.resampler.process_into_buffer(&waves_in, &mut waves_out, None) {
if out_len > 0 {
self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
}
}
// Update output buffer usage... logic is tricky with drain.
}
// 3. Process VAD
while self.vad_buffer.len() >= self.vad_chunk_size {
let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
// Run Detection
let probability = self.vad.predict(vad_chunk);
let is_speech = probability > 0.5;
if is_speech {
self.is_speech_active = true;
self.last_speech_time = self.total_processed_samples;
}
// Emit VAD event periodically (every 500ms)
if self.last_event_time.elapsed().as_millis() > 500 {
if let Some(app) = &self.app_handle {
// Calculate crude RMS for visualization or just send probability
// Just sending probability is enough for now
#[derive(serde::Serialize, Clone)]
struct VadEvent {
probability: f32,
is_speech: bool,
}
let _ = app.emit("vad-event", VadEvent { probability, is_speech });
}
self.last_event_time = std::time::Instant::now();
}
}
// 4. Update Hangover and Check Write condition
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
if self.is_speech_active || time_since_speech < self.hangover_samples {
// We are recording!
// Check if we just started (transition)
// Ideally we dump the ring buffer here if we just switched state.
// Implementing perfect ring buffer dump is complex (need to track state changes better).
// MVP: Just Write Current Data if in state.
// Improvement: If we are in hangover, we just write.
// If we just detected speech (was not speech?), dump ring buffer?
// We'd need to know if we 'wrote' the ring buffer already.
// Simple Logic: just write all incoming data if (Now - LastSpeech < Hangover)
let mut guard = self.writer.lock().unwrap();
for &sample in data {
let amplitude = i16::MAX as f32;
guard.write_sample((sample * amplitude) as i16).ok();
}
}
self.total_processed_samples += data.len() as u64;
}
}