feat: complete history, attendees list, and smart templates
This commit is contained in:
183
src-tauri/src/audio_processor.rs
Normal file
183
src-tauri/src/audio_processor.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tauri::{AppHandle, Emitter};
|
||||
use cpal::Sample;
|
||||
use hound::WavWriter;
|
||||
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
|
||||
use voice_activity_detector::VoiceActivityDetector;
|
||||
|
||||
pub struct AudioProcessor {
|
||||
// VAD
|
||||
vad: VoiceActivityDetector,
|
||||
vad_chunk_size: usize,
|
||||
vad_buffer: Vec<f32>,
|
||||
|
||||
// Resampler
|
||||
resampler: FastFixedIn<f32>,
|
||||
resample_input_buffer: Vec<f32>,
|
||||
resample_output_buffer: Vec<f32>,
|
||||
|
||||
// State
|
||||
is_speech_active: bool,
|
||||
last_speech_time: u64, // In samples or frames
|
||||
hangover_samples: u64,
|
||||
|
||||
// Ring Buffer (for pre-roll)
|
||||
ring_buffer: Vec<f32>,
|
||||
ring_pos: usize,
|
||||
ring_size: usize,
|
||||
|
||||
// Output
|
||||
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
|
||||
sample_rate: u32,
|
||||
total_processed_samples: u64,
|
||||
// Event Emission
|
||||
app_handle: Option<AppHandle>,
|
||||
last_event_time: std::time::Instant,
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
pub fn new(
|
||||
sample_rate: u32,
|
||||
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
|
||||
app_handle: AppHandle
|
||||
) -> Result<Self, String> {
|
||||
let vad_sample_rate = 16000;
|
||||
let vad_chunk_size = 512; // Silero usually likes ~30ms which is 512 at 16k? No 16000 * 0.032 = 512.
|
||||
|
||||
// Initialize VAD
|
||||
let vad = VoiceActivityDetector::builder()
|
||||
.sample_rate(vad_sample_rate as u32)
|
||||
.chunk_size(vad_chunk_size)
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to init VAD: {:?}", e))?;
|
||||
|
||||
// Initialize Resampler (Input Rate -> 16000) using FastFixedIn for speed/simplicity
|
||||
// new(f_ratio, max_resample_ratio_relative, polyn_deg, chunk_size, channels)
|
||||
let resampler = FastFixedIn::<f32>::new(
|
||||
16000.0 / sample_rate as f64,
|
||||
1.0,
|
||||
PolynomialDegree::Septic,
|
||||
1024,
|
||||
1
|
||||
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
|
||||
|
||||
// Pre-roll buffer (e.g. 0.5 seconds of high quality audio)
|
||||
let ring_curr_seconds = 1.0;
|
||||
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize;
|
||||
|
||||
Ok(Self {
|
||||
vad,
|
||||
vad_chunk_size,
|
||||
vad_buffer: Vec::new(),
|
||||
resampler,
|
||||
resample_input_buffer: Vec::new(),
|
||||
resample_output_buffer: Vec::new(),
|
||||
is_speech_active: false,
|
||||
last_speech_time: 0,
|
||||
hangover_samples: (sample_rate as f32 * 1.5) as u64, // 1.5s hangover
|
||||
ring_buffer: vec![0.0; ring_size],
|
||||
ring_pos: 0,
|
||||
ring_size,
|
||||
writer,
|
||||
sample_rate,
|
||||
total_processed_samples: 0,
|
||||
app_handle: Some(app_handle),
|
||||
last_event_time: std::time::Instant::now(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn process(&mut self, data: &[f32]) {
|
||||
// 1. Add to Ring Buffer (always, for pre-roll)
|
||||
for &sample in data {
|
||||
self.ring_buffer[self.ring_pos] = sample;
|
||||
self.ring_pos = (self.ring_pos + 1) % self.ring_size;
|
||||
}
|
||||
|
||||
// 2. Resample for VAD
|
||||
// We append new data to input buffer for resampler
|
||||
self.resample_input_buffer.extend_from_slice(data);
|
||||
|
||||
// Process in chunks compatible with resampler
|
||||
// Actually rubato process_into_buffer needs waves of input.
|
||||
// Simplified: SincFixedIn wants a fixed number of input frames?
|
||||
// Docs: "retrieve result... input buffer must contain needed number of frames"
|
||||
// SincFixedIn: "input buffer used for resampling... must receive a fixed number of frames"
|
||||
// Wait, SincFixedIn is fixed INPUT size. SincFixedOut is fixed OUTPUT size.
|
||||
// We want to feed whatever we get.
|
||||
// For simplicity, let's use a simpler resampling strategy or accept rubato's constraints.
|
||||
// Rubato SincFixedIn: we must provide `input_frames_next` frames.
|
||||
|
||||
// Let's defer strict resampling and just use decimation if sample rate is multiple?
|
||||
// No, user devices vary.
|
||||
|
||||
// Handling Resampling properly:
|
||||
let needed = self.resampler.input_frames_next();
|
||||
while self.resample_input_buffer.len() >= needed {
|
||||
let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
|
||||
// Resample (mono)
|
||||
let waves_in = vec![chunk];
|
||||
// Allocate output (approx)
|
||||
let mut waves_out = vec![vec![0.0; (needed as f64 * (16000.0 / self.sample_rate as f64)).ceil() as usize + 10]; 1]; // +10 padding
|
||||
|
||||
if let Ok((_in_len, out_len)) = self.resampler.process_into_buffer(&waves_in, &mut waves_out, None) {
|
||||
if out_len > 0 {
|
||||
self.vad_buffer.extend_from_slice(&waves_out[0][0..out_len]);
|
||||
}
|
||||
}
|
||||
// Update output buffer usage... logic is tricky with drain.
|
||||
}
|
||||
|
||||
// 3. Process VAD
|
||||
while self.vad_buffer.len() >= self.vad_chunk_size {
|
||||
let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
|
||||
// Run Detection
|
||||
let probability = self.vad.predict(vad_chunk);
|
||||
let is_speech = probability > 0.5;
|
||||
|
||||
if is_speech {
|
||||
self.is_speech_active = true;
|
||||
self.last_speech_time = self.total_processed_samples;
|
||||
}
|
||||
|
||||
// Emit VAD event periodically (every 500ms)
|
||||
if self.last_event_time.elapsed().as_millis() > 500 {
|
||||
if let Some(app) = &self.app_handle {
|
||||
// Calculate crude RMS for visualization or just send probability
|
||||
// Just sending probability is enough for now
|
||||
#[derive(serde::Serialize, Clone)]
|
||||
struct VadEvent {
|
||||
probability: f32,
|
||||
is_speech: bool,
|
||||
}
|
||||
let _ = app.emit("vad-event", VadEvent { probability, is_speech });
|
||||
}
|
||||
self.last_event_time = std::time::Instant::now();
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Update Hangover and Check Write condition
|
||||
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
|
||||
|
||||
if self.is_speech_active || time_since_speech < self.hangover_samples {
|
||||
// We are recording!
|
||||
// Check if we just started (transition)
|
||||
// Ideally we dump the ring buffer here if we just switched state.
|
||||
// Implementing perfect ring buffer dump is complex (need to track state changes better).
|
||||
// MVP: Just Write Current Data if in state.
|
||||
|
||||
// Improvement: If we are in hangover, we just write.
|
||||
// If we just detected speech (was not speech?), dump ring buffer?
|
||||
// We'd need to know if we 'wrote' the ring buffer already.
|
||||
|
||||
// Simple Logic: just write all incoming data if (Now - LastSpeech < Hangover)
|
||||
|
||||
let mut guard = self.writer.lock().unwrap();
|
||||
for &sample in data {
|
||||
let amplitude = i16::MAX as f32;
|
||||
guard.write_sample((sample * amplitude) as i16).ok();
|
||||
}
|
||||
}
|
||||
|
||||
self.total_processed_samples += data.len() as u64;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user