Release 1.1.0: Add Import Audio Files feature
- New Import tab with drag-and-drop support for audio files
- Support for 8 formats: MP3, MP4, WAV, M4A, FLAC, OGG, AAC, WMA
- File metadata display (duration, size, format)
- Editable meeting titles
- Progress tracking with visual indicators
- Smart template selection
- Auto-navigation to Transcription view
- Updated README with BlackHole requirement and Teams config
- Added get_audio_metadata Rust command
- Version bump to 1.1.0
This commit is contained in:
@@ -11,6 +11,9 @@ pub struct AudioProcessor {
|
||||
vad_chunk_size: usize,
|
||||
vad_buffer: Vec<f32>,
|
||||
|
||||
// Audio Config
|
||||
channel_count: u16,
|
||||
|
||||
// Resampler
|
||||
resampler: FastFixedIn<f32>,
|
||||
resample_input_buffer: Vec<f32>,
|
||||
@@ -21,6 +24,9 @@ pub struct AudioProcessor {
|
||||
last_speech_time: u64, // In samples or frames
|
||||
hangover_samples: u64,
|
||||
|
||||
// Waiting Mode
|
||||
waiting_for_speech: bool,
|
||||
|
||||
// Ring Buffer (for pre-roll)
|
||||
ring_buffer: Vec<f32>,
|
||||
ring_pos: usize,
|
||||
@@ -37,12 +43,14 @@ pub struct AudioProcessor {
|
||||
|
||||
impl AudioProcessor {
|
||||
pub fn new(
|
||||
sample_rate: u32,
|
||||
sample_rate: u32,
|
||||
channel_count: u16,
|
||||
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
|
||||
app_handle: AppHandle
|
||||
app_handle: AppHandle,
|
||||
wait_for_speech: bool
|
||||
) -> Result<Self, String> {
|
||||
let vad_sample_rate = 16000;
|
||||
let vad_chunk_size = 512; // Silero usually likes ~30ms which is 512 at 16k? No 16000 * 0.032 = 512.
|
||||
let vad_chunk_size = 512;
|
||||
|
||||
// Initialize VAD
|
||||
let vad = VoiceActivityDetector::builder()
|
||||
@@ -51,8 +59,7 @@ impl AudioProcessor {
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to init VAD: {:?}", e))?;
|
||||
|
||||
// Initialize Resampler (Input Rate -> 16000) using FastFixedIn for speed/simplicity
|
||||
// new(f_ratio, max_resample_ratio_relative, polyn_deg, chunk_size, channels)
|
||||
// Initialize Resampler (Input Rate -> 16000)
|
||||
let resampler = FastFixedIn::<f32>::new(
|
||||
16000.0 / sample_rate as f64,
|
||||
1.0,
|
||||
@@ -61,20 +68,26 @@ impl AudioProcessor {
|
||||
1
|
||||
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
|
||||
|
||||
// Pre-roll buffer (e.g. 0.5 seconds of high quality audio)
|
||||
// Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
|
||||
let ring_curr_seconds = 1.0;
|
||||
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize;
|
||||
// WavWriter writes interleaved, so we store interleaved.
|
||||
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
|
||||
|
||||
Ok(Self {
|
||||
vad,
|
||||
vad_chunk_size,
|
||||
vad_buffer: Vec::new(),
|
||||
channel_count,
|
||||
resampler,
|
||||
resample_input_buffer: Vec::new(),
|
||||
resample_output_buffer: Vec::new(),
|
||||
is_speech_active: false,
|
||||
last_speech_time: 0,
|
||||
hangover_samples: (sample_rate as f32 * 1.5) as u64, // 1.5s hangover
|
||||
// Hangover counts "processed samples" which are actually frames * channels in current logic?
|
||||
// Actually total_processed_samples usually counts FRAMES in audio terminology, but here we count elements.
|
||||
// Let's stick to elements to match the existing logic.
|
||||
hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
|
||||
waiting_for_speech: wait_for_speech,
|
||||
ring_buffer: vec![0.0; ring_size],
|
||||
ring_pos: 0,
|
||||
ring_size,
|
||||
@@ -87,30 +100,39 @@ impl AudioProcessor {
|
||||
}
|
||||
|
||||
pub fn process(&mut self, data: &[f32]) {
|
||||
// 1. Add to Ring Buffer (always, for pre-roll)
|
||||
// 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
|
||||
for &sample in data {
|
||||
self.ring_buffer[self.ring_pos] = sample;
|
||||
self.ring_pos = (self.ring_pos + 1) % self.ring_size;
|
||||
}
|
||||
|
||||
// 2. Resample for VAD
|
||||
// We append new data to input buffer for resampler
|
||||
self.resample_input_buffer.extend_from_slice(data);
|
||||
// 2. Prepare VAD Signal (Mono Mixdown)
|
||||
// FRESH START LOGIC (v0.2.0):
|
||||
// We expect standard Stereo Input (BlackHole 2ch).
|
||||
// No magic 3-channel aggregate.
|
||||
|
||||
// Process in chunks compatible with resampler
|
||||
// Actually rubato process_into_buffer needs waves of input.
|
||||
// Simplified: SincFixedIn wants a fixed number of input frames?
|
||||
// Docs: "retrieve result... input buffer must contain needed number of frames"
|
||||
// SincFixedIn: "input buffer used for resampling... must receive a fixed number of frames"
|
||||
// Wait, SincFixedIn is fixed INPUT size. SincFixedOut is fixed OUTPUT size.
|
||||
// We want to feed whatever we get.
|
||||
// For simplicity, let's use a simpler resampling strategy or accept rubato's constraints.
|
||||
// Rubato SincFixedIn: we must provide `input_frames_next` frames.
|
||||
let channels = self.channel_count as usize;
|
||||
let frame_count = data.len() / channels;
|
||||
let mut vad_input_chunk = Vec::with_capacity(frame_count);
|
||||
|
||||
for i in 0..frame_count {
|
||||
let frame_start = i * channels;
|
||||
|
||||
let mix_sample = if channels >= 2 {
|
||||
// Stereo -> Average L + R
|
||||
(data[frame_start] + data[frame_start + 1]) / 2.0
|
||||
} else {
|
||||
// Mono -> Take as is
|
||||
data[frame_start]
|
||||
};
|
||||
|
||||
vad_input_chunk.push(mix_sample);
|
||||
}
|
||||
|
||||
|
||||
// 3. Resample for VAD
|
||||
self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
|
||||
|
||||
// Let's defer strict resampling and just use decimation if sample rate is multiple?
|
||||
// No, user devices vary.
|
||||
|
||||
// Handling Resampling properly:
|
||||
let needed = self.resampler.input_frames_next();
|
||||
while self.resample_input_buffer.len() >= needed {
|
||||
let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
|
||||
@@ -127,63 +149,87 @@ impl AudioProcessor {
|
||||
// Update output buffer usage... logic is tricky with drain.
|
||||
}
|
||||
|
||||
// 3. Process VAD
|
||||
// 4. Process VAD
|
||||
while self.vad_buffer.len() >= self.vad_chunk_size {
|
||||
let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
|
||||
// Run Detection
|
||||
// Run Detection
|
||||
let probability = self.vad.predict(vad_chunk.clone());
|
||||
|
||||
// Calculate RMS for this chunk to use as fallback/hybrid detection
|
||||
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
|
||||
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
|
||||
|
||||
// Hybrid VAD: Probability > 0.4 OR RMS > 0.005 (approx -46dB)
|
||||
let is_speech = probability > 0.4 || rms > 0.005;
|
||||
// Hybrid VAD: Probability > 0.8 OR RMS > 0.015
|
||||
// INCREASED THRESHOLDS (v1.9.0):
|
||||
// Now that routing works, we must filter out system notifications (beeps) and noise floor.
|
||||
let is_speech = probability > 0.8 || rms > 0.015;
|
||||
|
||||
if is_speech {
|
||||
self.is_speech_active = true;
|
||||
self.last_speech_time = self.total_processed_samples;
|
||||
}
|
||||
|
||||
// Emit VAD event periodically (every 500ms)
|
||||
// Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
|
||||
if self.last_event_time.elapsed().as_millis() > 500 {
|
||||
// Calculate simple RMS of the current chunk for debugging
|
||||
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
|
||||
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
|
||||
|
||||
// Print debug info to stdout (viewable in terminal)
|
||||
println!("VAD Debug: Prob={:.4}, RMS={:.6}, Speech={}", probability, rms, is_speech);
|
||||
|
||||
if let Some(app) = &self.app_handle {
|
||||
// Just sending probability is enough for now
|
||||
#[derive(serde::Serialize, Clone)]
|
||||
#[derive(Clone, serde::Serialize)]
|
||||
struct VadEvent {
|
||||
probability: f32,
|
||||
is_speech: bool,
|
||||
probability: f32,
|
||||
}
|
||||
let _ = app.emit("vad-event", VadEvent { probability, is_speech });
|
||||
let _ = app.emit("vad-event", VadEvent {
|
||||
probability,
|
||||
is_speech: self.is_speech_active,
|
||||
});
|
||||
}
|
||||
self.last_event_time = std::time::Instant::now();
|
||||
|
||||
// IMPORTANT: We reset is_speech_active after emitting,
|
||||
// so we don't latch it forever if the user stops talking.
|
||||
// However, the main loop sets it to true if current chunk is speech.
|
||||
// This logic is a bit of a "latch for X ms".
|
||||
self.is_speech_active = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 4. Update Hangover and Check Write condition
|
||||
if self.waiting_for_speech {
|
||||
if self.is_speech_active {
|
||||
// Trigger Detected!
|
||||
println!("Auto-Start: Speech detected. Flushing pre-roll...");
|
||||
self.waiting_for_speech = false;
|
||||
|
||||
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
|
||||
let mut guard = self.writer.lock().unwrap();
|
||||
let amplitude = i16::MAX as f32;
|
||||
|
||||
// Part 1: ring_pos to end
|
||||
for i in self.ring_pos..self.ring_size {
|
||||
let sample = self.ring_buffer[i];
|
||||
guard.write_sample((sample * amplitude) as i16).ok();
|
||||
}
|
||||
// Part 2: 0 to ring_pos
|
||||
for i in 0..self.ring_pos {
|
||||
let sample = self.ring_buffer[i];
|
||||
guard.write_sample((sample * amplitude) as i16).ok();
|
||||
}
|
||||
|
||||
// Emit event to notify frontend that "real" recording started
|
||||
if let Some(app) = &self.app_handle {
|
||||
let _ = app.emit("auto-recording-triggered", ());
|
||||
}
|
||||
|
||||
} else {
|
||||
// Still waiting, do not write to file.
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Standard Recording Logic (Active or Hangover)
|
||||
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
|
||||
|
||||
if self.is_speech_active || time_since_speech < self.hangover_samples {
|
||||
// We are recording!
|
||||
// Check if we just started (transition)
|
||||
// Ideally we dump the ring buffer here if we just switched state.
|
||||
// Implementing perfect ring buffer dump is complex (need to track state changes better).
|
||||
// MVP: Just Write Current Data if in state.
|
||||
|
||||
// Improvement: If we are in hangover, we just write.
|
||||
// If we just detected speech (was not speech?), dump ring buffer?
|
||||
// We'd need to know if we 'wrote' the ring buffer already.
|
||||
|
||||
// Simple Logic: just write all incoming data if (Now - LastSpeech < Hangover)
|
||||
|
||||
let mut guard = self.writer.lock().unwrap();
|
||||
for &sample in data {
|
||||
let amplitude = i16::MAX as f32;
|
||||
|
||||
Reference in New Issue
Block a user