Release 1.1.0: Add Import Audio Files feature

- New Import tab with drag-and-drop support for audio files
- Support for 8 formats: MP3, MP4, WAV, M4A, FLAC, OGG, AAC, WMA
- File metadata display (duration, size, format)
- Editable meeting titles
- Progress tracking with visual indicators
- Smart template selection
- Auto-navigation to Transcription view
- Updated README with BlackHole requirement and Teams config
- Added get_audio_metadata Rust command
- Version bump to 1.1.0
michael.borak
2026-01-21 09:08:56 +01:00
parent 79f509951c
commit a06e473e85
12 changed files with 1041 additions and 171 deletions

View File

@@ -11,6 +11,9 @@ pub struct AudioProcessor {
vad_chunk_size: usize,
vad_buffer: Vec<f32>,
// Audio Config
channel_count: u16,
// Resampler
resampler: FastFixedIn<f32>,
resample_input_buffer: Vec<f32>,
@@ -21,6 +24,9 @@ pub struct AudioProcessor {
last_speech_time: u64, // In samples or frames
hangover_samples: u64,
// Waiting Mode
waiting_for_speech: bool,
// Ring Buffer (for pre-roll)
ring_buffer: Vec<f32>,
ring_pos: usize,
@@ -37,12 +43,14 @@ pub struct AudioProcessor {
impl AudioProcessor {
pub fn new(
sample_rate: u32,
channel_count: u16,
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
app_handle: AppHandle
app_handle: AppHandle,
wait_for_speech: bool
) -> Result<Self, String> {
let vad_sample_rate = 16000;
let vad_chunk_size = 512; // Silero expects ~32 ms windows: 16000 Hz * 0.032 s = 512 samples.
let vad_chunk_size = 512;
// Initialize VAD
let vad = VoiceActivityDetector::builder()
@@ -51,8 +59,7 @@ impl AudioProcessor {
.build()
.map_err(|e| format!("Failed to init VAD: {:?}", e))?;
// Initialize Resampler (Input Rate -> 16000) using FastFixedIn for speed/simplicity
// new(f_ratio, max_resample_ratio_relative, polyn_deg, chunk_size, channels)
// Initialize Resampler (Input Rate -> 16000)
let resampler = FastFixedIn::<f32>::new(
16000.0 / sample_rate as f64,
1.0,
@@ -61,20 +68,26 @@ impl AudioProcessor {
1
).map_err(|e| format!("Failed to init Resampler: {:?}", e))?;
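// For context, rubato documents FastFixedIn::new as
//   new(resample_ratio, max_resample_ratio_relative, interpolation_type, chunk_size, nbr_channels)
// so a complete call for e.g. a 48 kHz device feeding the 16 kHz VAD might look like
// (interpolation degree and chunk size are assumed values, not taken from this diff):
//   FastFixedIn::<f32>::new(16000.0 / 48000.0, 1.0, PolynomialDegree::Linear, 1024, 1)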
// Pre-roll buffer (e.g. 0.5 seconds of high quality audio)
// Pre-roll buffer (1.0 seconds) * Channels (interleaved store)
let ring_curr_seconds = 1.0;
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize;
// WavWriter writes interleaved, so we store interleaved.
let ring_size = (sample_rate as f32 * ring_curr_seconds) as usize * channel_count as usize;
Ok(Self {
vad,
vad_chunk_size,
vad_buffer: Vec::new(),
channel_count,
resampler,
resample_input_buffer: Vec::new(),
resample_output_buffer: Vec::new(),
is_speech_active: false,
last_speech_time: 0,
hangover_samples: (sample_rate as f32 * 1.5) as u64, // 1.5s hangover
// Hangover is tracked in interleaved elements (frames * channels) rather than
// frames, because total_processed_samples counts elements in the current logic;
// we keep counting elements here so the two stay comparable.
hangover_samples: (sample_rate as f32 * 1.5 * channel_count as f32) as u64,
waiting_for_speech: wait_for_speech,
ring_buffer: vec![0.0; ring_size],
ring_pos: 0,
ring_size,
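// Worked example of the sizing above, assuming a 48 kHz, 2-channel BlackHole device:
//   ring_size        = 48_000 * 1.0 s * 2 ch =  96_000 interleaved samples of pre-roll
//   hangover_samples = 48_000 * 1.5 s * 2 ch = 144_000 interleaved samples (1.5 s tail)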
@@ -87,30 +100,39 @@ impl AudioProcessor {
}
pub fn process(&mut self, data: &[f32]) {
// 1. Add to Ring Buffer (always, for pre-roll)
// 1. Add to Ring Buffer (Interleaved data - Record EVERYTHING)
for &sample in data {
self.ring_buffer[self.ring_pos] = sample;
self.ring_pos = (self.ring_pos + 1) % self.ring_size;
}
// 2. Resample for VAD
// We append new data to input buffer for resampler
self.resample_input_buffer.extend_from_slice(data);
// 2. Prepare VAD Signal (Mono Mixdown)
// FRESH START LOGIC (v0.2.0):
// We expect standard Stereo Input (BlackHole 2ch).
// No magic 3-channel aggregate.
// Rubato's fixed-input resamplers (FastFixedIn / SincFixedIn) require exactly
// `input_frames_next()` input frames per call, so we buffer incoming audio and
// feed it to the resampler in chunks of that size (step 3 below).
let channels = self.channel_count as usize;
let frame_count = data.len() / channels;
let mut vad_input_chunk = Vec::with_capacity(frame_count);
for i in 0..frame_count {
let frame_start = i * channels;
let mix_sample = if channels >= 2 {
// Stereo -> Average L + R
(data[frame_start] + data[frame_start + 1]) / 2.0
} else {
// Mono -> Take as is
data[frame_start]
};
vad_input_chunk.push(mix_sample);
}
// 3. Resample for VAD
self.resample_input_buffer.extend_from_slice(&vad_input_chunk);
// Plain decimation would only work when the device rate is an exact multiple
// of 16 kHz; user devices vary, so resample properly:
let needed = self.resampler.input_frames_next();
while self.resample_input_buffer.len() >= needed {
let chunk: Vec<f32> = self.resample_input_buffer.drain(0..needed).collect();
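// Sketch of the loop body hidden by the hunk boundary below (an assumption, not
// the verbatim code): hand the drained mono chunk to rubato and append the
// 16 kHz output to the VAD buffer. Needs `use rubato::Resampler;` in scope.
match self.resampler.process(&[chunk], None) {
    Ok(output) => self.vad_buffer.extend_from_slice(&output[0]), // channel 0 = mono
    Err(e) => eprintln!("Resample error: {:?}", e),
}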
@@ -127,63 +149,87 @@ impl AudioProcessor {
// Update output buffer usage... logic is tricky with drain.
}
// 3. Process VAD
// 4. Process VAD
while self.vad_buffer.len() >= self.vad_chunk_size {
let vad_chunk: Vec<f32> = self.vad_buffer.drain(0..self.vad_chunk_size).collect();
// Run Detection
let probability = self.vad.predict(vad_chunk.clone());
// Calculate RMS for this chunk to use as fallback/hybrid detection
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
// Hybrid VAD: Probability > 0.4 OR RMS > 0.005 (approx -46dB)
let is_speech = probability > 0.4 || rms > 0.005;
// Hybrid VAD: Probability > 0.8 OR RMS > 0.015
// INCREASED THRESHOLDS (v1.9.0):
// Now that routing works, we must filter out system notifications (beeps) and noise floor.
let is_speech = probability > 0.8 || rms > 0.015;
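// For scale: an RMS of 0.015 full scale is about 20*log10(0.015) ≈ -36 dBFS,
// versus the previous threshold of 0.005 ≈ -46 dBFS noted above.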
if is_speech {
self.is_speech_active = true;
self.last_speech_time = self.total_processed_samples;
}
// Emit VAD event periodically (every 500ms)
// Emit VAD event periodically (every 500ms is enough for non-diagnostic mode)
if self.last_event_time.elapsed().as_millis() > 500 {
// Calculate simple RMS of the current chunk for debugging
let sq_sum: f32 = vad_chunk.iter().map(|x| x * x).sum();
let rms = (sq_sum / vad_chunk.len() as f32).sqrt();
// Print debug info to stdout (viewable in terminal)
println!("VAD Debug: Prob={:.4}, RMS={:.6}, Speech={}", probability, rms, is_speech);
if let Some(app) = &self.app_handle {
// Just sending probability is enough for now
#[derive(serde::Serialize, Clone)]
#[derive(Clone, serde::Serialize)]
struct VadEvent {
probability: f32,
is_speech: bool,
probability: f32,
}
let _ = app.emit("vad-event", VadEvent { probability, is_speech });
let _ = app.emit("vad-event", VadEvent {
probability,
is_speech: self.is_speech_active,
});
}
self.last_event_time = std::time::Instant::now();
// IMPORTANT: reset is_speech_active after emitting so it cannot latch forever
// once the user stops talking. The chunk loop above re-asserts it whenever the
// current chunk contains speech, so in effect this is a latch that holds for
// one emit interval (~500 ms).
self.is_speech_active = false;
}
}
// 4. Update Hangover and Check Write condition
if self.waiting_for_speech {
if self.is_speech_active {
// Trigger Detected!
println!("Auto-Start: Speech detected. Flushing pre-roll...");
self.waiting_for_speech = false;
// Flush Ring Buffer (Orderly: from ring_pos to end, then 0 to ring_pos)
let mut guard = self.writer.lock().unwrap();
let amplitude = i16::MAX as f32;
// Part 1: ring_pos to end
for i in self.ring_pos..self.ring_size {
let sample = self.ring_buffer[i];
guard.write_sample((sample * amplitude) as i16).ok();
}
// Part 2: 0 to ring_pos
for i in 0..self.ring_pos {
let sample = self.ring_buffer[i];
guard.write_sample((sample * amplitude) as i16).ok();
}
// Emit event to notify frontend that "real" recording started
if let Some(app) = &self.app_handle {
let _ = app.emit("auto-recording-triggered", ());
}
} else {
// Still waiting, do not write to file.
return;
}
}
// Standard Recording Logic (Active or Hangover)
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
if self.is_speech_active || time_since_speech < self.hangover_samples {
// We are recording (speech is active or we are inside the hangover window).
// Ideally the ring buffer would also be flushed here on a silence -> speech
// transition, but that requires tracking whether the pre-roll was already
// written, which is more state than this MVP keeps. Simple rule for now:
// write all incoming data whenever (now - last_speech_time) < hangover_samples.
let mut guard = self.writer.lock().unwrap();
for &sample in data {
let amplitude = i16::MAX as f32;
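// The diff is truncated here; based on the pre-roll flush earlier in this file,
// the loop presumably continues along these lines (a sketch, not the verbatim code):
//     guard.write_sample((sample * amplitude) as i16).ok();
// }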

View File

@@ -1,4 +1,9 @@
use tauri::{AppHandle, Manager, State, Emitter};
use tauri::{
AppHandle, Manager, State, Emitter,
menu::{Menu, MenuItem},
tray::{TrayIconBuilder, TrayIconEvent},
WindowEvent
};
use std::sync::{Arc, Mutex};
use std::process::Command;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
@@ -65,7 +70,7 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {
#[tauri::command]
fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>) -> Result<(), String> {
fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
let host = cpal::default_host();
@@ -77,16 +82,17 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
.or_else(|| host.default_input_device())
.ok_or("No input device found")?;
let config = device.default_input_config().map_err(|e| e.to_string())?;
// VAD models typically expect 16 kHz or 8 kHz input; Silero likes 16 kHz, so we
// either pick a device rate it supports or resample. A simple energy-based VAD
// would be lighter than Silero/ONNX, but it cannot ignore music, which is the
// whole point - hence the Silero-based voice_activity_detector crate.
// Select the configuration with the MAXIMUM number of channels
// This is crucial for "Hearbit Audio" (Aggregate) which lists 3 channels but might default to 2.
// We want the raw 3 channels to separate Mic (Ch0) from System (Ch1+2).
let supported_configs = device.supported_input_configs().map_err(|e| e.to_string())?;
let config = supported_configs
.max_by_key(|c| c.channels())
.map(|c| c.with_max_sample_rate())
.ok_or("No supported input configurations found")?;
emit_log(&app, "INFO", &format!("Selected Audio Config: {} Channels, {} Hz", config.channels(), config.sample_rate()));
let spec = hound::WavSpec {
channels: config.channels(),
sample_rate: config.sample_rate(),
@@ -122,7 +128,12 @@ fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String
// Initialize AudioProcessor (VAD)
// We pass the writer to it.
let processor = AudioProcessor::new(config.sample_rate(), writer.clone(), app.clone())
let should_wait = wait_for_speech.unwrap_or(false);
if should_wait {
emit_log(&app, "INFO", "Recording started in WAITING mode (buffer-only until speech).");
}
let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait)
.map_err(|e| format!("Failed to create AudioProcessor: {}", e))?;
// Wrap processor in Arc<Mutex> so we can share/move it into callback
@@ -560,6 +571,62 @@ async fn summarize_text(app: AppHandle, text: String, api_key: String, product_i
}
}
#[derive(serde::Serialize)]
struct AudioMetadata {
duration: f64,
size: u64,
format: String,
}
#[tauri::command]
fn get_audio_metadata(app: AppHandle, file_path: String) -> Result<AudioMetadata, String> {
emit_log(&app, "INFO", &format!("Getting metadata for: {}", file_path));
// Get file size
let metadata = std::fs::metadata(&file_path).map_err(|e| e.to_string())?;
let size = metadata.len();
// Extract format from extension
let path = std::path::Path::new(&file_path);
let format = path.extension()
.and_then(|e| e.to_str())
.unwrap_or("unknown")
.to_string();
// Get duration using ffprobe (requires ffmpeg to be installed)
let duration = match Command::new("ffprobe")
.args([
"-v", "error",
"-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1",
&file_path
])
.output()
{
Ok(output) => {
if output.status.success() {
let duration_str = String::from_utf8_lossy(&output.stdout);
duration_str.trim().parse::<f64>().unwrap_or(0.0)
} else {
emit_log(&app, "WARN", "ffprobe failed, duration = 0");
0.0
}
},
Err(_) => {
emit_log(&app, "WARN", "ffprobe not found, duration = 0");
0.0
}
};
emit_log(&app, "SUCCESS", &format!("Metadata: {}s, {} bytes", duration, size));
Ok(AudioMetadata {
duration,
size,
format,
})
}
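// For reference, with serde's default field naming the command's result crosses
// the Tauri bridge as a plain JSON object; illustrative values only:
//   { "duration": 1834.56, "size": 29360128, "format": "mp3" }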
#[tauri::command]
fn open_audio_midi_setup() -> Result<(), String> {
Command::new("open")
@@ -640,6 +707,49 @@ async fn read_log_file(app: AppHandle) -> Result<String, String> {
#[cfg_attr(mobile, tauri::mobile_entry_point)]
pub fn run() {
tauri::Builder::default()
.setup(|app| {
// Setup Tray Icon
let quit_i = MenuItem::with_id(app, "quit", "Quit Hearbit AI", true, None::<&str>).unwrap();
let show_i = MenuItem::with_id(app, "show", "Show Window", true, None::<&str>).unwrap();
let menu = Menu::with_items(app, &[&show_i, &quit_i]).unwrap();
let _tray = TrayIconBuilder::new()
.icon(app.default_window_icon().unwrap().clone())
.menu(&menu)
.show_menu_on_left_click(true)
.on_menu_event(|app, event| {
match event.id.as_ref() {
"quit" => app.exit(0),
"show" => {
if let Some(window) = app.get_webview_window("main") {
let _ = window.show();
let _ = window.set_focus();
}
}
_ => {}
}
})
.on_tray_icon_event(|tray, event| {
if let TrayIconEvent::Click { .. } = event {
let app = tray.app_handle();
if let Some(window) = app.get_webview_window("main") {
let _ = window.show();
let _ = window.set_focus();
}
}
})
.build(app)?;
Ok(())
})
.on_window_event(|window, event| {
if let WindowEvent::CloseRequested { api, .. } = event {
// Prevent window from closing, just hide it
window.hide().unwrap();
api.prevent_close();
}
})
.plugin(tauri_plugin_shell::init())
.plugin(tauri_plugin_log::Builder::default()
.targets([
tauri_plugin_log::Target::new(tauri_plugin_log::TargetKind::Stdout),
@@ -670,6 +780,7 @@ pub fn run() {
auth::get_calendar_events,
save_text_file,
read_log_file,
get_audio_metadata,
email::send_smtp_email
])
.run(tauri::generate_context!())