feat: refine meeting auto-start, silence timeout (25s) and improve transcription logging
This commit is contained in:
23
README.md
23
README.md
@@ -13,6 +13,7 @@
|
||||
* **Noise Filtering**: Ignores typing and background noise.
|
||||
* **🛡️ Empty Audio Guard**: Automatically discards silent recordings (e.g., false triggers) to save API costs and prevent errors.
|
||||
* **✨ 48kHz Crystal Clear Audio**: Optimized audio pipeline prevents "robot voice" distortion.
|
||||
* **💾 Daily Security Backups**: Automatically saves your entire history as a standard JSON file every 24 hours (unencrypted for easy recovery).
|
||||
|
||||
## 🚀 Key Features
|
||||
|
||||
@@ -40,13 +41,23 @@
|
||||
|
||||
### 3. Recording a Meeting
|
||||
1. **Select Mode**: Choose "Meeting" (captures Mic + System) or "Voice Memo" (Mic only).
|
||||
2. **Auto-Start (Recommended)**: Check "Auto-start when audio detected".
|
||||
2. **Auto-Start Logic**:
|
||||
- **Meeting Mode**: Triggers only when the call actually starts (system audio detected).
|
||||
- **Voice Memo**: Triggers immediately when you start speaking.
|
||||
3. **Standby**: Click "Standby (Auto-Start)". The app waits silently.
|
||||
4. **Join Call**: Join your Teams/Zoom call.
|
||||
5. **Trigger**: As soon as someone speaks, Hearbit starts recording automatically.
|
||||
6. **Finish**: When the call ends (silence > 20s), Hearbit stops, transcribes, summarizes, and **goes back to Standby** for the next call.
|
||||
5. **Trigger**: Hearbit starts recording automatically based on the selected mode.
|
||||
6. **Finish**: When the call ends (silence > 25s), Hearbit stops, transcribes, summarizes, and **goes back to Standby** for the next call.
|
||||
|
||||
### 4. Customizing Prompts
|
||||
### 4. Optimal Setup (MS Teams/Zoom)
|
||||
For the best experience without changing any software settings:
|
||||
* **Hearbit App**: Select your **real microphone** (e.g., "MacBook Mic" or Headset).
|
||||
* **Teams/Zoom**: Use your standard output (Speakers/Headset).
|
||||
* *How it works*: Hearbit captures your voice via mic and the other side via macOS System Audio Capture automatically.
|
||||
|
||||
*Note: If you choose "Hearbit Audio" (Aggregate Device) in the app, you MUST set your Teams' speaker output to "Hearbit Audio" as well.*
|
||||
|
||||
### 5. Customizing Prompts
|
||||
You can create custom AI templates in Settings -> Prompts. Example:
|
||||
* **"Sales Call"**: Focus on budget, timeline, and decision makers.
|
||||
* **"Daily Standup"**: Extract blockers and next steps.
|
||||
@@ -63,10 +74,10 @@ If macOS blocks the app because it's not notarized:
|
||||
3. Enter your password and try again.
|
||||
|
||||
### Audio cuts off at the start?
|
||||
v1.2.0 includes a **3-second buffer**. If this persists, ensure your "Auto-start" threshold isn't too high (though it's currently auto-calibrated).
|
||||
v1.2.0 includes a **3-second buffer**. The Meeting mode now uses a more sensitive trigger (0.005 energy) to catch even quiet participants.
|
||||
|
||||
### "Batch processing failed"
|
||||
This means the audio was empty or too short. The new **Empty Guard** prevents this in most cases. If it happens, check your microphone selection.
|
||||
This means the audio was empty or too short. Check the **Logs** tab for detailed error messages from Infomaniak. The most common cause is selecting the wrong input device or a lack of Screen Recording permissions.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tauri::{AppHandle, Emitter};
|
||||
use crate::emit_log;
|
||||
use cpal::Sample;
|
||||
use hound::WavWriter;
|
||||
use rubato::{Resampler, FastFixedIn, PolynomialDegree};
|
||||
@@ -42,6 +43,9 @@ pub struct AudioProcessor {
|
||||
|
||||
// System Audio Queue for Mixing
|
||||
pub system_queue: Arc<Mutex<std::collections::VecDeque<f32>>>,
|
||||
|
||||
// Recording Mode (voice or meeting)
|
||||
recording_mode: String,
|
||||
}
|
||||
|
||||
impl AudioProcessor {
|
||||
@@ -50,7 +54,8 @@ impl AudioProcessor {
|
||||
channel_count: u16,
|
||||
writer: Arc<Mutex<WavWriter<std::io::BufWriter<std::fs::File>>>>,
|
||||
app_handle: AppHandle,
|
||||
wait_for_speech: bool
|
||||
wait_for_speech: bool,
|
||||
recording_mode: String,
|
||||
) -> Result<Self, String> {
|
||||
let vad_sample_rate = 16000;
|
||||
let vad_chunk_size = 512;
|
||||
@@ -100,30 +105,51 @@ impl AudioProcessor {
|
||||
app_handle: Some(app_handle),
|
||||
last_event_time: std::time::Instant::now(),
|
||||
system_queue: Arc::new(Mutex::new(std::collections::VecDeque::new())),
|
||||
recording_mode,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn process(&mut self, input_data: &[f32]) {
|
||||
// MIXING LOGIC:
|
||||
// We have `input_data` (Microphone). We check `system_queue` for System Audio.
|
||||
// We mix them: Out = Mic + System.
|
||||
// System Audio is hardcoded to 2 channels (Stereo) in sc_audio.rs.
|
||||
// Microphone `self.channel_count` can be 1 (Mono) or 2 (Stereo).
|
||||
|
||||
let mic_channels = self.channel_count as usize;
|
||||
let mut mixed_data = input_data.to_vec();
|
||||
let mut max_system_energy = 0.0;
|
||||
|
||||
if let Ok(mut queue) = self.system_queue.lock() {
|
||||
for i in 0..mixed_data.len() {
|
||||
if let Some(sys_sample) = queue.pop_front() {
|
||||
// Track system energy for trigger logic
|
||||
let abs_sample = sys_sample.abs();
|
||||
if abs_sample > max_system_energy {
|
||||
max_system_energy = abs_sample;
|
||||
}
|
||||
let gain_mic = 1.0;
|
||||
let gain_sys = 0.8; // Slightly lower system audio so the local microphone stays dominant in the mix
|
||||
|
||||
// Simple addition mixing with clamping to avoid clipping
|
||||
let mixed = mixed_data[i] + sys_sample;
|
||||
mixed_data[i] = mixed.max(-1.0).min(1.0);
|
||||
}
|
||||
}
|
||||
if let Ok(mut queue) = self.system_queue.lock() {
|
||||
let frames = mixed_data.len() / mic_channels;
|
||||
|
||||
for f in 0..frames {
|
||||
// system_queue is always stereo (L, R, L, R...)
|
||||
if let (Some(l), Some(r)) = (queue.pop_front(), queue.pop_front()) {
|
||||
let abs_l = l.abs();
|
||||
let abs_r = r.abs();
|
||||
let current_sys_max = if abs_l > abs_r { abs_l } else { abs_r };
|
||||
if current_sys_max > max_system_energy {
|
||||
max_system_energy = current_sys_max;
|
||||
}
|
||||
|
||||
if mic_channels == 1 {
|
||||
// Mic is Mono: Mix System L+R down to Mono
|
||||
let sys_mono = (l + r) / 2.0;
|
||||
let mixed = (mixed_data[f] * gain_mic) + (sys_mono * gain_sys);
|
||||
mixed_data[f] = mixed.max(-1.0).min(1.0);
|
||||
} else {
|
||||
// Mic is Stereo: Mix L-to-L and R-to-R
|
||||
let f_start = f * 2;
|
||||
let mixed_l = (mixed_data[f_start] * gain_mic) + (l * gain_sys);
|
||||
let mixed_r = (mixed_data[f_start + 1] * gain_mic) + (r * gain_sys);
|
||||
mixed_data[f_start] = mixed_l.max(-1.0).min(1.0);
|
||||
mixed_data[f_start + 1] = mixed_r.max(-1.0).min(1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data = &mixed_data;
|
||||
@@ -181,11 +207,7 @@ impl AudioProcessor {
|
||||
// Run Detection
|
||||
let probability = self.vad.predict(vad_chunk.clone());
|
||||
|
||||
// Hybrid VAD: Probability > 0.9 OR System Audio Active
|
||||
// We want to keep recording if there is meaningful audio from the system (Call in progress),
|
||||
// even if the VAD doesn't strictly classify it as 'speech' (e.g. ringing, laughter, noise).
|
||||
|
||||
let system_is_active = max_system_energy > 0.01; // Same threshold as trigger
|
||||
let system_is_active = max_system_energy > 0.005; // Lowered to match trigger
|
||||
let is_speech = probability > 0.9;
|
||||
|
||||
if is_speech || system_is_active {
|
||||
@@ -219,23 +241,23 @@ impl AudioProcessor {
|
||||
// 2. AND System Audio has energy (Meaning audio is coming from the PC, i.e., Call started)
|
||||
// Threshold 0.005 is roughly -46dB, should cover ringtones/speech easily but ignore silence/hiss.
|
||||
|
||||
let system_active = max_system_energy > 0.01;
|
||||
let system_active = max_system_energy > 0.005;
|
||||
|
||||
// Special Case: If System Audio acts like a Ringtone (Constant high energy but maybe not VAD speech?)
|
||||
// We trust VAD for speech. But we also trust "Loud System Sound" = Call.
|
||||
// If system is consistently loud, it's likely a call.
|
||||
// Periodically log energy to help debug why meeting mode might not start
|
||||
if self.last_event_time.elapsed().as_millis() > 2000 && self.recording_mode == "meeting" {
|
||||
if let Some(app) = &self.app_handle {
|
||||
emit_log(app, "DEBUG", &format!("Waiting for Meeting... Current System Energy: {:.4} (Threshold: 0.005)", max_system_energy));
|
||||
}
|
||||
}
|
||||
|
||||
// For now, Strict Mode:
|
||||
// Trigger if: (Speech Detected) AND (System Audio Present)
|
||||
// This prevents "User talking alone" -> No trigger (System silent).
|
||||
// This allows "Partner talking" -> Trigger (Speech + System).
|
||||
|
||||
// What about Ringtone? Ringtone has energy but maybe no speech.
|
||||
// If we want to record the ringtone, we should trigger on `system_active` alone?
|
||||
// "erst wen der call startet" -> usually ringing.
|
||||
// Let's be generous: If System Audio is loud (> 0.05), we trigger regardless of VAD.
|
||||
|
||||
let trigger = (self.is_speech_active && system_active) || (max_system_energy > 0.05);
|
||||
// MODE-SPECIFIC TRIGGER LOGIC:
|
||||
// "voice" -> Trigger if user speaks (is_speech_active)
|
||||
// "meeting" -> Trigger ONLY if system audio energy detected (Call starting)
|
||||
let trigger = if self.recording_mode == "voice" {
|
||||
self.is_speech_active
|
||||
} else {
|
||||
system_active
|
||||
};
|
||||
|
||||
if trigger {
|
||||
// Trigger Detected!
|
||||
@@ -271,7 +293,13 @@ impl AudioProcessor {
|
||||
// Standard Recording Logic (Active or Hangover)
|
||||
let time_since_speech = self.total_processed_samples.saturating_sub(self.last_speech_time);
|
||||
|
||||
if self.is_speech_active || time_since_speech < self.hangover_samples {
|
||||
// We write to file if:
|
||||
// 1. VAD thinks someone is speaking (Mic or System)
|
||||
// 2. OR System audio energy is currently above threshold (Ensures calls are captured)
|
||||
// 3. OR we are within the hangover period
|
||||
let system_is_active = max_system_energy > 0.005;
|
||||
|
||||
if self.is_speech_active || system_is_active || time_since_speech < self.hangover_samples {
|
||||
let mut guard = self.writer.lock().unwrap();
|
||||
for &sample in data {
|
||||
let amplitude = i16::MAX as f32;
|
||||
|
||||
@@ -37,7 +37,7 @@ struct LogEvent {
|
||||
timestamp: String,
|
||||
}
|
||||
|
||||
fn emit_log(app: &AppHandle, level: &str, message: &str) {
|
||||
pub(crate) fn emit_log(app: &AppHandle, level: &str, message: &str) {
|
||||
let log = LogEvent {
|
||||
level: level.to_string(),
|
||||
message: message.to_string(),
|
||||
@@ -73,8 +73,8 @@ fn get_input_devices() -> Result<Vec<AudioDevice>, String> {
|
||||
|
||||
|
||||
#[tauri::command]
|
||||
async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>) -> Result<(), String> {
|
||||
emit_log(&app, "INFO", &format!("Starting recording on device: {}", device_id));
|
||||
async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id: String, save_path: Option<String>, custom_filename: Option<String>, wait_for_speech: Option<bool>, mode: String) -> Result<(), String> {
|
||||
emit_log(&app, "INFO", &format!("Starting recording [Mode: {}] on device: {}", mode, device_id));
|
||||
let host = cpal::default_host();
|
||||
|
||||
// Find device by name (using name as ID)
|
||||
@@ -143,10 +143,10 @@ async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id:
|
||||
// We pass the writer to it.
|
||||
let should_wait = wait_for_speech.unwrap_or(false);
|
||||
if should_wait {
|
||||
emit_log(&app, "INFO", "Recording started in WAITING mode (buffer-only until speech).");
|
||||
emit_log(&app, "INFO", &format!("Recording started in WAITING mode (Trigger: {}).", if mode == "voice" { "Speech" } else { "System Audio" }));
|
||||
}
|
||||
|
||||
let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait)
|
||||
let processor = AudioProcessor::new(config.sample_rate(), config.channels(), writer.clone(), app.clone(), should_wait, mode)
|
||||
.map_err(|e| format!("Failed to create AudioProcessor: {}", e))?;
|
||||
|
||||
// Wrap processor in Arc<Mutex> so we can share/move it into callback
|
||||
@@ -158,61 +158,40 @@ async fn start_recording(app: AppHandle, state: State<'_, AppState>, device_id:
|
||||
let processor_clone = processor.clone();
|
||||
|
||||
// --- SYSTEM AUDIO CAPTURE START ---
|
||||
let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
|
||||
// Prevent Doubling: If user selected an aggregate device (Hearbit Audio/BlackHole),
|
||||
// it ALREADY contains system audio. In that case, we don't need internal SCK capture.
|
||||
let is_aggregate = device_id.contains("Hearbit") || device_id.contains("BlackHole");
|
||||
|
||||
// Get the queue to share with the capture callback
|
||||
let queue_clone = {
|
||||
let p = processor.lock().unwrap();
|
||||
p.system_queue.clone() // Access the pub field we added
|
||||
};
|
||||
if is_aggregate {
|
||||
emit_log(&app, "INFO", "Aggregate device detected. Disabling internal System Audio Capture to prevent doubling.");
|
||||
} else {
|
||||
let mut sys_capture = sc_audio::SystemAudioCapture::new(config.sample_rate());
|
||||
|
||||
let sys_handle = app.clone();
|
||||
let sys_callback = move |data: &[f32]| {
|
||||
// Push to queue
|
||||
if let Ok(mut q) = queue_clone.lock() {
|
||||
q.extend(data.iter());
|
||||
// Get the queue to share with the capture callback
|
||||
let queue_clone = {
|
||||
let p = processor.lock().unwrap();
|
||||
p.system_queue.clone() // Access the pub field we added
|
||||
};
|
||||
|
||||
// Limit queue size to avoid memory leaks if main process loop is slow
|
||||
while q.len() > 48000 * 5 { // 5 seconds buffer
|
||||
q.pop_front();
|
||||
let sys_callback = move |data: &[f32]| {
|
||||
// Push to queue
|
||||
if let Ok(mut q) = queue_clone.lock() {
|
||||
q.extend(data.iter());
|
||||
|
||||
// Limit queue size to avoid memory leaks if main process loop is slow
|
||||
while q.len() > 48000 * 5 { // 5 seconds buffer
|
||||
q.pop_front();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match sys_capture.start(sys_callback).await {
|
||||
Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
|
||||
Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
|
||||
}
|
||||
};
|
||||
|
||||
// Need to run async start in sync command?
|
||||
// Tauri commands are async if they return Future, but here we returned Result.
|
||||
// We should probably spawn it.
|
||||
// Actually, SystemAudioCapture::start is async.
|
||||
// We can spawn a tokio task to start it. But we need to keep the object alive.
|
||||
// The start method modifies self.stream.
|
||||
// If we make start synchronous or use block_in_place?
|
||||
// Better: change start_recording to async fn (it is not currently async in signature used by tauri::command macros? No, tauri supports async commands).
|
||||
// Let's check line 76: `fn start_recording`... it is NOT async.
|
||||
// We should make it `async fn start_recording`.
|
||||
|
||||
// However, changing to async might affect how state is passed or other things.
|
||||
// Actually Tauri works fine with async commands.
|
||||
// But then we need to await `sys_capture.start`.
|
||||
|
||||
// Wait, let's look at `SystemAudioCapture::start`. It takes `&mut self`.
|
||||
// We can't easily spawn it away properly if we want to keep `sys_capture` in State.
|
||||
// The `sys_capture` struct holds the `SCStream` which must be kept alive.
|
||||
|
||||
// Let's assume we can make `start_recording` into `async fn`.
|
||||
|
||||
// TEMPORARY: Just putting placeholder for logic flow.
|
||||
// We will need to change the function signature of start_recording to async first in a separate step or assume I can do it here if I replace the whole signature.
|
||||
// The replace_file_content replaces a block.
|
||||
// I will replace line 76 in a separate call to make it async.
|
||||
|
||||
// For this block, I will assume it's async context.
|
||||
|
||||
match sys_capture.start(sys_callback) {
|
||||
Ok(_) => emit_log(&app, "INFO", "System Audio Capture started."),
|
||||
Err(e) => emit_log(&app, "WARN", &format!("System Audio Capture failed (Permissions?): {}", e)),
|
||||
*state.system_capture.lock().unwrap() = Some(sys_capture);
|
||||
}
|
||||
|
||||
*state.system_capture.lock().unwrap() = Some(sys_capture);
|
||||
// --- SYSTEM AUDIO CAPTURE END ---
|
||||
|
||||
let app_handle = app.clone();
|
||||
@@ -585,8 +564,9 @@ async fn poll_transcription(app: &AppHandle, client: &reqwest::Client, api_key:
|
||||
return Err(format!("Download failed: {}", dl_res.status()));
|
||||
}
|
||||
} else if status == "failed" || status == "error" {
|
||||
emit_log(app, "ERROR", &format!("Batch processing failed: {:?}", json));
|
||||
return Err(format!("Batch processing failed: {:?}", json));
|
||||
let err_msg = format!("Batch processing failed [Status: {}]. Full Response: {:?}", status, json);
|
||||
emit_log(app, "ERROR", &err_msg);
|
||||
return Err(err_msg);
|
||||
}
|
||||
// If 'processing' or 'pending', continue loop
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ impl SystemAudioCapture {
|
||||
Self { stream: None, sample_rate }
|
||||
}
|
||||
|
||||
pub fn start<F>(&mut self, callback: F) -> Result<(), String>
|
||||
pub async fn start<F>(&mut self, callback: F) -> Result<(), String>
|
||||
where F: Fn(&[f32]) + Send + Sync + 'static {
|
||||
|
||||
let content = UnsafeSCShareableContent::get().map_err(|e| format!("Failed to get content"))?;
|
||||
|
||||
@@ -123,15 +123,18 @@ const Recorder: React.FC<RecorderProps> = ({
|
||||
const aggregateDev = aliasedDevs.find(d => d.name === 'Hearbit Audio');
|
||||
const virtualDev = aliasedDevs.find(d => d.name.includes('Hearbit Virtual'));
|
||||
|
||||
if (aggregateDev) {
|
||||
setRecordingMode('meeting');
|
||||
setSelectedDevice(aggregateDev.id);
|
||||
} else if (virtualDev) {
|
||||
setRecordingMode('meeting');
|
||||
setSelectedDevice(virtualDev.id);
|
||||
} else {
|
||||
setRecordingMode('voice');
|
||||
if (aliasedDevs.length > 0) setSelectedDevice(aliasedDevs[0].id);
|
||||
if (recordingMode === 'meeting') {
|
||||
if (aggregateDev) {
|
||||
setSelectedDevice(aggregateDev.id);
|
||||
} else if (virtualDev) {
|
||||
setSelectedDevice(virtualDev.id);
|
||||
} else if (aliasedDevs.length > 0) {
|
||||
setSelectedDevice(aliasedDevs[0].id);
|
||||
}
|
||||
} else if (aliasedDevs.length > 0) {
|
||||
// Voice mode: just pick first non-virtual if possible, otherwise first
|
||||
const physicalMic = aliasedDevs.find(d => !d.name.includes('Hearbit') && !d.name.includes('BlackHole'));
|
||||
setSelectedDevice(physicalMic ? physicalMic.id : aliasedDevs[0].id);
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
@@ -160,7 +163,8 @@ const Recorder: React.FC<RecorderProps> = ({
|
||||
deviceId: targetDeviceId,
|
||||
savePath: savePath || null,
|
||||
customFilename: props.recordingSubject || null,
|
||||
waitForSpeech: autoStartEnabled // Pass the toggle state
|
||||
waitForSpeech: autoStartEnabled, // Pass the toggle state
|
||||
mode: recordingMode
|
||||
});
|
||||
|
||||
setIsRecording(true);
|
||||
@@ -268,7 +272,7 @@ const Recorder: React.FC<RecorderProps> = ({
|
||||
|
||||
// AUTO STOP Logic
|
||||
// Use Ref to get LATEST visibility instantly
|
||||
if (isVisibleRef.current && timeSinceSpeech > 20 && !isStoppingRef.current) {
|
||||
if (isVisibleRef.current && timeSinceSpeech > 25 && !isStoppingRef.current) {
|
||||
console.log("Auto-stopping due to silence...");
|
||||
isStoppingRef.current = true;
|
||||
addToast('Auto-stopped due to silence', 'info');
|
||||
|
||||
Reference in New Issue
Block a user