Skip to content

Commit 0a4dc1f

Browse files
committed
chore(demo): Voice-to-text-to-voice pipeline
1 parent df14367 commit 0a4dc1f

11 files changed

Lines changed: 2008 additions & 0 deletions
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Voice-to-Text-to-Voice Pipeline
2+
3+
This project implements a complete voice transformation pipeline that:
4+
5+
1. **Voice β†’ Text**: Uses OpenAI's Whisper model (via Transformers) to transcribe speech to text
6+
2. **Text β†’ Voice**: Uses ChatterboxTTS to generate speech with voice cloning based on an audio prompt
7+
8+
9+
This can be done from a file, or by using system audio devices such as the microphone and speakers.
10+
11+
## Usage
12+
13+
Install the dependencies.
14+
```sh
15+
pip install -r requirements.txt
16+
```
17+
18+
Make sure to uninstall and reinstall the version of PyTorch that matches your CUDA version.
19+
20+
Once ready, start the voice_pipeline.
21+
22+
```sh
23+
python voice_pipeline.py
24+
```
25+
26+
### Platform-specific Setup:
27+
28+
**Windows:**
29+
1. Install [VB-Cable](https://vb-audio.com/Cable/) (free)
30+
2. Set output device to "CABLE Input"
31+
3. In Discord/Zoom, select "CABLE Output" as microphone
32+
33+
**macOS:**
34+
1. Install [BlackHole](https://github.com/ExistentialAudio/BlackHole) (free)
35+
2. Set output device to "BlackHole 2ch"
36+
3. In Discord/Zoom, select "BlackHole 2ch" as microphone
37+
38+
**Linux:**
39+
1. Create virtual device: `pactl load-module module-null-sink sink_name=virtual_mic`
40+
2. Set output device to the virtual sink
41+
3. In Discord/Zoom, select the virtual source as microphone
42+
43+
### Performance Tips
44+
45+
- **Lower latency**: Use shorter chunk durations (1-2 seconds)
46+
- **Better quality**: Use longer chunk durations (3-5 seconds)
47+
- **GPU acceleration**: Ensure CUDA is available for faster processing
48+
- **Audio quality**: Use high-quality voice prompt files
49+
50+
## Files
51+
52+
- [`main.py`](./main.py) - Main pipeline implementation with demos
53+
- [`voice_pipeline.py`](./voice_pipeline.py) - Command-line utility script
54+
- [`male_petergriffin.wav`](./male_petergriffin.wav) - Sample audio prompt for voice cloning
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Debug Audio Test Script
4+
5+
This script tests the TTS audio generation separately from real-time processing
6+
to help diagnose audio output issues.
7+
"""
8+
9+
import torch
10+
import numpy as np
11+
import soundfile as sf
12+
from pathlib import Path
13+
from chatterbox.tts import ChatterboxTTS
14+
import logging
15+
16+
# Module-wide logging: INFO level so the debug trace is visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
19+
def test_tts_generation():
    """Exercise ChatterboxTTS generation in isolation and log audio statistics.

    Saves the raw synthesized waveform (and a 16 kHz resampled copy) to disk
    so output problems can be diagnosed outside the real-time pipeline.
    """
    # Test configuration
    prompt_path = "male_petergriffin.wav"  # Update this path as needed
    sample_text = "Hello, this is a test of the text-to-speech system."

    # Guard clause: nothing to do without a voice prompt to clone.
    if not Path(prompt_path).exists():
        logger.error(f"Audio prompt file not found: {prompt_path}")
        return

    try:
        # Load the TTS model, preferring the GPU when one is available.
        logger.info("Loading ChatterboxTTS model...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = ChatterboxTTS.from_pretrained(device=device)
        logger.info(f"TTS model loaded on {device}")

        # Synthesize speech for the sample sentence.
        logger.info(f"Generating audio for text: '{sample_text}'")
        audio = tts.generate(sample_text, audio_prompt_path=prompt_path)

        # Normalize the output to a flat 1-D numpy array.
        if isinstance(audio, torch.Tensor):
            audio = audio.squeeze().cpu().numpy()
        if audio.ndim > 1:
            audio = audio.flatten()

        # Persist the raw output and log its basic statistics.
        raw_path = "debug_tts_original.wav"
        sf.write(raw_path, audio, tts.sr)
        logger.info(f"Original TTS audio saved to: {raw_path}")
        logger.info(f"Original audio: length={len(audio)} samples, sr={tts.sr}Hz, duration={len(audio)/tts.sr:.2f}s")
        logger.info(f"Original audio: min={audio.min():.4f}, max={audio.max():.4f}, mean={audio.mean():.4f}")

        # Also test resampling to 16 kHz (the Whisper input rate).
        target_sr = 16000
        if tts.sr != target_sr:
            import torchaudio as ta

            logger.info(f"Resampling from {tts.sr}Hz to {target_sr}Hz")
            as_tensor = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
            downsampled = ta.transforms.Resample(tts.sr, target_sr)(as_tensor).squeeze().numpy()

            # Persist the resampled copy with matching statistics.
            low_path = "debug_tts_resampled.wav"
            sf.write(low_path, downsampled, target_sr)
            logger.info(f"Resampled audio saved to: {low_path}")
            logger.info(f"Resampled audio: length={len(downsampled)} samples, sr={target_sr}Hz, duration={len(downsampled)/target_sr:.2f}s")
            logger.info(f"Resampled audio: min={downsampled.min():.4f}, max={downsampled.max():.4f}, mean={downsampled.mean():.4f}")

        logger.info("TTS test completed successfully!")
        logger.info("Check the generated audio files to verify they sound correct.")

    except Exception as e:
        # Broad catch is intentional: this is a diagnostic script, so log the
        # full traceback rather than crash.
        logger.error(f"TTS test failed: {e}")
        import traceback

        logger.error(traceback.format_exc())


if __name__ == "__main__":
    test_tts_generation()
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import torchaudio as ta
2+
import torch
3+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
4+
from chatterbox.tts import ChatterboxTTS
5+
6+
def voice_to_text_to_voice_pipeline(input_audio_path, audio_prompt_path, output_path):
    """
    Complete pipeline: voice -> text -> voice (transformed)

    Transcribes ``input_audio_path`` with Whisper ("openai/whisper-base"), then
    re-synthesizes the transcription with ChatterboxTTS, cloning the voice style
    of ``audio_prompt_path``.

    Args:
        input_audio_path: Path to input audio file to transcribe
        audio_prompt_path: Path to audio prompt for voice transformation
        output_path: Path to save the transformed output audio

    Returns:
        Tuple of (transcribed_text, output_path).

    Raises:
        RuntimeError: If CUDA is unavailable (ChatterboxTTS is loaded on GPU here).
    """
    # Step 1: Voice -> Text using Whisper from transformers
    print("Loading Whisper model...")
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

    print(f"Transcribing audio from {input_audio_path}...")
    # Load audio file
    audio_input, sample_rate = ta.load(input_audio_path)

    # Resample to 16kHz if needed (Whisper expects 16kHz)
    if sample_rate != 16000:
        resampler = ta.transforms.Resample(sample_rate, 16000)
        audio_input = resampler(audio_input)

    # Convert to mono if stereo (Whisper expects a single channel)
    if audio_input.shape[0] > 1:
        audio_input = torch.mean(audio_input, dim=0, keepdim=True)

    # Extract log-mel input features for Whisper
    input_features = processor(
        audio_input.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features

    # Inference only -- no gradients needed
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    print(f"Transcribed text: {transcribed_text}")

    # Step 2: Text -> Voice using ChatterboxTTS with audio prompt
    print("Loading ChatterboxTTS model...")
    # Raise instead of assert: asserts are stripped under `python -O`, which
    # would let this fall through to a confusing CUDA error inside the model.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please install CUDA and PyTorch with GPU support.")
    tts_model = ChatterboxTTS.from_pretrained(device="cuda")

    print(f"Generating transformed voice using prompt from {audio_prompt_path}...")
    wav = tts_model.generate(transcribed_text, audio_prompt_path=audio_prompt_path)

    # Step 3: Save the transformed audio
    ta.save(output_path, wav, tts_model.sr)
    print(f"Transformed audio saved to {output_path}")

    return transcribed_text, output_path
57+
58+
# Example usage with your existing setup
59+
if __name__ == "__main__":
60+
# Original text-to-speech example (keeping for reference)
61+
print("=== Original Text-to-Speech Example ===")
62+
text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
63+
model = ChatterboxTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
64+
AUDIO_PROMPT_PATH = "male_petergriffin.wav"
65+
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
66+
ta.save("output_original.wav", wav, model.sr)
67+
print("Original output saved to output_original.wav")
68+
69+
# New voice-to-text-to-voice pipeline with file input
70+
print("\n=== Voice-to-Text-to-Voice Pipeline (File Input) ===")
71+
# You'll need to provide an input audio file to transcribe
72+
# For demo purposes, let's use the audio prompt as input (you can change this)
73+
INPUT_AUDIO_PATH = "male_petergriffin.wav" # Change this to your input audio file
74+
AUDIO_PROMPT_PATH = "male_petergriffin.wav" # This transforms the voice style
75+
OUTPUT_PATH = "output_transformed.wav"
76+
77+
try:
78+
transcribed_text, output_file = voice_to_text_to_voice_pipeline(
79+
INPUT_AUDIO_PATH,
80+
AUDIO_PROMPT_PATH,
81+
OUTPUT_PATH
82+
)
83+
print(f"\nFile pipeline completed successfully!")
84+
print(f"Transcribed: '{transcribed_text}'")
85+
print(f"Transformed audio saved to: {output_file}")
86+
except Exception as e:
87+
print(f"Error in file pipeline: {e}")
88+
print("Make sure you have an input audio file and the required models are available.")
89+
90+
# Live microphone recording demo
91+
print("\n=== Live Microphone Recording Demo ===")
92+
try:
93+
from microphone_recorder import MicrophoneRecorder
94+
95+
response = input("Would you like to try live microphone recording? (y/n): ").lower().strip()
96+
if response in ['y', 'yes']:
97+
recorder = MicrophoneRecorder()
98+
99+
print("\n🎀 Available audio devices:")
100+
recorder.list_audio_devices()
101+
102+
print(f"\nπŸ”΄ Ready to record! Speak into your microphone...")
103+
print("Press ENTER when you're done speaking.")
104+
105+
temp_recording_path = "temp_recording.wav"
106+
success = recorder.record_and_save(temp_recording_path)
107+
108+
if success:
109+
print(f"\n🎯 Processing your recorded audio...")
110+
transcribed_text, output_file = voice_to_text_to_voice_pipeline(
111+
temp_recording_path,
112+
AUDIO_PROMPT_PATH,
113+
"output_live_recording.wav"
114+
)
115+
116+
print(f"\nβœ… Live recording pipeline completed!")
117+
print(f"πŸ“ You said: '{transcribed_text}'")
118+
print(f"πŸ”Š Your voice transformed and saved to: output_live_recording.wav")
119+
120+
# Clean up temporary file
121+
import os
122+
os.remove(temp_recording_path)
123+
print("πŸ—‘οΈ Temporary recording file cleaned up.")
124+
else:
125+
print("❌ Recording failed.")
126+
else:
127+
print("Skipping live recording demo.")
128+
129+
except ImportError:
130+
print("Microphone recording not available. Install sounddevice and soundfile packages.")
131+
except Exception as e:
132+
print(f"Error in live recording demo: {e}")
475 KB
Binary file not shown.

0 commit comments

Comments
Β (0)