Skip to content

Commit

Permalink
🐛 remove rubberband-cli dependencies lenML#68
Browse files Browse the repository at this point in the history
- 不再依赖 rubberband-cli,兼容 Windows 系统
  • Loading branch information
zhzLuke96 committed Jun 24, 2024
1 parent ff9c7c0 commit 1cd34c3
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 69 deletions.
2 changes: 0 additions & 2 deletions docs/dependencies.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ git clone https://github.com/lenML/ChatTTS-Forge.git --depth=1
音频后处理操作(如加速、减速、提高音量等)依赖以下库:
- **ffmpeg** 或 **libav**(推荐使用 ffmpeg)
- **rubberband-cli**(仅 Linux 环境需要)
### 安装 ffmpeg
Expand All @@ -96,7 +95,6 @@ brew install ffmpeg

```bash
apt-get install ffmpeg libavcodec-extra
apt-get install rubberband-cli
```

**Windows**:
Expand Down
21 changes: 4 additions & 17 deletions modules/SynthesizeSegments.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from modules.speaker import Speaker
from modules.ssml_parser.SSMLParser import SSMLBreak, SSMLContext, SSMLSegment
from modules.utils import rng
from modules.utils.audio import pitch_shift, time_stretch
from modules.utils.audio import apply_prosody_to_audio_segment

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -67,21 +67,6 @@ def combine_audio_segments(audio_segments: list[AudioSegment]) -> AudioSegment:
return combined_audio


def apply_prosody(
    audio_segment: AudioSegment, rate: float, volume: float, pitch: float
) -> AudioSegment:
    """Apply rate, volume and pitch adjustments to a segment, in that order.

    Each adjustment is skipped when its parameter holds the neutral value:
    ``rate == 1``, ``volume == 0`` and ``pitch == 0`` respectively.

    Args:
        audio_segment: Segment to process.
        rate: Factor forwarded to ``time_stretch``.
        volume: Amount added to the segment with ``+=``.
        pitch: Amount forwarded to ``pitch_shift``.

    Returns:
        The processed segment (the input itself when nothing is applied).
    """
    processed = audio_segment

    needs_stretch = rate != 1
    if needs_stretch:
        processed = time_stretch(processed, rate)

    needs_gain = volume != 0
    if needs_gain:
        processed += volume

    needs_shift = pitch != 0
    if needs_shift:
        processed = pitch_shift(processed, pitch)

    return processed


def to_number(value, t, default=0):
try:
number = t(value)
Expand Down Expand Up @@ -228,7 +213,9 @@ def append_eos(text: str):
pitch = float(segment.get("pitch", "0"))

audio_segment = audio_data_to_segment(audio_data, sr)
audio_segment = apply_prosody(audio_segment, rate, volume, pitch)
audio_segment = apply_prosody_to_audio_segment(
audio_segment, rate=rate, volume=volume, pitch=pitch
)
# compare by Box object
original_index = src_segments.index(segment)
audio_segments[original_index] = audio_segment
Expand Down
3 changes: 3 additions & 0 deletions modules/devices/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def reset_device():
global dtype_gpt
global dtype_decoder

if config.runtime_env_vars.use_cpu is None:
config.runtime_env_vars.use_cpu = []

if "all" in config.runtime_env_vars.use_cpu and not config.runtime_env_vars.no_half:
logger.warning(
"Cannot use half precision with CPU, using full precision instead"
Expand Down
72 changes: 31 additions & 41 deletions modules/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from io import BytesIO

import numpy as np
import pyrubberband as pyrb
import soundfile as sf
from pydub import AudioSegment, effects

Expand Down Expand Up @@ -50,7 +49,7 @@ def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
)


def ndarray_to_segment(ndarray, frame_rate):
def ndarray_to_segment(ndarray: np.ndarray, frame_rate: int) -> AudioSegment:
buffer = BytesIO()
sf.write(buffer, ndarray, frame_rate, format="wav")
buffer.seek(0)
Expand All @@ -60,58 +59,49 @@ def ndarray_to_segment(ndarray, frame_rate):
return sound


def time_stretch(input_segment: AudioSegment, time_factor: float) -> AudioSegment:
    """Time-stretch a segment through pyrubberband.

    Args:
        input_segment: Segment to stretch; its frame rate is reused for the
            returned segment.
        time_factor: Stretch factor, clamped to the range [0.2, 10] before
            it is handed to ``pyrb.time_stretch``.

    Returns:
        A new segment built from the stretched samples.
    """
    clamped_factor = np.clip(time_factor, 0.2, 10)
    sample_rate = input_segment.frame_rate
    samples = audiosegment_to_librosawav(input_segment)
    stretched = pyrb.time_stretch(samples, sample_rate, clamped_factor)
    return ndarray_to_segment(stretched, frame_rate=sample_rate)
def apply_prosody_to_audio_segment(
audio_segment: AudioSegment,
rate: float = 1,
volume: float = 0,
pitch: int = 0,
sr: int = 24000,
) -> AudioSegment:
# Adjust rate (speed)
if rate != 1:
audio_segment = effects.speedup(audio_segment, playback_speed=rate)

# Adjust volume
if volume != 0:
audio_segment = audio_segment + volume

def pitch_shift(
    input_segment: AudioSegment,
    pitch_shift_factor: float,
) -> AudioSegment:
    """Pitch-shift a segment through pyrubberband.

    Args:
        input_segment: Segment to shift; its frame rate is reused for the
            returned segment.
        pitch_shift_factor: Shift amount, clamped to the range [-12, 12]
            before it is handed to ``pyrb.pitch_shift``.

    Returns:
        A new segment built from the shifted samples.
    """
    clamped_shift = np.clip(pitch_shift_factor, -12, 12)
    sample_rate = input_segment.frame_rate
    samples = audiosegment_to_librosawav(input_segment)
    shifted = pyrb.pitch_shift(samples, sample_rate, clamped_shift)
    return ndarray_to_segment(shifted, frame_rate=sample_rate)
# Adjust pitch
if pitch != 0:
audio_segment = audio_segment._spawn(
audio_segment.raw_data,
overrides={
"frame_rate": int(audio_segment.frame_rate * (2.0 ** (pitch / 12.0)))
},
).set_frame_rate(sr)

return audio_segment


def apply_prosody_to_audio_data(
audio_data: np.ndarray,
rate: float = 1,
volume: float = 0,
pitch: float = 0,
pitch: int = 0,
sr: int = 24000,
) -> np.ndarray:
if rate != 1:
audio_data = pyrb.time_stretch(audio_data, sr=sr, rate=rate)
audio_segment = ndarray_to_segment(audio_data, sr)

if volume != 0:
audio_data = audio_data * volume
audio_segment = apply_prosody_to_audio_segment(
audio_segment, rate=rate, volume=volume, pitch=pitch, sr=sr
)

if pitch != 0:
audio_data = pyrb.pitch_shift(audio_data, sr=sr, n_steps=pitch)
processed_audio_data = np.array(audio_segment.get_array_of_samples())

return audio_data
return processed_audio_data


def apply_normalize(
Expand Down
4 changes: 1 addition & 3 deletions requirements.dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ Pygments==2.18.0
pynvml==11.5.0
pyparsing==3.1.2
pypinyin==0.51.0
pyrubberband==0.3.0
PySoundFile==0.9.0.post1
pytest==8.2.2
pytest-cov==5.0.0
Expand Down Expand Up @@ -158,5 +157,4 @@ watchfiles==0.22.0
websockets==11.0.3
Werkzeug==3.0.3
zhon==2.0.2
ftfy==6.2.0
langdetect==1.0.9
ftfy==6.2.0
4 changes: 1 addition & 3 deletions requirements.docker.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ lxml
pydub
fastapi
soundfile
pyrubberband
omegaconf
pypinyin
pandas
Expand All @@ -26,5 +25,4 @@ mistune==3.0.2
cn2an
# audio_denoiser
python-box
ftfy
langdetect
ftfy
4 changes: 1 addition & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ lxml
pydub
fastapi
soundfile
pyrubberband
omegaconf
pypinyin
vocos
Expand All @@ -25,5 +24,4 @@ mistune==3.0.2
cn2an
# audio_denoiser
python-box
ftfy
langdetect
ftfy

0 comments on commit 1cd34c3

Please sign in to comment.