【Run glm4-voice-9b locally and test it with gradio + notebook】
## Install the environment
git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
cd GLM-4-Voice
pip install -r requirements.txt
pip install accelerate
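A quick sanity check before continuing, assuming a CUDA machine like the autodl instance used here:

python -c "import torch; print(torch.__version__, torch.cuda.is_available())"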
## Download the model weights
modelscope download --model ZhipuAI/glm-4-voice-9b --local_dir /root/autodl-tmp/models/glm-4-voice-9b
modelscope download --model ZhipuAI/glm-4-voice-tokenizer --local_dir /root/autodl-tmp/models/glm-4-voice-tokenizer
modelscope download --model ZhipuAI/glm-4-voice-decoder --local_dir /root/autodl-tmp/models/glm-4-voice-decoder
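If you would rather script the downloads, the modelscope Python API exposes snapshot_download; a minimal sketch, assuming a recent modelscope release that supports the local_dir argument:

from modelscope import snapshot_download

# Mirror the three CLI downloads above into the same directories
for repo in ("glm-4-voice-9b", "glm-4-voice-tokenizer", "glm-4-voice-decoder"):
    snapshot_download(f"ZhipuAI/{repo}", local_dir=f"/root/autodl-tmp/models/{repo}")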
## Test
Start the model server:
python model_server.py --host 0.0.0.0 --model-path /root/autodl-tmp/models/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
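Once the server is up, you can smoke-test the streaming endpoint with the same payload shape the notebook code below sends; the prompt string here is just a throwaway example:

import json, requests

resp = requests.post(
    "http://localhost:10000/generate_stream",
    data=json.dumps({
        "prompt": "<|system|>\nhi<|user|>\nhello<|assistant|>streaming_transcription\n",
        "temperature": 0.3,
        "top_p": 0.9,
        "max_new_tokens": 8,
    }),
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(json.loads(line)["token_id"])  # one JSON line per generated token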
Run from a notebook
Create a new demo.ipynb in the cloned GLM-4-Voice folder and paste the code below into it.
import sys
sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
import os, random, string, json, uuid
import requests
import torch, torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
from speech_tokenizer.utils import extract_speech_token
from flow_inference import AudioDecoder
flow_path = "/root/autodl-tmp/models/glm-4-voice-decoder"
model_path = "/root/autodl-tmp/models/glm-4-voice-9b"
tokenizer_path = "/root/autodl-tmp/models/glm-4-voice-tokenizer"
flow_config = os.path.join(flow_path, "config.yaml")
flow_checkpoint = os.path.join(flow_path, "flow.pt")
hift_checkpoint = os.path.join(flow_path, "hift.pt")
device = "cuda"
# Speech tokenizer
whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval().to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)
# GLM
glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Flow & Hift
audio_decoder = AudioDecoder(
    config_path=flow_config,
    flow_ckpt_path=flow_checkpoint,
    hift_ckpt_path=hift_checkpoint,
    device=device,
)
def inference_fn(
    temperature: float = 0.3,
    top_p: float = 0.9,
    max_new_token: int = 2000,
    input_mode="text",
    audio_path: str | None = None,
    input_text: str | None = None,
    history: list[dict] | None = None,  # None instead of a mutable default list
    history_tokens: str = "",
    save_dir: str = "audio-tmp",
):
    history = [] if history is None else history
    os.makedirs(save_dir, exist_ok=True)
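    # NOTE: the system prompts below are kept verbatim from the official
    # GLM-4-Voice demo (grammar included); changing the wording may disturb
    # the interleaved 13-text / 26-audio token output pattern.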
    system_prompt = (
        "User will provide you with a text/speech instruction. Do it step by step. First, "
        "think about the instruction and respond in a interleaved manner, "
        "with 13 text token followed by 26 audio tokens."
    )
    if input_mode == "audio":
        assert audio_path is not None
        history.append({"role": "user", "content": {"path": audio_path}})
        audio_tokens = extract_speech_token(
            whisper_model, feature_extractor, [audio_path]
        )[0]
        if len(audio_tokens) == 0:
            raise ValueError("No audio tokens extracted")  # raising a bare string is a TypeError
        audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
        audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
        user_input = audio_tokens
        system_prompt = (
            "User will provide you with a speech instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    else:
        assert input_text is not None
        history.append({"role": "user", "content": input_text})
        user_input = input_text
        system_prompt = (
            "User will provide you with a text instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    # Gather history
    inputs = history_tokens
    if "<|system|>" not in inputs:
        inputs += f"<|system|>\n{system_prompt}"
    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"
    with torch.no_grad():
        response = requests.post(
            "http://localhost:10000/generate_stream",
            data=json.dumps(
                {
                    "prompt": inputs,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_new_tokens": max_new_token,
                }
            ),
            stream=True,
        )
        text_tokens, audio_tokens = [], []
        audio_offset = glm_tokenizer.convert_tokens_to_ids("<|audio_0|>")
        end_token_id = glm_tokenizer.convert_tokens_to_ids("<|user|>")
        complete_tokens = []
        prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
        flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
        this_uuid = str(uuid.uuid4())
        tts_speechs = []
        tts_mels = []
        prev_mel = None
        is_finalize = False
        block_size = 10
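        # Streaming decode: vocode the first block of 10 audio tokens as soon
        # as it arrives, then switch to blocks of 20; previously generated
        # tokens and mels are fed back as the prompt so the waveform stays
        # continuous across blocks.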
        for chunk in response.iter_lines():
            if not chunk:  # skip keep-alive blank lines in the stream
                continue
            token_id = json.loads(chunk)["token_id"]
            if token_id == end_token_id:
                is_finalize = True
            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
                block_size = 20
                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)
                if prev_mel is not None:
                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)
                tts_speech, tts_mel = audio_decoder.token2wav(
                    tts_token,
                    uuid=this_uuid,
                    prompt_token=flow_prompt_speech_token.to(device),
                    prompt_feat=prompt_speech_feat.to(device),
                    finalize=is_finalize,
                )
                prev_mel = tts_mel
                tts_speechs.append(tts_speech.squeeze())
                tts_mels.append(tts_mel)
                # yield history, inputs, "", "", (
                #     22050,
                #     tts_speech.squeeze().cpu().numpy(),
                # ), None
                flow_prompt_speech_token = torch.cat(
                    (flow_prompt_speech_token, tts_token), dim=-1
                )
                audio_tokens = []
            if not is_finalize:
                complete_tokens.append(token_id)
                if token_id >= audio_offset:
                    audio_tokens.append(token_id - audio_offset)
                else:
                    text_tokens.append(token_id)
    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
    name = os.path.join(save_dir, "".join(random.sample(string.ascii_letters, k=9)) + ".wav")
    torchaudio.save(name, tts_speech.unsqueeze(0), 22050, format="wav")
    history.append(
        {
            "role": "assistant",
            "content": glm_tokenizer.decode(text_tokens, skip_special_tokens=False),  # correct kwarg name
            "audio_path": name,
            "type": "audio/wav",
        }
    )
    complete_text = glm_tokenizer.decode(
        complete_tokens, spaces_between_special_tokens=False
    )
    inputs += complete_text
    return history, inputs.strip()
# Initialization
from IPython.display import display, Audio

history, history_tokens = [], ""
# Change input_text and re-run to keep the conversation going
history, history_tokens = inference_fn(
    input_text='用可爱的语气说:我是一头可爱的毛驴',  # "Say in a cute voice: I am a cute little donkey"
    history=history,
    history_tokens=history_tokens,
)
print(f'Assistant: {history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
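Because inference_fn returns the updated history and history_tokens, multi-turn dialogue is just repeated calls, and input_mode="audio" drives a turn from a recorded file instead of text. A minimal sketch (the follow-up prompt and question.wav are hypothetical examples, not files the repo ships):

# Second text turn in the same conversation
history, history_tokens = inference_fn(
    input_text='再用悲伤的语气说一遍',  # hypothetical follow-up: "say it again, sadly"
    history=history,
    history_tokens=history_tokens,
)
display(Audio(history[-1]['audio_path'], autoplay=True))

# Speech-in, speech-out turn; question.wav is a hypothetical local recording
history, history_tokens = inference_fn(
    input_mode="audio",
    audio_path="question.wav",
    history=history,
    history_tokens=history_tokens,
)
display(Audio(history[-1]['audio_path'], autoplay=True))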
One-click test image on the CodeWithGPU community: https://www.codewithgpu.com/i/THUDM/GLM-4-Voice/glm4-voice-9b