【Run glm4-voice-9b locally and test it with gradio + notebook】
## Install the environment
git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
cd GLM-4-Voice
pip install -r requirements.txt
pip install accelerate
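A quick sanity check before continuing, assuming a CUDA machine like the autodl instance used here:

python -c "import torch; print(torch.__version__, torch.cuda.is_available())"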
## Download the model weights
modelscope download --model ZhipuAI/glm-4-voice-9b --local_dir /root/autodl-tmp/models/glm-4-voice-9b
modelscope download --model ZhipuAI/glm-4-voice-tokenizer --local_dir /root/autodl-tmp/models/glm-4-voice-tokenizer
modelscope download --model ZhipuAI/glm-4-voice-decoder --local_dir /root/autodl-tmp/models/glm-4-voice-decoder
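If you would rather script the downloads, the modelscope Python API exposes snapshot_download; a minimal sketch, assuming a recent modelscope release that supports the local_dir argument:

from modelscope import snapshot_download

# Mirror the three CLI downloads above into the same directories
for repo in ("glm-4-voice-9b", "glm-4-voice-tokenizer", "glm-4-voice-decoder"):
    snapshot_download(f"ZhipuAI/{repo}", local_dir=f"/root/autodl-tmp/models/{repo}")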
## Test
Start the model server:
python model_server.py --host 0.0.0.0 --model-path /root/autodl-tmp/models/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
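Once the server is up, you can smoke-test the streaming endpoint with the same payload shape the notebook code below sends; the prompt string here is just a throwaway example:

import json, requests

resp = requests.post(
    "http://localhost:10000/generate_stream",
    data=json.dumps({
        "prompt": "<|system|>\nhi<|user|>\nhello<|assistant|>streaming_transcription\n",
        "temperature": 0.3,
        "top_p": 0.9,
        "max_new_tokens": 8,
    }),
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(json.loads(line)["token_id"])  # one JSON line per generated token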
Run from a notebook
Create a new demo.ipynb in the cloned GLM-4-Voice folder and paste the code below into it.
import sys
sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
import os, random, string, json, uuid
import requests
import torch, torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
from speech_tokenizer.utils import extract_speech_token
from flow_inference import AudioDecoder
flow_path = "/root/autodl-tmp/models/glm-4-voice-decoder"
model_path = "/root/autodl-tmp/models/glm-4-voice-9b"
tokenizer_path = "/root/autodl-tmp/models/glm-4-voice-tokenizer"
flow_config = os.path.join(flow_path, "config.yaml")
flow_checkpoint = os.path.join(flow_path, "flow.pt")
hift_checkpoint = os.path.join(flow_path, "hift.pt")
device = "cuda"
# Speech tokenizer
whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval().to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)
# GLM
glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Flow & Hift
audio_decoder = AudioDecoder(
    config_path=flow_config,
    flow_ckpt_path=flow_checkpoint,
    hift_ckpt_path=hift_checkpoint,
    device=device,
)
def inference_fn(
    temperature: float = 0.3,
    top_p: float = 0.9,
    max_new_token: int = 2000,
    input_mode="text",
    audio_path: str | None = None,
    input_text: str | None = None,
    history: list[dict] | None = None,  # None instead of a mutable default list
    history_tokens: str = "",
    save_dir: str = "audio-tmp",
):
    history = [] if history is None else history
    os.makedirs(save_dir, exist_ok=True)
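    # NOTE: the system prompts below are kept verbatim from the official
    # GLM-4-Voice demo (grammar included); changing the wording may disturb
    # the interleaved 13-text / 26-audio token output pattern.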
    system_prompt = (
        "User will provide you with a text/speech instruction. Do it step by step. First, "
        "think about the instruction and respond in a interleaved manner, "
        "with 13 text token followed by 26 audio tokens."
    )
    if input_mode == "audio":
        assert audio_path is not None
        history.append({"role": "user", "content": {"path": audio_path}})
        audio_tokens = extract_speech_token(
            whisper_model, feature_extractor, [audio_path]
        )[0]
        if len(audio_tokens) == 0:
            raise ValueError("No audio tokens extracted")  # raising a bare string is a TypeError
        audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
        audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
        user_input = audio_tokens
        system_prompt = (
            "User will provide you with a speech instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    else:
        assert input_text is not None
        history.append({"role": "user", "content": input_text})
        user_input = input_text
        system_prompt = (
            "User will provide you with a text instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    # Gather history
    inputs = history_tokens
    if "<|system|>" not in inputs:
        inputs += f"<|system|>\n{system_prompt}"
    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"
    with torch.no_grad():
        response = requests.post(
            "http://localhost:10000/generate_stream",
            data=json.dumps(
                {
                    "prompt": inputs,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_new_tokens": max_new_token,
                }
            ),
            stream=True,
        )
        text_tokens, audio_tokens = [], []
        audio_offset = glm_tokenizer.convert_tokens_to_ids("<|audio_0|>")
        end_token_id = glm_tokenizer.convert_tokens_to_ids("<|user|>")
        complete_tokens = []
        prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
        flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
        this_uuid = str(uuid.uuid4())
        tts_speechs = []
        tts_mels = []
        prev_mel = None
        is_finalize = False
        block_size = 10
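        # Streaming decode: vocode the first block of 10 audio tokens as soon
        # as it arrives, then switch to blocks of 20; previously generated
        # tokens and mels are fed back as the prompt so the waveform stays
        # continuous across blocks.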
        for chunk in response.iter_lines():
            if not chunk:  # skip keep-alive blank lines in the stream
                continue
            token_id = json.loads(chunk)["token_id"]
            if token_id == end_token_id:
                is_finalize = True
            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
                block_size = 20
                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)
                if prev_mel is not None:
                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)
                tts_speech, tts_mel = audio_decoder.token2wav(
                    tts_token,
                    uuid=this_uuid,
                    prompt_token=flow_prompt_speech_token.to(device),
                    prompt_feat=prompt_speech_feat.to(device),
                    finalize=is_finalize,
                )
                prev_mel = tts_mel
                tts_speechs.append(tts_speech.squeeze())
                tts_mels.append(tts_mel)
                # yield history, inputs, "", "", (
                #     22050,
                #     tts_speech.squeeze().cpu().numpy(),
                # ), None
                flow_prompt_speech_token = torch.cat(
                    (flow_prompt_speech_token, tts_token), dim=-1
                )
                audio_tokens = []
            if not is_finalize:
                complete_tokens.append(token_id)
                if token_id >= audio_offset:
                    audio_tokens.append(token_id - audio_offset)
                else:
                    text_tokens.append(token_id)
    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
    name = os.path.join(save_dir, "".join(random.sample(string.ascii_letters, k=9)) + ".wav")
    torchaudio.save(name, tts_speech.unsqueeze(0), 22050, format="wav")
    history.append(
        {
            "role": "assistant",
            "content": glm_tokenizer.decode(text_tokens, skip_special_tokens=False),  # correct kwarg name
            "audio_path": name,
            "type": "audio/wav",
        }
    )
    complete_text = glm_tokenizer.decode(
        complete_tokens, spaces_between_special_tokens=False
    )
    inputs += complete_text
    return history, inputs.strip()
# Initialization
from IPython.display import display, Audio

history, history_tokens = [], ""
# Change input_text and re-run to keep the conversation going
history, history_tokens = inference_fn(
    input_text='用可爱的语气说:我是一头可爱的毛驴',  # "Say in a cute voice: I am a cute little donkey"
    history=history,
    history_tokens=history_tokens,
)
print(f'Assistant: {history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
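Because inference_fn returns the updated history and history_tokens, multi-turn dialogue is just repeated calls, and input_mode="audio" drives a turn from a recorded file instead of text. A minimal sketch (the follow-up prompt and question.wav are hypothetical examples, not files the repo ships):

# Second text turn in the same conversation
history, history_tokens = inference_fn(
    input_text='再用悲伤的语气说一遍',  # hypothetical follow-up: "say it again, sadly"
    history=history,
    history_tokens=history_tokens,
)
display(Audio(history[-1]['audio_path'], autoplay=True))

# Speech-in, speech-out turn; question.wav is a hypothetical local recording
history, history_tokens = inference_fn(
    input_mode="audio",
    audio_path="question.wav",
    history=history,
    history_tokens=history_tokens,
)
display(Audio(history[-1]['audio_path'], autoplay=True))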
One-click test image on the CodeWithGPU community: https://www.codewithgpu.com/i/THUDM/GLM-4-Voice/glm4-voice-9b