当前位置: 首页 > article >正文

【glm4-voice-9b 本地运行并测试 gradio+notebook】

## 安装环境

git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
cd GLM-4-Voice
pip install -r requirements.txt
pip install accelerate

## 下载模型权重

modelscope download --model ZhipuAI/glm-4-voice-9b --local_dir /root/autodl-tmp/models/glm-4-voice-9b
modelscope download --model ZhipuAI/glm-4-voice-tokenizer --local_dir /root/autodl-tmp/models/glm-4-voice-tokenizer
modelscope download --model ZhipuAI/glm-4-voice-decoder  --local_dir /root/autodl-tmp/models/glm-4-voice-decoder


## 测试


启动大模型

python model_server.py --host 0.0.0.0 --model-path /root/autodl-tmp/models/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0

notebook运行

在下载的GLM-4-Voice文件夹中新建demo.ipynb,将下边的代码复制进去

import sys

sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
import os, random, string,json,os.path,re, uuid,requests
import torch, torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
from speech_tokenizer.utils import extract_speech_token
from flow_inference import AudioDecoder
# Local directories holding the three downloaded weight sets.
model_path = "/root/autodl-tmp/models/glm-4-voice-9b"
tokenizer_path = "/root/autodl-tmp/models/glm-4-voice-tokenizer"
flow_path = "/root/autodl-tmp/models/glm-4-voice-decoder"
device = "cuda"

# Files expected inside the decoder directory.
flow_config, flow_checkpoint, hift_checkpoint = (
    os.path.join(flow_path, filename)
    for filename in ("config.yaml", "flow.pt", "hift.pt")
)

# Speech tokenizer: the feature extractor and the VQ encoder are both loaded
# from the tokenizer directory; the encoder is put in eval mode on `device`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)
whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path)
whisper_model = whisper_model.eval().to(device)

# GLM tokenizer only; generation itself is served by model_server.py.
glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Flow & HiFT decoder used to turn generated audio tokens into a waveform.
audio_decoder = AudioDecoder(
    device=device,
    hift_ckpt_path=hift_checkpoint,
    flow_ckpt_path=flow_checkpoint,
    config_path=flow_config,
)
def inference_fn(
    temperature: float = 0.3,
    top_p: float = 0.9,
    max_new_token: int = 2000,
    input_mode="text",
    audio_path: str | None = None,
    input_text: str | None = None,
    history: list[dict] | None = None,
    history_tokens: str = "",
    save_dir: str = "audio-tmp",
) -> tuple[list[dict], str]:
    """Run one conversation turn against the local model server and save the spoken reply.

    The user turn (text, or speech converted to audio tokens) is appended to the
    running prompt and posted to ``http://localhost:10000/generate_stream``.
    Streamed token ids are split into text and audio tokens; audio tokens are
    decoded block by block with ``audio_decoder.token2wav`` and the concatenated
    waveform is written as a 22050 Hz WAV file under ``save_dir``.

    Args:
        temperature: Sampling temperature forwarded to the server.
        top_p: Nucleus-sampling value forwarded to the server.
        max_new_token: Forwarded to the server as ``max_new_tokens``.
        input_mode: ``"audio"`` to read ``audio_path``; any other value uses
            ``input_text``.
        audio_path: Path of the user's audio file (required in audio mode).
        input_text: The user's text (required otherwise).
        history: Conversation so far. A list passed in is extended in place and
            returned; ``None`` starts a new conversation.
        history_tokens: Prompt string returned by the previous call, or ``""``.
        save_dir: Directory for the generated WAV file; created if missing.

    Returns:
        ``(history, prompt)``: the history with the user and assistant turns
        appended, and the stripped prompt string including this turn's output,
        to be passed back as ``history_tokens`` on the next call.

    Raises:
        AssertionError: If the input required by ``input_mode`` is ``None``.
        ValueError: If no audio tokens could be extracted from ``audio_path``.
        requests.HTTPError: If the model server answers with an error status.
    """
    # A fresh list per call: a shared ``[]`` default would leak turns between
    # unrelated conversations because this function appends to ``history``.
    if history is None:
        history = []
    os.makedirs(save_dir, exist_ok=True)

    if input_mode == "audio":
        assert audio_path is not None
        history.append({"role": "user", "content": {"path": audio_path}})
        audio_tokens = extract_speech_token(
            whisper_model, feature_extractor, [audio_path]
        )[0]
        if len(audio_tokens) == 0:
            raise ValueError("No audio tokens extracted")
        user_input = (
            "<|begin_of_audio|>"
            + "".join(f"<|audio_{x}|>" for x in audio_tokens)
            + "<|end_of_audio|>"
        )
        system_prompt = (
            "User will provide you with a speech instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    else:
        assert input_text is not None
        history.append({"role": "user", "content": input_text})
        user_input = input_text
        system_prompt = (
            "User will provide you with a text instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )

    # Gather history: the system prompt is only added once per conversation.
    inputs = history_tokens
    if "<|system|>" not in inputs:
        inputs += f"<|system|>\n{system_prompt}"
    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"

    payload = json.dumps(
        {
            "prompt": inputs,
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_token,
        }
    )

    text_tokens, audio_tokens, complete_tokens = [], [], []
    # Ids at or above the id of "<|audio_0|>" are treated as audio tokens.
    audio_offset = glm_tokenizer.convert_tokens_to_ids("<|audio_0|>")
    # The model emitting "<|user|>" marks the end of the assistant turn.
    end_token_id = glm_tokenizer.convert_tokens_to_ids("<|user|>")
    # Empty prompts for the first block; the trailing 80 is presumably the
    # mel-bin count expected by the decoder -- TODO confirm.
    prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
    this_uuid = str(uuid.uuid4())
    tts_speechs = []
    tts_mels = []
    is_finalize = False
    block_size = 10  # first block is smaller; later blocks use 20 tokens

    # The context manager closes the streamed connection even if decoding fails.
    with torch.no_grad(), requests.post(
        "http://localhost:10000/generate_stream",
        data=payload,
        stream=True,
    ) as response:
        # Fail with the HTTP error instead of a confusing JSON parse error.
        response.raise_for_status()
        for chunk in response.iter_lines():
            if not chunk:
                # iter_lines() can yield empty keep-alive lines.
                continue
            token_id = json.loads(chunk)["token_id"]
            if token_id == end_token_id:
                is_finalize = True
            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
                block_size = 20
                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)

                # From the second block on, condition on all mels produced so far.
                if tts_mels:
                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)

                tts_speech, tts_mel = audio_decoder.token2wav(
                    tts_token,
                    uuid=this_uuid,
                    prompt_token=flow_prompt_speech_token.to(device),
                    prompt_feat=prompt_speech_feat.to(device),
                    finalize=is_finalize,
                )

                tts_speechs.append(tts_speech.squeeze())
                tts_mels.append(tts_mel)
                flow_prompt_speech_token = torch.cat(
                    (flow_prompt_speech_token, tts_token), dim=-1
                )
                audio_tokens = []

            if not is_finalize:
                complete_tokens.append(token_id)
                if token_id >= audio_offset:
                    audio_tokens.append(token_id - audio_offset)
                else:
                    text_tokens.append(token_id)

    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
    name = os.path.join(save_dir, ''.join(random.sample(string.ascii_letters, k=9))+'.wav')
    torchaudio.save(name, tts_speech.unsqueeze(0), 22050, format="wav")
    history.append(
        {
            "role": "assistant",
            # NOTE(review): `ignore_special_tokens` is kept exactly as written;
            # it may have been meant as `skip_special_tokens` -- confirm before changing.
            "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False),
            "audio_path": name,
            "type": "audio/wav",
        }
    )
    complete_text = glm_tokenizer.decode(
        complete_tokens, spaces_between_special_tokens=False
    )
    inputs += complete_text

    return history, inputs.strip()
# Initialise an empty conversation.

from IPython.display import display, Audio

history = []
history_tokens = ''
# Change input_text and run the call again to continue the same conversation.

history, history_tokens = inference_fn(
    history_tokens=history_tokens,
    history=history,
    input_text='用可爱的语气说:我是一头可爱的毛驴',
)
# The last history entry is the assistant turn: its text and the saved WAV path.
print('Assistant:{}'.format(history[-1]["content"]))
display(Audio(history[-1]['audio_path'], autoplay=True))

CodeWithGpu社区--一键测试: https://www.codewithgpu.com/i/THUDM/GLM-4-Voice/glm4-voice-9b


http://www.kler.cn/a/383088.html

相关文章:

  • 智能座舱进阶-应用框架层-Jetpack主要组件
  • 0009.基于springboot+layui的ERP企业进销存管理系统
  • 如何在自己的云服务器上部署mysql
  • 问题解决:发现Excel中的部分内容有问题。是否让我们尽量尝试恢复? 如果您信任此工作簿的源,请单击“是”。
  • (2024.12)Ubuntu20.04安装openMVS<成功>.colmap<成功>和openMVG<失败>记录
  • 大数据、人工智能、云计算、物联网、区块链序言【大数据导论】
  • 探索空间计算与 VR 设备的未来:4K4DGen 高分辨率全景 4D 内容生成系统
  • ssm061基于SSM框架的个人博客网站的设计与实现+vue(论文+源码)_kaic
  • AI 搜索来势汹汹,互联网将被颠覆还是进化?
  • Openlayers高级交互(20/20):超级数据聚合,页面不再混乱
  • 使用 Let’s Encrypt 获取免费SSL证书
  • 城镇住房保障系统:SpringBoot开发要点
  • TLU - Net:一种用于钢材表面缺陷自动检测的深度学习方法
  • c语言架构的一点构想
  • 挂钩图像分割安全状态与危险状态识别系统:更新创新流程
  • 推荐一款可视化和检查原始数据的工具:RawDigger
  • Midjourney从入门到精通教程,10分钟让你从小白变大神!【珍藏版】
  • Bert完形填空
  • Java基础使用①Java特点+环境安装+IDEA使用
  • 求猫用宠物空气净化器推荐,有没有吸毛强、噪音小的产品
  • Linux awk命令详解-参数-选项-内置变量-内置函数-脚本(多图、多示例)
  • 我们来学mysql -- EXPLAIN之ID(原理篇)
  • 爱普生 SG - 8201CJA 可编程振荡器成为电子应用的解决方案
  • 【LeetCode】【算法】142. 环形链表II
  • 开放寻址法、链式哈希数据结构详细解读
  • Vue3 + Element Plus简单使用案例及【eslint】报错处理