【Azure OpenAI】Implementing voice conversation with TTS【demo】
What it can do: a full voice conversation with an AI assistant (speech-to-text → Azure OpenAI chat → text-to-speech), plus standalone speech-to-text and text-to-speech utilities.
Just replace the keys in the code with your own and it runs end to end; the snippet below shows exactly where the keys go.
How to find the keys:
【Step-by-step tutorial】How to quickly find the OpenAI key and demo in Azure - CSDN blog
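The constructor arguments below mirror what the files in this post expect; the concrete values are placeholders you must replace, and the endpoint format shown is the usual Azure OpenAI one rather than something this demo defines. A minimal sketch of where each key goes:

# Placeholder values only - substitute your own resource details.
from azure_openai_client import AzureOpenAIClient
from speech_utils import SpeechService

ai_client = AzureOpenAIClient(
    endpoint="https://<your-resource>.openai.azure.com/",  # Azure OpenAI endpoint
    deployment="<your-deployment-name>",                   # model deployment name
    api_key="<your-azure-openai-key>"                      # or set the AZURE_OPENAI_API_KEY environment variable
)
speech_service = SpeechService(
    speech_key="<your-speech-key>",   # Speech resource key
    service_region="eastus"           # Speech resource region
)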
Code structure (a sketch of how these modules fit together follows the list):
azure_openai_client.py
main.py
prompts_config.py
speech_utils.py
stt01.py
tts01.py
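Before going through the individual files, here is a rough, hypothetical sketch of a single conversation turn. It assumes the openai and azure-cognitiveservices-speech packages (the two libraries the code imports) are installed and that the keys in the files below have been filled in; error handling is omitted.

# Rough flow of the demo: one recognition -> one chat completion -> one synthesis.
from azure_openai_client import AzureOpenAIClient
from speech_utils import SpeechService
from prompts_config import get_system_prompt

ai = AzureOpenAIClient(system_prompt=get_system_prompt("zh-CN"))
speech = SpeechService(speech_key="<your-speech-key>", service_region="eastus")

ok, heard = speech.speech_to_text(languages=["zh-CN", "en-US"])  # (success, dict or error string)
if ok:
    messages = list(ai.default_chat_prompt) + [
        {"role": "user", "content": [{"type": "text", "text": heard["text"]}]}
    ]
    reply = ai.chat_completion(messages=messages).choices[0].message.content
    speech.text_to_speech(reply, voice_name="zh-CN-XiaoxiaoNeural")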
azure_openai_client.py
import os
import base64
from openai import AzureOpenAI
from typing import List, Dict, Optional, Union


class AzureOpenAIClient:
    def __init__(
        self,
        endpoint: str = "replace with your endpoint",
        deployment: str = "replace with your deployment name",
        api_key: Optional[str] = None,
        system_prompt: Optional[str] = None
    ):
        """
        Initialize the Azure OpenAI client.
        Args:
            endpoint: Azure OpenAI service endpoint
            deployment: deployment name
            api_key: API key; if None, it is read from the AZURE_OPENAI_API_KEY environment variable
            system_prompt: system prompt; if None, a default prompt is used
        """
        self.endpoint = endpoint
        self.deployment = deployment
        self.api_key = api_key or os.getenv(
            "AZURE_OPENAI_API_KEY",
            "replace with your key"
        )
        self.client = AzureOpenAI(
            azure_endpoint=self.endpoint,
            api_key=self.api_key,
            api_version="2024-05-01-preview"
        )
        # Use the supplied system prompt, or fall back to the default one
        default_prompt_text = "You are an AI assistant that helps users find information."
        if system_prompt:
            default_prompt_text = system_prompt
        self.default_chat_prompt = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": default_prompt_text
                    }
                ]
            }
        ]

    def encode_image(self, image_path: str) -> str:
        """
        Encode an image file as a base64 string.
        Args:
            image_path: path to the image
        Returns:
            base64-encoded image string
        """
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('ascii')

    def chat_completion(
        self,
        messages: Optional[List[Dict]] = None,
        max_tokens: int = 200,
        temperature: float = 0.7,
        top_p: float = 0.95,
        frequency_penalty: float = 0,
        presence_penalty: float = 0,
        stop: Optional[Union[str, List[str]]] = None,
        stream: bool = False
    ):
        """
        Create a chat completion.
        Args:
            messages: list of chat messages; if None, the default prompt is used
            max_tokens: maximum number of tokens to generate
            temperature: sampling temperature
            top_p: nucleus sampling probability
            frequency_penalty: frequency penalty
            presence_penalty: presence penalty
            stop: stop sequence(s)
            stream: whether to stream the response
        Returns:
            the chat completion response
        """
        if messages is None:
            messages = self.default_chat_prompt
        completion = self.client.chat.completions.create(
            model=self.deployment,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            stream=stream
        )
        return completion
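A quick way to exercise the client on its own, assuming the endpoint, deployment name, and key above have been filled in. This snippet is illustrative and is not one of the demo files; the question text is just an example.

from azure_openai_client import AzureOpenAIClient

client = AzureOpenAIClient()
messages = client.default_chat_prompt + [
    {"role": "user", "content": [{"type": "text", "text": "Introduce yourself in one sentence."}]}
]
response = client.chat_completion(messages=messages, max_tokens=100)
print(response.choices[0].message.content)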
main.py
from speech_utils import SpeechService, text_to_speech, speech_to_text
import time
import os
from azure_openai_client import AzureOpenAIClient
from prompts_config import get_system_prompt


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="replace with your key",
        service_region="your resource region"
    )
    while True:
        print("\n=== Menu ===")
        print("1. Speech to text")
        print("2. Text to speech")
        print("3. AI voice chat")
        print("0. Exit")
        choice = input("Select an option (0-3): ")
        if choice == "0":
            print("Thanks for using the demo, bye!")
            break
        elif choice == "1":
            print("\n=== Speech to text ===")
            print("Supported languages: Chinese, English, Japanese")
            print("Please speak...")
            # Record the start time
            start_time = time.time()
            success, result = speech_service.speech_to_text(
                languages=["zh-CN", "en-US", "ja-JP"]
            )
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"\nSpeech recognition took {elapsed_time:.2f}s")
            if success:
                print(f"Recognized text: {result['text']}")
                if result['detected_language']:
                    print(f"Detected language: {result['detected_language']}")
                if input("\nConvert the recognized text back to speech? (y/n): ").lower() == 'y':
                    # Record the text-to-speech start time
                    tts_start_time = time.time()
                    success, message = speech_service.text_to_speech(result['text'])
                    # Compute and show the text-to-speech elapsed time
                    tts_elapsed_time = time.time() - tts_start_time
                    print(f"Text-to-speech took {tts_elapsed_time:.2f}s")
                    print(message)
            else:
                print(f"Error: {result}")
        elif choice == "2":
            print("\n=== Text to speech ===")
            print("Available voices:")
            print("1. Chinese female (zh-CN-XiaoxiaoNeural)")
            print("2. Chinese male (zh-CN-YunxiNeural)")
            print("3. English female (en-US-AriaNeural)")
            voice_choice = input("Select a voice (1-3, default 1): ").strip()
            voice_map = {
                "1": "zh-CN-XiaoxiaoNeural",
                "2": "zh-CN-YunxiNeural",
                "3": "en-US-AriaNeural"
            }
            voice_name = voice_map.get(voice_choice, "zh-CN-XiaoxiaoNeural")
            text = input("\nEnter the text to convert to speech: ")
            # Record the start time
            start_time = time.time()
            success, message = speech_service.text_to_speech(text, voice_name=voice_name)
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"Text-to-speech took {elapsed_time:.2f}s")
            print(message)
        elif choice == "3":
            voice_chat()
        else:
            print("\nInvalid choice, please try again.")
            time.sleep(1)


def voice_chat():
    # Initialize the speech service
    speech_service = SpeechService(
        speech_key=".....",  # replace with your Speech key
        service_region="your resource region, e.g. eastus"
    )
    # Choose the conversation language
    print("\nChoose the conversation language:")
    print("1. 中文")
    print("2. English")
    lang_choice = input("Select (1/2): ")
    language = "zh-CN" if lang_choice == "1" else "en-US"
    # Get the system prompt configured for that language
    system_prompt = get_system_prompt(language)
    # Create the AI client with that system prompt
    ai_client = AzureOpenAIClient(system_prompt=system_prompt)
    # Seed the conversation history with the system prompt
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        }
    ]
    print("\n=== AI voice chat started ===")
    print("Enter 's' to start a conversation, 'q' to quit")
    while True:
        command = input("\nEnter a command (s: start, q: quit): ")
        if command.lower() == 'q':
            break
        elif command.lower() == 's':
            print("\nEntering conversation mode...")
            print("Voice capture starts only after the AI has finished replying")
            print("Say '再见' or 'goodbye' to end the conversation")
            continue_dialog = True
            while continue_dialog:
                # Prompt the user to speak
                print("\nPlease start speaking...")
                # Run a single speech recognition per turn
                success, result = speech_service.speech_to_text(languages=[language])
                if success and result['text']:
                    user_text = result['text']
                    print(f"\nYou said: {user_text}")
                    # Check whether the user wants to end the conversation
                    if (language == "zh-CN" and "再见" in user_text.lower()) or \
                       (language == "en-US" and "goodbye" in user_text.lower()):
                        print("Conversation ended")
                        continue_dialog = False
                        break
                    # Append the user message to the history
                    messages.append({
                        "role": "user",
                        "content": [{"type": "text", "text": user_text}]
                    })
                    # Get the AI response
                    print("AI is thinking...")
                    response = ai_client.chat_completion(messages=messages)
                    ai_text = response.choices[0].message.content
                    print(f"AI: {ai_text}")
                    # Append the AI response to the history
                    messages.append({
                        "role": "assistant",
                        "content": [{"type": "text", "text": ai_text}]
                    })
                    # Text to speech - wait until synthesis has finished
                    print("Generating speech...")
                    voice_name = "zh-CN-XiaoxiaoNeural" if language == "zh-CN" else "en-US-AriaNeural"
                    success, message = speech_service.text_to_speech(ai_text, voice_name=voice_name)
                    if not success:
                        print(f"Speech synthesis failed: {message}")
                    print("AI reply finished playing, ready for the next turn")
                else:
                    print("Could not recognize your speech, please try again")
        else:
            print("Invalid command, please try again")


if __name__ == "__main__":
    main()
prompts_config.py
# System prompt configuration

# Main system prompt - written in a single language; the language-specific
# lines below tell the model which language to answer in
MAIN_SYSTEM_PROMPT = """
You are an intelligent AI assistant focused on providing useful, accurate information. Follow these guidelines:
1. Keep answers short and clear; avoid long-winded explanations
2. Use a polite, friendly tone
3. If you are unsure of an answer, say so honestly
4. Avoid harmful or inappropriate content
5. Provide accurate, up-to-date information
6. Respect user privacy and do not ask for personal information
7. Output plain natural language only; never output Markdown-formatted content.
"""

# Language-specific additions
LANGUAGE_PROMPTS = {
    "zh-CN": "请用中文简短回答。",
    "en-US": "Please respond in English concisely.",
    "ja-JP": "簡潔に日本語で回答してください。",
    # More languages can be added here
}


def get_system_prompt(language_code="zh-CN"):
    """Return the full system prompt for the given language."""
    language_prompt = LANGUAGE_PROMPTS.get(language_code, LANGUAGE_PROMPTS["zh-CN"])
    return f"{MAIN_SYSTEM_PROMPT}\n{language_prompt}"
speech_utils.py
import azure.cognitiveservices.speech as speechsdk
import time


class SpeechService:
    def __init__(self, speech_key, service_region):
        self.speech_key = speech_key
        self.service_region = service_region
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=service_region
        )

    def text_to_speech(self, text, voice_name="zh-CN-XiaoxiaoNeural"):
        """
        Convert text to speech.
        :param text: text to synthesize
        :param voice_name: voice name, defaults to a Chinese female voice
        :return: (success, message) tuple
        """
        try:
            self.speech_config.speech_synthesis_voice_name = voice_name
            speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
            # Flag used to track when synthesis has finished
            synthesis_completed = False

            def synthesis_completed_cb(evt):
                nonlocal synthesis_completed
                synthesis_completed = True

            # Register the event handler
            speech_synthesizer.synthesis_completed.connect(synthesis_completed_cb)
            # .get() blocks until synthesis (and playback on the default speaker) is done
            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                # Wait for the completion event as an extra safeguard
                while not synthesis_completed:
                    time.sleep(0.1)
                return True, "Speech synthesis succeeded"
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                return False, f"Speech synthesis canceled: {cancellation_details.reason}"
            return False, f"Speech synthesis failed: {result.reason}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    def speech_to_text(self, languages=None, continuous=False):
        """
        Convert speech to text.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param continuous: whether to use continuous recognition
        :return: (success, result) tuple; result is a dict on success, an error string otherwise
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)
            if continuous:
                # Continuous recognition: collect every recognized segment until the session stops
                done = False
                recognized_segments = []
                detected_language = None

                def handle_result(evt):
                    nonlocal detected_language
                    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                        recognized_segments.append(evt.result.text)
                        detected_language = evt.result.properties.get(
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                        )

                def stop_cb(evt):
                    nonlocal done
                    done = True

                # Wire up the events
                speech_recognizer.recognized.connect(handle_result)
                speech_recognizer.session_stopped.connect(stop_cb)
                speech_recognizer.canceled.connect(stop_cb)
                # Start continuous recognition
                speech_recognizer.start_continuous_recognition()
                while not done:
                    time.sleep(0.5)
                speech_recognizer.stop_continuous_recognition()
                return True, {
                    "text": " ".join(recognized_segments),
                    "detected_language": detected_language
                }
            else:
                # Single-shot recognition
                result = speech_recognizer.recognize_once()
                if result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    detected_language = None
                    if hasattr(result, 'properties') and result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    ):
                        detected_language = result.properties[
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                        ]
                    return True, {
                        "text": result.text,
                        "detected_language": detected_language
                    }
                elif result.reason == speechsdk.ResultReason.NoMatch:
                    return False, f"Speech not recognized: {result.no_match_details}"
                elif result.reason == speechsdk.ResultReason.Canceled:
                    return False, f"Speech recognition canceled: {result.cancellation_details.reason}"
                return False, f"Speech recognition failed: {result.reason}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    def start_continuous_recognition(self, languages=None, callback=None):
        """
        Start continuous speech recognition.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param callback: callback invoked with each recognized result
        :return: the speech_recognizer object, so the caller can control it later
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            # Handle recognized results
            def handle_result(evt):
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text
                    detected_language = evt.result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    )
                    if callback:
                        should_continue = callback(text, detected_language)
                        if should_continue is False:
                            # Stop recognition when the callback returns False
                            speech_recognizer.stop_continuous_recognition_async()

            # Handle cancellation and errors
            def handle_canceled(evt):
                if evt.reason == speechsdk.CancellationReason.Error:
                    print(f"Speech recognition error: {evt.error_details}")

            # Wire up the event handlers
            speech_recognizer.recognized.connect(handle_result)
            speech_recognizer.canceled.connect(handle_canceled)
            # Start continuous recognition
            speech_recognizer.start_continuous_recognition_async()
            return speech_recognizer
        except Exception as e:
            print(f"Failed to start continuous recognition: {str(e)}")
            raise
def text_to_speech(text: str, language: str = "zh-CN") -> None:
    """
    Convert text to speech (standalone helper, independent of SpeechService).
    Args:
        text: text to synthesize
        language: language code, defaults to Chinese
    """
    # Hardcoded Speech credentials for this standalone helper
    speech_key = "your Speech key"
    service_region = "your resource region, e.g. eastus"
    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )
    # Pick a voice that matches the language
    if language == "zh-CN":
        speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
    else:
        speech_config.speech_synthesis_voice_name = "en-US-AriaNeural"
    # Create the synthesizer
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    # Run the synthesis
    result = speech_synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesis completed")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")
def speech_to_text(language: str = "zh-CN") -> str:
    """
    Convert speech to text (standalone helper, independent of SpeechService).
    Args:
        language: language code, defaults to Chinese
    Returns:
        the recognized text, or an empty string on failure
    """
    # Hardcoded Speech credentials for this standalone helper
    speech_key = "your Speech key"
    service_region = "your resource region"
    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )
    # Set the recognition language
    speech_config.speech_recognition_language = language
    # Use the default microphone as the audio input
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    # Create the recognizer
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )
    print("Please start speaking...")
    # Run a single recognition
    result = speech_recognizer.recognize_once_async().get()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print(f"Speech not recognized: {result.no_match_details}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech recognition canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")
    return ""
stt01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your Speech key",
        service_region="eastus"
    )
    print("Please speak...")
    success, result = speech_service.speech_to_text(languages=["zh-CN", "en-US", "ja-JP"])
    if success:
        print(f"Recognized text: {result['text']}")
        if result['detected_language']:
            print(f"Detected language: {result['detected_language']}")
    else:
        print(f"Error: {result}")


if __name__ == "__main__":
    main()
tts01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your Speech key",
        service_region="eastus"
    )
    print("Enter the text to convert to speech...")
    text = input()
    success, message = speech_service.text_to_speech(text)
    print(message)


if __name__ == "__main__":
    main()