【Azure OpenAI】Implementing voice conversation with TTS【demo】
What it can do: a full voice conversation with an AI assistant (speech-to-text → Azure OpenAI chat → text-to-speech), plus standalone speech-to-text and text-to-speech utilities.
Just replace the keys in the code with your own and it runs end to end; the snippet below shows exactly where the keys go.
How to find the keys:
【Step-by-step tutorial】How to quickly find the OpenAI key and demo in Azure - CSDN blog
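The constructor arguments below mirror what the files in this post expect; the concrete values are placeholders you must replace, and the endpoint format shown is the usual Azure OpenAI one rather than something this demo defines. A minimal sketch of where each key goes:

# Placeholder values only - substitute your own resource details.
from azure_openai_client import AzureOpenAIClient
from speech_utils import SpeechService

ai_client = AzureOpenAIClient(
    endpoint="https://<your-resource>.openai.azure.com/",  # Azure OpenAI endpoint
    deployment="<your-deployment-name>",                   # model deployment name
    api_key="<your-azure-openai-key>"                      # or set the AZURE_OPENAI_API_KEY environment variable
)
speech_service = SpeechService(
    speech_key="<your-speech-key>",   # Speech resource key
    service_region="eastus"           # Speech resource region
)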
Code structure (a sketch of how these modules fit together follows the list):
azure_openai_client.py
main.py
prompts_config.py
speech_utils.py
stt01.py
tts01.py
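Before going through the individual files, here is a rough, hypothetical sketch of a single conversation turn. It assumes the openai and azure-cognitiveservices-speech packages (the two libraries the code imports) are installed and that the keys in the files below have been filled in; error handling is omitted.

# Rough flow of the demo: one recognition -> one chat completion -> one synthesis.
from azure_openai_client import AzureOpenAIClient
from speech_utils import SpeechService
from prompts_config import get_system_prompt

ai = AzureOpenAIClient(system_prompt=get_system_prompt("zh-CN"))
speech = SpeechService(speech_key="<your-speech-key>", service_region="eastus")

ok, heard = speech.speech_to_text(languages=["zh-CN", "en-US"])  # (success, dict or error string)
if ok:
    messages = list(ai.default_chat_prompt) + [
        {"role": "user", "content": [{"type": "text", "text": heard["text"]}]}
    ]
    reply = ai.chat_completion(messages=messages).choices[0].message.content
    speech.text_to_speech(reply, voice_name="zh-CN-XiaoxiaoNeural")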
azure_openai_client.py
import os
import base64
from openai import AzureOpenAI
from typing import List, Dict, Optional, Union


class AzureOpenAIClient:
    def __init__(
        self,
        endpoint: str = "replace with your endpoint",
        deployment: str = "replace with your deployment name",
        api_key: Optional[str] = None,
        system_prompt: Optional[str] = None
    ):
        """
        Initialize the Azure OpenAI client.
        Args:
            endpoint: Azure OpenAI service endpoint
            deployment: deployment name
            api_key: API key; if None, it is read from the AZURE_OPENAI_API_KEY environment variable
            system_prompt: system prompt; if None, a default prompt is used
        """
        self.endpoint = endpoint
        self.deployment = deployment
        self.api_key = api_key or os.getenv(
            "AZURE_OPENAI_API_KEY",
            "replace with your key"
        )
        self.client = AzureOpenAI(
            azure_endpoint=self.endpoint,
            api_key=self.api_key,
            api_version="2024-05-01-preview"
        )
        # Use the supplied system prompt, or fall back to the default one
        default_prompt_text = "You are an AI assistant that helps users find information."
        if system_prompt:
            default_prompt_text = system_prompt
        self.default_chat_prompt = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": default_prompt_text
                    }
                ]
            }
        ]

    def encode_image(self, image_path: str) -> str:
        """
        Encode an image file as a base64 string.
        Args:
            image_path: path to the image
        Returns:
            base64-encoded image string
        """
        with open(image_path, 'rb') as image_file:
            return base64.b64encode(image_file.read()).decode('ascii')

    def chat_completion(
        self,
        messages: Optional[List[Dict]] = None,
        max_tokens: int = 200,
        temperature: float = 0.7,
        top_p: float = 0.95,
        frequency_penalty: float = 0,
        presence_penalty: float = 0,
        stop: Optional[Union[str, List[str]]] = None,
        stream: bool = False
    ):
        """
        Create a chat completion.
        Args:
            messages: list of chat messages; if None, the default prompt is used
            max_tokens: maximum number of tokens to generate
            temperature: sampling temperature
            top_p: nucleus sampling probability
            frequency_penalty: frequency penalty
            presence_penalty: presence penalty
            stop: stop sequence(s)
            stream: whether to stream the response
        Returns:
            the chat completion response
        """
        if messages is None:
            messages = self.default_chat_prompt
        completion = self.client.chat.completions.create(
            model=self.deployment,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            stream=stream
        )
        return completion
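A quick way to exercise the client on its own, assuming the endpoint, deployment name, and key above have been filled in. This snippet is illustrative and is not one of the demo files; the question text is just an example.

from azure_openai_client import AzureOpenAIClient

client = AzureOpenAIClient()
messages = client.default_chat_prompt + [
    {"role": "user", "content": [{"type": "text", "text": "Introduce yourself in one sentence."}]}
]
response = client.chat_completion(messages=messages, max_tokens=100)
print(response.choices[0].message.content)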
main.py
from speech_utils import SpeechService, text_to_speech, speech_to_text
import time
import os
from azure_openai_client import AzureOpenAIClient
from prompts_config import get_system_prompt


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="replace with your key",
        service_region="your resource region"
    )
    while True:
        print("\n=== Menu ===")
        print("1. Speech to text")
        print("2. Text to speech")
        print("3. AI voice chat")
        print("0. Exit")
        choice = input("Select an option (0-3): ")
        if choice == "0":
            print("Thanks for using the demo, bye!")
            break
        elif choice == "1":
            print("\n=== Speech to text ===")
            print("Supported languages: Chinese, English, Japanese")
            print("Please speak...")
            # Record the start time
            start_time = time.time()
            success, result = speech_service.speech_to_text(
                languages=["zh-CN", "en-US", "ja-JP"]
            )
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"\nSpeech recognition took {elapsed_time:.2f}s")
            if success:
                print(f"Recognized text: {result['text']}")
                if result['detected_language']:
                    print(f"Detected language: {result['detected_language']}")
                if input("\nConvert the recognized text back to speech? (y/n): ").lower() == 'y':
                    # Record the text-to-speech start time
                    tts_start_time = time.time()
                    success, message = speech_service.text_to_speech(result['text'])
                    # Compute and show the text-to-speech elapsed time
                    tts_elapsed_time = time.time() - tts_start_time
                    print(f"Text-to-speech took {tts_elapsed_time:.2f}s")
                    print(message)
            else:
                print(f"Error: {result}")
        elif choice == "2":
            print("\n=== Text to speech ===")
            print("Available voices:")
            print("1. Chinese female (zh-CN-XiaoxiaoNeural)")
            print("2. Chinese male (zh-CN-YunxiNeural)")
            print("3. English female (en-US-AriaNeural)")
            voice_choice = input("Select a voice (1-3, default 1): ").strip()
            voice_map = {
                "1": "zh-CN-XiaoxiaoNeural",
                "2": "zh-CN-YunxiNeural",
                "3": "en-US-AriaNeural"
            }
            voice_name = voice_map.get(voice_choice, "zh-CN-XiaoxiaoNeural")
            text = input("\nEnter the text to convert to speech: ")
            # Record the start time
            start_time = time.time()
            success, message = speech_service.text_to_speech(text, voice_name=voice_name)
            # Compute and show the elapsed time
            elapsed_time = time.time() - start_time
            print(f"Text-to-speech took {elapsed_time:.2f}s")
            print(message)
        elif choice == "3":
            voice_chat()
        else:
            print("\nInvalid choice, please try again.")
            time.sleep(1)


def voice_chat():
    # Initialize the speech service
    speech_service = SpeechService(
        speech_key=".....",  # replace with your Speech key
        service_region="your resource region, e.g. eastus"
    )
    # Choose the conversation language
    print("\nChoose the conversation language:")
    print("1. 中文")
    print("2. English")
    lang_choice = input("Select (1/2): ")
    language = "zh-CN" if lang_choice == "1" else "en-US"
    # Get the system prompt configured for that language
    system_prompt = get_system_prompt(language)
    # Create the AI client with that system prompt
    ai_client = AzureOpenAIClient(system_prompt=system_prompt)
    # Seed the conversation history with the system prompt
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}]
        }
    ]
    print("\n=== AI voice chat started ===")
    print("Enter 's' to start a conversation, 'q' to quit")
    while True:
        command = input("\nEnter a command (s: start, q: quit): ")
        if command.lower() == 'q':
            break
        elif command.lower() == 's':
            print("\nEntering conversation mode...")
            print("Voice capture starts only after the AI has finished replying")
            print("Say '再见' or 'goodbye' to end the conversation")
            continue_dialog = True
            while continue_dialog:
                # Prompt the user to speak
                print("\nPlease start speaking...")
                # Run a single speech recognition per turn
                success, result = speech_service.speech_to_text(languages=[language])
                if success and result['text']:
                    user_text = result['text']
                    print(f"\nYou said: {user_text}")
                    # Check whether the user wants to end the conversation
                    if (language == "zh-CN" and "再见" in user_text.lower()) or \
                       (language == "en-US" and "goodbye" in user_text.lower()):
                        print("Conversation ended")
                        continue_dialog = False
                        break
                    # Append the user message to the history
                    messages.append({
                        "role": "user",
                        "content": [{"type": "text", "text": user_text}]
                    })
                    # Get the AI response
                    print("AI is thinking...")
                    response = ai_client.chat_completion(messages=messages)
                    ai_text = response.choices[0].message.content
                    print(f"AI: {ai_text}")
                    # Append the AI response to the history
                    messages.append({
                        "role": "assistant",
                        "content": [{"type": "text", "text": ai_text}]
                    })
                    # Text to speech - wait until synthesis has finished
                    print("Generating speech...")
                    voice_name = "zh-CN-XiaoxiaoNeural" if language == "zh-CN" else "en-US-AriaNeural"
                    success, message = speech_service.text_to_speech(ai_text, voice_name=voice_name)
                    if not success:
                        print(f"Speech synthesis failed: {message}")
                    print("AI reply finished playing, ready for the next turn")
                else:
                    print("Could not recognize your speech, please try again")
        else:
            print("Invalid command, please try again")


if __name__ == "__main__":
    main()
prompts_config.py
# System prompt configuration

# Main system prompt - written in a single language; the language-specific
# lines below tell the model which language to answer in
MAIN_SYSTEM_PROMPT = """
You are an intelligent AI assistant focused on providing useful, accurate information. Follow these guidelines:
1. Keep answers short and clear; avoid long-winded explanations
2. Use a polite, friendly tone
3. If you are unsure of an answer, say so honestly
4. Avoid harmful or inappropriate content
5. Provide accurate, up-to-date information
6. Respect user privacy and do not ask for personal information
7. Output plain natural language only; never output Markdown-formatted content.
"""

# Language-specific additions
LANGUAGE_PROMPTS = {
    "zh-CN": "请用中文简短回答。",
    "en-US": "Please respond in English concisely.",
    "ja-JP": "簡潔に日本語で回答してください。",
    # More languages can be added here
}


def get_system_prompt(language_code="zh-CN"):
    """Return the full system prompt for the given language."""
    language_prompt = LANGUAGE_PROMPTS.get(language_code, LANGUAGE_PROMPTS["zh-CN"])
    return f"{MAIN_SYSTEM_PROMPT}\n{language_prompt}"
speech_utils.py
import azure.cognitiveservices.speech as speechsdk
import time


class SpeechService:
    def __init__(self, speech_key, service_region):
        self.speech_key = speech_key
        self.service_region = service_region
        self.speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=service_region
        )

    def text_to_speech(self, text, voice_name="zh-CN-XiaoxiaoNeural"):
        """
        Convert text to speech.
        :param text: text to synthesize
        :param voice_name: voice name, defaults to a Chinese female voice
        :return: (success, message) tuple
        """
        try:
            self.speech_config.speech_synthesis_voice_name = voice_name
            speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=self.speech_config)
            # Flag used to track when synthesis has finished
            synthesis_completed = False

            def synthesis_completed_cb(evt):
                nonlocal synthesis_completed
                synthesis_completed = True

            # Register the event handler
            speech_synthesizer.synthesis_completed.connect(synthesis_completed_cb)
            # .get() blocks until synthesis (and playback on the default speaker) is done
            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                # Wait for the completion event as an extra safeguard
                while not synthesis_completed:
                    time.sleep(0.1)
                return True, "Speech synthesis succeeded"
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                return False, f"Speech synthesis canceled: {cancellation_details.reason}"
            return False, f"Speech synthesis failed: {result.reason}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    def speech_to_text(self, languages=None, continuous=False):
        """
        Convert speech to text.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param continuous: whether to use continuous recognition
        :return: (success, result) tuple; result is a dict on success, an error string otherwise
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)
            if continuous:
                # Continuous recognition: collect every recognized segment until the session stops
                done = False
                recognized_segments = []
                detected_language = None

                def handle_result(evt):
                    nonlocal detected_language
                    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                        recognized_segments.append(evt.result.text)
                        detected_language = evt.result.properties.get(
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                        )

                def stop_cb(evt):
                    nonlocal done
                    done = True

                # Wire up the events
                speech_recognizer.recognized.connect(handle_result)
                speech_recognizer.session_stopped.connect(stop_cb)
                speech_recognizer.canceled.connect(stop_cb)
                # Start continuous recognition
                speech_recognizer.start_continuous_recognition()
                while not done:
                    time.sleep(0.5)
                speech_recognizer.stop_continuous_recognition()
                return True, {
                    "text": " ".join(recognized_segments),
                    "detected_language": detected_language
                }
            else:
                # Single-shot recognition
                result = speech_recognizer.recognize_once()
                if result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    detected_language = None
                    if hasattr(result, 'properties') and result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    ):
                        detected_language = result.properties[
                            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                        ]
                    return True, {
                        "text": result.text,
                        "detected_language": detected_language
                    }
                elif result.reason == speechsdk.ResultReason.NoMatch:
                    return False, f"Speech not recognized: {result.no_match_details}"
                elif result.reason == speechsdk.ResultReason.Canceled:
                    return False, f"Speech recognition canceled: {result.cancellation_details.reason}"
                return False, f"Speech recognition failed: {result.reason}"
        except Exception as e:
            return False, f"Error: {str(e)}"
    def start_continuous_recognition(self, languages=None, callback=None):
        """
        Start continuous speech recognition.
        :param languages: list of candidate languages, e.g. ["zh-CN", "en-US", "ja-JP"]
        :param callback: callback invoked with each recognized result
        :return: the speech_recognizer object, so the caller can control it later
        """
        try:
            if languages:
                # Multi-language support with automatic language detection
                auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
                    languages=languages
                )
                speech_recognizer = speechsdk.SpeechRecognizer(
                    speech_config=self.speech_config,
                    auto_detect_source_language_config=auto_detect_source_language_config
                )
            else:
                # Default to Chinese
                self.speech_config.speech_recognition_language = "zh-CN"
                speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config)

            # Handle recognized results
            def handle_result(evt):
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text
                    detected_language = evt.result.properties.get(
                        speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
                    )
                    if callback:
                        should_continue = callback(text, detected_language)
                        if should_continue is False:
                            # Stop recognition when the callback returns False
                            speech_recognizer.stop_continuous_recognition_async()

            # Handle cancellation and errors
            def handle_canceled(evt):
                if evt.reason == speechsdk.CancellationReason.Error:
                    print(f"Speech recognition error: {evt.error_details}")

            # Wire up the event handlers
            speech_recognizer.recognized.connect(handle_result)
            speech_recognizer.canceled.connect(handle_canceled)
            # Start continuous recognition
            speech_recognizer.start_continuous_recognition_async()
            return speech_recognizer
        except Exception as e:
            print(f"Failed to start continuous recognition: {str(e)}")
            raise
def text_to_speech(text: str, language: str = "zh-CN") -> None:
    """
    Convert text to speech (standalone helper, independent of SpeechService).
    Args:
        text: text to synthesize
        language: language code, defaults to Chinese
    """
    # Hardcoded Speech credentials for this standalone helper
    speech_key = "your Speech key"
    service_region = "your resource region, e.g. eastus"
    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )
    # Pick a voice that matches the language
    if language == "zh-CN":
        speech_config.speech_synthesis_voice_name = "zh-CN-XiaoxiaoNeural"
    else:
        speech_config.speech_synthesis_voice_name = "en-US-AriaNeural"
    # Create the synthesizer
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    # Run the synthesis
    result = speech_synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesis completed")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")
def speech_to_text(language: str = "zh-CN") -> str:
    """
    Convert speech to text (standalone helper, independent of SpeechService).
    Args:
        language: language code, defaults to Chinese
    Returns:
        the recognized text, or an empty string on failure
    """
    # Hardcoded Speech credentials for this standalone helper
    speech_key = "your Speech key"
    service_region = "your resource region"
    # Create the speech configuration
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key,
        region=service_region
    )
    # Set the recognition language
    speech_config.speech_recognition_language = language
    # Use the default microphone as the audio input
    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    # Create the recognizer
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config
    )
    print("Please start speaking...")
    # Run a single recognition
    result = speech_recognizer.recognize_once_async().get()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print(f"Speech not recognized: {result.no_match_details}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print(f"Speech recognition canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")
    return ""
stt01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your Speech key",
        service_region="eastus"
    )
    print("Please speak...")
    success, result = speech_service.speech_to_text(languages=["zh-CN", "en-US", "ja-JP"])
    if success:
        print(f"Recognized text: {result['text']}")
        if result['detected_language']:
            print(f"Detected language: {result['detected_language']}")
    else:
        print(f"Error: {result}")


if __name__ == "__main__":
    main()
tts01.py
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
from speech_utils import SpeechService


def main():
    # Create a SpeechService instance
    speech_service = SpeechService(
        speech_key="your Speech key",
        service_region="eastus"
    )
    print("Enter the text to convert to speech...")
    text = input()
    success, message = speech_service.text_to_speech(text)
    print(message)


if __name__ == "__main__":
    main()