Running a GPT locally with clean, simple output (command line, GUI, web)
Contents

Screenshots
Command-line code
GUI run
Web run
Screenshots

[Screenshot: command-line run]
[Screenshot: tkinter GUI run]
[Screenshot: web run]
Command-line code
import os
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = os.path.join(os.path.dirname(__file__), "Qwen2.5")

# Load the local model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",    # force CPU here; use "auto" to let accelerate place layers on GPU/CPU
    torch_dtype="auto"   # use the precision stored in the checkpoint (e.g. FP16)
)

# Make sure a padding token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Build the conversation
messages = [
    {"role": "user", "content": "Who are you?"}
]
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Subclass TextStreamer to control how finalized text is printed
class Streamer(TextStreamer):
    def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
        super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)

    def on_finalized_text(self, text, stream_end=False):
        # Print tokens as they arrive; only emit a newline when the stream ends
        print(text, flush=True, end="" if not stream_end else None)

# Create the streaming text handler
streamer = Streamer(tokenizer)

# Generate the reply
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt").to(model.device)
start_time = time.time()  # time generation only, not model loading
outputs = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=5,
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    streamer=streamer
)

# Decode the full result (for statistics)
response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

# Print statistics: count tokens, not characters, for the rate
elapsed = time.time() - start_time
token_count = len(tokenizer(response)["input_ids"])
print(f"token count: {token_count}")
print(f"tokens: {token_count / elapsed:.2f} /s")
GUI run
import os
import time
import threading
import tkinter as tk
from tkinter import scrolledtext
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Load the local model and tokenizer
model_path = os.path.join(os.path.dirname(__file__), "Qwen2.5")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",    # force CPU here; use "auto" to let accelerate place layers on GPU/CPU
    torch_dtype="auto"   # use the precision stored in the checkpoint (e.g. FP16)
)

# Make sure a padding token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Subclass TextStreamer to route finalized text into the Text widget
class Streamer(TextStreamer):
    def __init__(self, tokenizer, text_widget, skip_prompt=True, skip_special_tokens=True):
        super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
        self.text_widget = text_widget

    def on_finalized_text(self, text, stream_end=False):
        # Called from the worker thread; many Tk builds tolerate this,
        # but see the thread-safe sketch after this script
        self.text_widget.insert(tk.END, text)
        self.text_widget.see(tk.END)

def generate_response(input_text_str, start_time):
    # Create the streaming text handler
    streamer = Streamer(tokenizer, output_text)
    # Generate the reply
    inputs = tokenizer(input_text_str, padding=True, truncation=True, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=10,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        streamer=streamer
    )
    # Decode the full result (for statistics)
    response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
    # Show statistics: count tokens, not characters, for the rate
    token_count = len(tokenizer(response)["input_ids"])
    elapsed = time.time() - start_time
    output_text.insert(tk.END, f"\ntoken count: {token_count}")
    output_text.insert(tk.END, f"\ntokens: {token_count / elapsed:.2f} /s")

def send_message():
    # Clear the output box
    output_text.delete("1.0", tk.END)
    # Read the input box
    input_content = input_entry.get()
    # Build the conversation
    messages = [
        {"role": "user", "content": input_content}
    ]
    input_text_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    start_time = time.time()
    # Run generation on a worker thread so the UI stays responsive
    thread = threading.Thread(target=generate_response, args=(input_text_str, start_time))
    thread.start()
    # Clear the input box
    input_entry.delete(0, tk.END)

# Create the main window
root = tk.Tk()
root.title("Chat")
root.config(highlightthickness=0)  # no highlight border
bg_color = '#F0F0F0'
# root.overrideredirect(True)
root.wm_attributes('-transparentcolor', bg_color)  # Windows-only: render bg_color as transparent

# Output box
output_text = scrolledtext.ScrolledText(root, width=60, height=20, bg=bg_color, font=("黑体", 15), fg="#FFFFFF")
output_text.pack(pady=10)

# A Frame to hold the input row
input_frame = tk.Frame(root)
input_frame.pack(side=tk.BOTTOM, fill=tk.X)

# Input box, parented to input_frame
input_entry = tk.Entry(input_frame, width=80)
input_entry.pack(side=tk.LEFT, padx=10, pady=10)

# Send button, parented to input_frame
send_button = tk.Button(input_frame, text="Send", command=send_message, bg="white")
send_button.pack(side=tk.RIGHT, padx=10, pady=10)

# Run the main loop
root.mainloop()
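One caveat: on_finalized_text above touches the Text widget from the worker thread, which many Tk builds tolerate but which is not guaranteed thread-safe. A more defensive variant, sketched here under the assumption that the rest of the script stays unchanged, lets the worker push text onto a queue.Queue while the Tk main loop polls it and does all widget updates itself:

# Hedged sketch of a thread-safe streaming path: the worker thread only
# writes to the queue; root.after keeps widget updates on the main thread.
import queue

text_queue = queue.Queue()

class QueueStreamer(TextStreamer):
    # hypothetical drop-in replacement for the Streamer class above
    def on_finalized_text(self, text, stream_end=False):
        text_queue.put(text)

def drain_queue():
    try:
        while True:
            output_text.insert(tk.END, text_queue.get_nowait())
            output_text.see(tk.END)
    except queue.Empty:
        pass
    root.after(50, drain_queue)  # poll again in 50 ms

root.after(50, drain_queue)  # start polling before root.mainloop()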
Web run
# streamlit run 网页运行.py
import os
import time
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

torch.manual_seed(0)
text_all = ""  # accumulates the streamed reply for display

# Load the local tokenizer
model_path = os.path.join(os.path.dirname(__file__), "Qwen2.5")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
except Exception as e:
    st.error(f"Failed to load tokenizer: {e}")
    st.stop()

def load_model(option):
    try:
        if option == "cpu":
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="cpu",
                torch_dtype=torch.float32
            )
        elif option == "gpu":
            if not torch.cuda.is_available():
                st.warning("GPU not available; falling back to CPU.")
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="cpu",
                    torch_dtype=torch.float32
                )
            else:
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="cuda",
                    torch_dtype=torch.float16
                )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="cpu",
                torch_dtype=torch.float32
            )
        return model
    except Exception as e:
        st.error(f"Failed to load model: {e}")
        st.stop()

# Subclass TextStreamer to stream finalized text into a Streamlit container
class Streamer(TextStreamer):
    def __init__(self, tokenizer, st_container, skip_prompt=True, skip_special_tokens=True):
        super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
        self.st_container = st_container

    def on_finalized_text(self, text, stream_end=False):
        global text_all
        text_all += text
        self.st_container.write(text_all, unsafe_allow_html=True)

def generate_response(text_number, input_content, text_temperature, text_top_p, place, place_token,
                      place_time, option, place_all):
    global text_all
    text_all = ""  # reset the accumulator so replies don't concatenate across runs
    model = load_model(option)
    # Make sure a padding token exists
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Build the conversation
    messages = [
        {"role": "user", "content": input_content}
    ]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    start_time = time.time()
    # Create the streaming text handler
    streamer = Streamer(tokenizer, place)
    # Encode the input, including the attention mask
    inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt").to(model.device)
    attention_mask = inputs.get('attention_mask')
    input_ids = inputs['input_ids']
    # Generate the full reply in one call
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=text_number,
            temperature=text_temperature,
            top_p=text_top_p,
            repetition_penalty=1.02,
            do_sample=True,
            streamer=streamer
        )
    # Decode the full result (for statistics)
    response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
    token_count = len(tokenizer(response)["input_ids"])
    # Elapsed time in seconds
    elapsed_time = time.time() - start_time
    # Report the token rate
    if elapsed_time > 0:
        tokens_per_second = token_count / elapsed_time
        place_token.write(f"average tokens: {tokens_per_second:.2f}/s")
    place_time.write(f"elapsed: {elapsed_time:.2f}s")
    place_all.write(f"tokens: {token_count}")
    print("generation finished")

if __name__ == '__main__':
    st.header(':blue[Qwen2.5] :sunglasses:', divider='rainbow')
    with st.chat_message("assistant"):
        st.write("Hello 👋 I am :blue[Qwen2.5]")
    option = st.sidebar.selectbox(
        'Run on CPU or GPU',
        ('cpu', 'gpu')
    )
    text_number = st.sidebar.slider('Max new tokens', 10, 500, 100, 10)
    text_temperature = st.sidebar.slider('Temperature', 0.1, 1.0, 0.8, 0.1)
    text_top_p = st.sidebar.slider('Top-p', 0.1, 1.0, 0.8, 0.1)
    st.sidebar.button(":blue[Pause]")  # placeholder: no callback wired up
    prompt = st.chat_input("Type something...")
    if prompt:
        st.write(f"Input text: {prompt}")
        st.write(f'{len(prompt)} characters.')
        st.divider()
        with st.spinner('Just a moment...'):
            with st.chat_message("assistant"):
                st.write('Question: ' + prompt)
                place = st.empty()
                place_token = st.empty()
                place_time = st.empty()
                place_all = st.empty()
                generate_response(text_number, prompt, text_temperature, text_top_p, place, place_token,
                                  place_time, option, place_all)
        st.success('Done!')
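Because Streamlit re-runs the whole script on every interaction, generate_response reloads the model for each prompt, which dominates latency on CPU. A small sketch, assuming Streamlit 1.18+ where st.cache_resource is available, caches one model instance per device option across reruns:

# Caching sketch: wrap the existing loader so the model survives reruns.
@st.cache_resource
def load_model_cached(option):
    return load_model(option)  # reuses load_model defined above

# then, inside generate_response, replace:
#     model = load_model(option)
# with:
#     model = load_model_cached(option)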