Serving 200-dimensional Word2vec text vectors with FastAPI in Python
Setup
pip install "fastapi>=0.68.0"
pip install "uvicorn[standard]>=0.15.0"
pip install "gensim>=4.0.0"
pip install "jieba>=0.42.1"
pip install "numpy>=1.21.0"
pip install "scikit-learn>=1.0.0"
The version specifiers are quoted so the shell does not treat >= as a redirection. If anything else is missing, install it the same way.
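Before starting the service, it is worth confirming that the downloaded embedding file really is 200-dimensional. A minimal sketch, assuming the file follows the standard word2vec binary layout (the first line is an ASCII header "<vocab_size> <vector_size>") and sits at the path used in the code below:

# Quick pre-flight check of the embedding file's header.
# Assumes the standard word2vec binary format.
with open("../light_Tencent_AILab_ChineseEmbedding.bin", "rb") as f:
    vocab_size, vector_size = map(int, f.readline().decode("utf-8").split())
print(f"vocab={vocab_size}, dim={vector_size}")  # dim should be 200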
Code
from fastapi import FastAPI, HTTPException
from gensim.models import KeyedVectors
import jieba
import numpy as np
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

app = FastAPI(title="Text Embedding API")

# Path configuration
MODEL_PATH = os.path.abspath("../light_Tencent_AILab_ChineseEmbedding.bin")

# Populated at startup; the explicit default avoids a NameError in the
# readiness check if a request arrives before the model is loaded
model = None
# Load the model before the service accepts requests
@app.on_event("startup")
async def load_model():
    global model
    try:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
        model = KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True)
        logging.info(f"✅ Model loaded | vocabulary size: {len(model.key_to_index)}")
        logging.info(f"✅ Vector dimension: {model.vector_size}")  # should report 200
    except Exception as e:
        logging.error(f"❌ Initialization failed: {str(e)}")
        raise RuntimeError("Service initialization failed") from e
def text_to_vector(text: str) -> np.ndarray:
    """Return a single 200-dimensional vector for the input text."""
    words = jieba.lcut(text)
    vectors = []
    for word in words:
        if word in model.key_to_index:
            vec = model[word]
            # Verify the dimension of each word vector
            assert vec.shape == (200,), f"Unexpected word vector shape: {vec.shape}"
            vectors.append(vec)
    if not vectors:
        # No word was in the vocabulary: fall back to a zero vector
        return np.zeros(model.vector_size)
    # Mean pooling over the word vectors
    avg_vector = np.mean(vectors, axis=0)
    assert avg_vector.shape == (200,), f"Unexpected mean vector shape: {avg_vector.shape}"
    return avg_vector
@app.get("/vector")
async def get_vector(sentence: str):
if not model:
raise HTTPException(503, "服务未就绪")
if len(sentence.strip()) < 2:
raise HTTPException(400, "输入文本过短")
try:
vector = text_to_vector(sentence)
return {
"dimension": vector.size,
"vector": vector.tolist()
}
except Exception as e:
logging.error(f"处理失败:{str(e)}")
raise HTTPException(500, "内部错误")
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)