LLM之基于llama-index部署本地embedding与GLM-4模型对rag系统进行测评
前言
有时间再写吧
评估指标
llama-Index 内置了评估工具,今天教大家如何使用
llama-Index 有以下评估指标:
-
Answer Relevancy
-
Context Relevancy
-
Relevancy
-
Faithfulness
-
Correctness
感兴趣可以去 llama_index.core.evaluation 文件查看
当然llama-Index 还提供了测试数据的生成功能,可以帮助我们轻松地生成评估所需的测试数据,包括评估的问题、参考答案等,这样我们就可以快速地进行评估工作,而不需要花费大量的时间去准备测试数据
生成测试数据
from llama_index.core.llama_dataset.rag import LabelledRagDataset
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.evaluation import AnswerRelevancyEvaluator
from llama_index.core.node_parser import SentenceSplitter
import os
from typing import Any
from llama_index.core.llms import (
CustomLLM,
CompletionResponse,
CompletionResponseGen,
LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from datetime import datetime
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.evaluation import ContextRelevancyEvaluator
class GLMCustomLLM(CustomLLM):
    """Custom llama-index LLM wrapper around a local GLM-4 chat model loaded with transformers."""

    context_window: int = 128000  # context window size
    num_output: int = 18000  # maximum number of output tokens
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None  # tokenizer instance, set in __init__
    model: object = None  # model instance, set in __init__
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        """Load tokenizer and model from a local path onto the GPU.

        Args:
            pretrained_model_name_or_path: local directory (or hub id) of the GLM-4 checkpoint.
        """
        super().__init__()
        # Load the model on GPU (device_map="cuda"); trust_remote_code is required for GLM-4.
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cuda",
                                                       trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda",
                                                          trust_remote_code=True).eval()
        # Run in float32 — presumably to avoid half-precision issues on this setup (TODO confirm).
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata (context window, output budget, model name)."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()  # callback hook for llama-index instrumentation
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a full completion for *prompt* in a single call."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        # NOTE(review): decoding outputs[0] includes the prompt tokens as well —
        # confirm whether the prompt should be stripped from the response text.
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Pseudo-streaming completion: generate the full text, then yield it character by character."""
        print("流式完成函数")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        # Generation is not incremental; each yielded "token" is a single character.
        for token in response:
            yield CompletionResponse(text=token, delta=token)
# Load the local GLM-4 model used both for question generation and evaluation.
llm = GLMCustomLLM(pretrained_model_name_or_path='/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat')
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
# Input folder path
documents = SimpleDirectoryReader("/home/kelvin/nlp/GLM-4/basic_demo/input").load_data()
dataset_generator = RagDatasetGenerator.from_documents(
    documents,
    llm=llm,
    num_questions_per_chunk=1,  # one evaluation question per document chunk
)
# Preview: generate and print only the questions (no reference answers yet).
dataset = dataset_generator.generate_questions_from_nodes()
examples = dataset.examples
for i, example in enumerate(examples):
    print(f"{i + 1}. {example.query}")
# Path where the test dataset is cached
dataset_json = "/home/kelvin/nlp/GLM-4/basic_demo/test-dataset.json"
# Generate the full dataset (questions + reference answers) once, then reuse the cached JSON.
if not os.path.exists(dataset_json):
    dataset = dataset_generator.generate_dataset_from_nodes()
    examples = dataset.examples
    dataset.save_json(dataset_json)
else:
    dataset = LabelledRagDataset.from_json(dataset_json)
    examples = dataset.examples
Answer Relevancy与Context Relevancy
Answer Relevancy 是评估 Answer 和 Question 的相关性,这个指标可以帮助我们评估生成的答案是否和问题相关
Context Relevancy 是评估 Context 和 Question 的相关性,这个指标可以帮助我们评估检索到的文档上下文和问题的相关性
Answer Relevancy:将问题和答案传递给AnswerRelevancyEvaluator评估器,通过evaluate方法来评估问题和答案的相关性
Context Relevancy:将问题和检索到的文档上下文传递给ContextRelevancyEvaluator评估器,通过evaluate方法来评估问题和上下文的相关性
评估结果的score范围是 0~1,得分越高表示答案和问题的相关性越高,得分为 1 表示完全相关
评估结果中还有feedback属性,用来解释评估结果,这个属性可以帮助我们了解评估结果的产生原因
示例代码:
from llama_index.core.llama_dataset.rag import LabelledRagDataset
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.evaluation import AnswerRelevancyEvaluator
from llama_index.core.node_parser import SentenceSplitter
import os
from typing import Any
from llama_index.core.llms import (
CustomLLM,
CompletionResponse,
CompletionResponseGen,
LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from datetime import datetime
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.evaluation import ContextRelevancyEvaluator
class GLMCustomLLM(CustomLLM):
    """Custom llama-index LLM wrapper around a local GLM-4 chat model loaded with transformers."""

    context_window: int = 128000  # context window size
    num_output: int = 18000  # maximum number of output tokens
    model_name: str = "glm-4-9b-chat"  # model name
    tokenizer: object = None  # tokenizer instance, set in __init__
    model: object = None  # model instance, set in __init__
    dummy_response: str = "My response"

    def __init__(self, pretrained_model_name_or_path):
        """Load tokenizer and model from a local path onto the GPU.

        Args:
            pretrained_model_name_or_path: local directory (or hub id) of the GLM-4 checkpoint.
        """
        super().__init__()
        # Load the model on GPU (device_map="cuda"); trust_remote_code is required for GLM-4.
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, device_map="cuda",
                                                       trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, device_map="cuda",
                                                          trust_remote_code=True).eval()
        # Run in float32 — presumably to avoid half-precision issues on this setup (TODO confirm).
        self.model = self.model.float()

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata (context window, output budget, model name)."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()  # callback hook for llama-index instrumentation
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a full completion for *prompt* in a single call."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        # NOTE(review): decoding outputs[0] includes the prompt tokens as well —
        # confirm whether the prompt should be stripped from the response text.
        response = self.tokenizer.decode(outputs[0])
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Pseudo-streaming completion: generate the full text, then yield it character by character."""
        print("流式完成函数")
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').cuda()  # GPU path
        # inputs = self.tokenizer.encode(prompt, return_tensors='pt')  # CPU path
        outputs = self.model.generate(inputs, max_length=self.num_output)
        response = self.tokenizer.decode(outputs[0])
        # Generation is not incremental; each yielded "token" is a single character.
        for token in response:
            yield CompletionResponse(text=token, delta=token)
# Load the local GLM-4 model used as both the query LLM and the evaluation judge.
llm = GLMCustomLLM(pretrained_model_name_or_path='/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat')
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
documents = SimpleDirectoryReader("/home/kelvin/nlp/GLM-4/basic_demo/input").load_data()
# Load the previously generated test dataset and evaluate on its first question.
dataset_json = "/home/kelvin/nlp/GLM-4/basic_demo/test-dataset.json"
dataset = LabelledRagDataset.from_json(dataset_json)
examples = dataset.examples
question = examples[0].query
# Split documents into sentence-level nodes for indexing.
node_parser = SentenceSplitter()
nodes = node_parser.get_nodes_from_documents(documents)
# Register the local LLM and embedding model as the llama-index global defaults.
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name=f"{embed_model_path}", device='cuda'
)
print(f'检索中...')
vector_index = VectorStoreIndex(nodes)
engine = vector_index.as_query_engine()
response = engine.query(question)
answer = str(response)
print(f"question={question}")
print(f'************')
print(f"Answer: {answer}")
# Answer relevancy: is the generated answer relevant to the question? (score in 0~1)
evaluator = AnswerRelevancyEvaluator(llm)
result = evaluator.evaluate(query=question, response=answer)
print(f"score: {result.score}")
print(f"feedback: {result.feedback}")
print(f'*****----------******')
# Context relevancy: are the retrieved contexts relevant to the question? (score in 0~1)
contexts = [n.get_content() for n in response.source_nodes]
evaluator = ContextRelevancyEvaluator(llm)
result = evaluator.evaluate(query=question, contexts=contexts)
print(f"ContextRelevancy_score: {result.score}")
print(f"ContextRelevancy_feedback: {result.feedback}")
# Fixed: the original f-string contained a stray trailing double quote.
print(f'ContextRelevancy_query: {result.query}')
欢迎大家点赞或收藏~
大家的点赞或收藏可以鼓励作者加快更新哟~