Loading and Summarizing PDF Files with LangChain V3.0
LangChain has now been updated to V3. I had been on V1, where a lot of functionality had to be wrapped by hand; after rereading the V3 API docs, I found that many of those tasks are now covered by convenient methods with simple, clear call patterns. Below is my reworked pipeline for loading and summarizing PDF files:
import concurrent.futures
import time

from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Record the start time
start_time = time.time()
# Load the PDF. The loader supports two modes: "single" reads the whole file as one document (fine for small files), while "page" loads one document per page (better for large files). This can be tuned further; choose per your needs.
loader = PyMuPDFLoader(
    file_path="your online pdf url",
    mode="page")
pages = loader.load()
print(f"总共加载了 {len(pages)} 页")
merged_docs = []
pages_per_chunk = 15  # merge every 15 pages into one Document object
for i in range(0, len(pages), pages_per_chunk):
    # Concatenate the text of the pages in this chunk
    combined_text = ""
    for j in range(i, min(i + pages_per_chunk, len(pages))):  # handles a final chunk of fewer than 15 pages
        combined_text += pages[j].page_content + "\n"
    # Wrap the merged text in a new Document object
    merged_docs.append(Document(page_content=combined_text.strip(),
                                metadata={"start_page": i + 1, "end_page": min(i + pages_per_chunk, len(pages))}))
# Sanity-check the number of merged document chunks
print(f"Merged document count: {len(merged_docs)}")
# ############ Split the merged text into manageable chunks
# Define the text splitter (chunk_size/chunk_overlap are measured in characters, not tokens)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=6000,    # as large as practical, to pack more information into each request
    chunk_overlap=1000  # generous overlap to preserve context across chunks
)
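# If you would rather measure chunks in actual model tokens, the splitter can
# be built from a tiktoken encoder — a sketch assuming the tiktoken package is
# installed (1500/200 are placeholder values, not tuned numbers):
# token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     model_name="gpt-3.5-turbo", chunk_size=1500, chunk_overlap=200)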
# Split the merged PDF documents
docs = text_splitter.split_documents(merged_docs)
print(f"Split into {len(docs)} text chunks")
OPENAI_API_KEY = "your api key"
llm = ChatOpenAI(model="gpt-3.5-turbo", streaming=False,
                 api_key=OPENAI_API_KEY)
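# Summaries are usually wanted reproducible; adding temperature=0 (my own
# assumption, not part of the original setup) makes the output near-deterministic:
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=OPENAI_API_KEY)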
map_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content in English in approximately 50 words, keeping technical terms unchanged. Be concise and to the point. Use simple language and avoid unnecessary details:\n\n{text}"""
)
combine_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content: {text}. Use the following criteria:
1. Create an attention-grabbing title under 10 words with emojis.
2. Break down complex ideas into bite-sized, memorable points, keeping it under 200 words total."""
)
final_map_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Summarize this content: {text}. Use the following criteria:
Create:
1. Title (requirements):
   - Create a clear, descriptive title under 10 words
   - Capture the main topic or key finding
   - Use professional language
   - Format as "Title: [Your Title]"
2. Summary content (requirements):
   - Main argument or central theme (1-2 sentences)
   - 3-4 most important supporting points
   - Key conclusions
   - Output must be in English regardless of source language
   - Keep the total summary under 200 words
   - Use professional language but avoid unnecessary jargon
   - Preserve all key technical terms in original form when appropriate"""
)
# Build the LangChain summarization chain
print("\nStarting summarization:")
def summarize_document(doc):
    # Each worker builds its own map-reduce chain; chain construction is cheap
    chain = load_summarize_chain(llm, chain_type="map_reduce",
                                 map_prompt=map_prompt, combine_prompt=combine_prompt)
    return chain.invoke([doc])
# Summarize the chunks in parallel with a thread pool
with concurrent.futures.ThreadPoolExecutor() as executor:
    summaries = list(executor.map(summarize_document, docs))
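# With many chunks in flight at once, OpenAI rate limits can start rejecting
# requests; capping the pool is a simple guard (a sketch — the value 4 is an
# arbitrary assumption, tune it to your account's limits):
# with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
#     summaries = list(executor.map(summarize_document, docs))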
# Join all the per-chunk summaries
combined_summaries = "\n\n".join([summary['output_text'] for summary in summaries])
print(f"Generated {len(summaries)} chunk summaries")
# Run one more map-reduce pass over the joined summaries to get the final result
final_chain = load_summarize_chain(llm, chain_type="map_reduce",
                                   map_prompt=final_map_prompt, combine_prompt=combine_prompt)
final_summary = final_chain.invoke([Document(page_content=combined_summaries)])
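# load_summarize_chain is the legacy helper; in current LangChain this final
# "stuff everything into one prompt" step can also be written with the newer
# LCEL-style builder — a sketch, where the prompt wording is my own assumption:
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate
# stuff_prompt = ChatPromptTemplate.from_template(
#     "Write a titled summary, under 200 words, of these notes:\n\n{context}")
# stuff_chain = create_stuff_documents_chain(llm, stuff_prompt)
# alt_final = stuff_chain.invoke({"context": [Document(page_content=combined_summaries)]})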
# Print the final summary
print("\nFinal summary:")
print(final_summary['output_text'])
# Record the end time
end_time = time.time()
# Compute the total elapsed time
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.2f} seconds")
I hope the above is useful. If you see anything worth optimizing, please reach out in the comments — many thanks!