当前位置: 首页 > article >正文

OpenAI API + LangChain 分析小型 PDF 文档



from langchain_community.document_loaders import PyPDFLoader
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

class QA:
    """Answer a question about a PDF document with a retrieval-augmented chain.

    Attributes:
        question (str): The question to be answered about the PDF.
        pdf_path (str): Path to the PDF file.
        model_name (str): Name of the chat model used for analysis.
        docs (list): Loaded PDF documents; ``None`` until ``load_file`` runs.
        vecstore (Chroma): The vector store object for storing document
            embeddings; ``None`` until ``split_and_store`` runs.

    Methods:
        set_environ(): Set environment variables for the OpenAI API.
        load_file(): Load a PDF file using PyPDFLoader.
        split_and_store(): Split the PDF text and store embeddings using Chroma.
        retrieve_pdf(): Retrieve and answer questions based on the PDF content.
        run(): Execute all of the steps above in order and return the answer.
    """

    # Chat model used when ``model_name`` is empty (e.g. the user just
    # pressed Enter at the prompt).
    DEFAULT_MODEL = "gpt-4o"

    def __init__(self, question, pdf_path, model_name):
        """Initialize the QA object with the question, PDF path and model name.

        Args:
            question (str): The question to be answered about the PDF.
            pdf_path (str): Path to the PDF file.
            model_name (str): Name of the model used for analysis.
        """
        self.question = question
        self.pdf_path = pdf_path
        self.model_name = model_name
        self.docs = None
        self.vecstore = None

    def set_environ(self):
        """Set the environment variables necessary for OpenAI API authentication.

        The key is read with ``getpass`` so that the secret is not echoed to
        the terminal while it is typed.
        """
        os.environ['OPENAI_API_KEY'] = getpass.getpass("your api:")
        # Kept as an empty string, as in the original script (presumably
        # meaning "no proxy" -- confirm against your network setup).
        os.environ['OPENAI_PROXY'] = ''

    def load_file(self):
        """Load the PDF file at ``pdf_path`` into ``docs`` using PyPDFLoader."""
        loader = PyPDFLoader(self.pdf_path)
        self.docs = loader.load()

    def split_and_store(self):
        """Split the loaded PDF text into chunks and embed them into ``vecstore``.

        Raises:
            RuntimeError: If ``load_file`` has not been called yet.
        """
        if self.docs is None:
            raise RuntimeError("load_file() must be called before split_and_store()")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(self.docs)
        self.vecstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

    def retrieve_pdf(self):
        """Retrieve context from the vector store and answer the question.

        Builds a retrieval-augmented generation chain from the stored
        embeddings and the configured chat model, then invokes it with
        ``question``.

        Returns:
            The value returned by ``rag_chain.invoke``; ``run`` reads its
            ``"answer"`` entry.

        Raises:
            RuntimeError: If ``split_and_store`` has not been called yet.
        """
        if self.vecstore is None:
            raise RuntimeError("split_and_store() must be called before retrieve_pdf()")
        retriever = self.vecstore.as_retriever()
        # Honor the model chosen by the caller instead of a hard-coded name.
        llm = ChatOpenAI(model=self.model_name or self.DEFAULT_MODEL)

        # The "{context}" placeholder is where the stuff-documents chain
        # injects the retrieved chunks; the chain rejects prompts without it.
        system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        question_answer_chain = create_stuff_documents_chain(llm, prompt)
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        results = rag_chain.invoke({"input": self.question})
        return results

    def run(self):
        """Run the full pipeline: configure, load, index, then answer.

        Returns:
            The ``"answer"`` entry of the chain result for ``question``.
        """
        self.set_environ()
        self.load_file()
        self.split_and_store()
        results = self.retrieve_pdf()
        return results["answer"]

def __main__():
    """Main function to execute the QA class functionality.

    Prompts the user for the question, the PDF path and the model name,
    creates a QA object, processes the specified PDF and prints the answer.
    """
    question = input("Your question:")
    pdf_path = input("Enter the path of the pdf file:")
    model_name = input("Enter the model name:")
    qa = QA(question, pdf_path, model_name)
    # Actually execute the pipeline; building the object alone does nothing.
    answer = qa.run()
    if answer is not None:
        print(answer)

# Run the interactive entry point only when executed as a script.
if __name__ == "__main__":
    __main__()



  • Tomcat的配置文件中有哪些关键的配置项,它们分别有什么作用?
  • 【搜索引擎】ElasticSearch 7.x版本
  • 电单车TCP通讯协议对接phpworkermanHikversion充电桩上位机通讯协议
  • 【开源分享】在线客服系统PHP源码 带搭建教程
  • 【测试】JMeter从入门到进阶
  • 关于Avalonia程序在Linux上运行画面不显示的问题详解
  • 阅读笔记5:董超底层视觉之美|时空的交错与融合——论视频超分辨率
  • 2024年新算法-基于SBOA-BP混合神经网络的数据预测(Python代码实现)
  • 本地生活服务商系统如何利用本地推获得更多曝光?
  • 排序补充之快排的三路划分法
  • Shell 脚本开发学习
  • SQL函数
  • 5.diff算法和虚拟dom
  • Java接口中的长连接与短连接详解:概念、应用场景及实现
  • RDMA驱动学习(一)- 用户态到内核态的过程
  • 【从问题中去学习k8s】k8s中的常见面试题(夯实理论基础)(十五)
  • Spring Boot 中的 starter 是什么
  • 在Excel中使用VLOOKUP函数时避免显示NA和0
  • 实时变声器免费版:支持微信/QQ等语音实时变声(win版+mac版)
  • 【GCC】编译选项与告警(C/C++建议开启)