RAG DEMO 上部
#拆分pdf类型代码块
#pip install pypdf -i https://pypi.tuna.tsinghua.edu.cn/simple (有版本依赖问题)
#pip install --upgrade cryptography -i https://pypi.tuna.tsinghua.edu.cn/simple
from langchain.document_loaders import PyPDFLoader, NotionDirectoryLoader
from langchain_text_splitters import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
# Read LLM.pdf into LangChain documents, then cut them into overlapping chunks.
loader = PyPDFLoader("LLM.pdf")
pages = loader.load()

# Lengths are measured with len(), i.e. in characters: text is split on
# newlines, chunks target 1000 characters, and neighbouring chunks overlap by
# 150 characters.
splitter_options = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(pages)
#pip install langchain_ollama -i https://pypi.tuna.tsinghua.edu.cn/simple
from langchain_ollama import OllamaEmbeddings
# Embedding client that calls the Ollama server at base_url with the
# lrs33/bce-embedding-base_v1 model.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434/",
    model="lrs33/bce-embedding-base_v1",
)
#pip install -qU langchain-postgres -i https://pypi.tuna.tsinghua.edu.cn/simple
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
# SQLAlchemy URL of the PostgreSQL instance that stores the vectors.
# langchain_postgres is built on psycopg 3, whose SQLAlchemy driver name is
# "psycopg" (not "psycopg2"), so the URL must start with postgresql+psycopg://.
# NOTE(review): host and credentials are hard-coded for this demo; move them
# to configuration before using this outside a test environment.
CONNECTION_STRING = "postgresql+psycopg://postgres:password@192.168.159.130:5432/postgres"
# Name of the vector store collection.
COLLECTION_NAME = "yaofang_test"
# Build the index: embed every chunk in `docs` with `embeddings` and store the
# result in the collection named above.
vector = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
    # Per the PGVector API, this deletes an existing collection of the same
    # name first, so every run rebuilds the index from scratch (destructive).
    pre_delete_collection=True,
)
数据在 Navicat 中的显示(之所以使用 pg,是因为它易用、性能高、可控性高;缺点是学习成本高,遇到的问题基本都是版本依赖问题)
此代码完成了将 PDF 切分为 chunk 并保存到 PGVector 数据库的过程。代码中的 LLM.pdf 文件为“java 23种设计模式学习资料”。下期代码将从数据库获取向量维度数据,并通过 chain 流式返回。