Appearance
小型知识库实战
8.1 读书笔记知识库
8.1.1 功能需求
- 支持上传电子书、PDF 文档
- 自动提取章节结构
- 智能问答功能
- 笔记管理和标签系统
- 阅读进度追踪
8.1.2 实现方案
python
# 读书笔记知识库实现示例
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
# 初始化知识库
def init_knowledgebase(pdf_path: str = "book.pdf", chunk_size: int = 1000, chunk_overlap: int = 100):
    """Build a Chroma vector store from a single PDF file.

    The PDF is loaded, split into overlapping chunks, embedded with the
    ``BAAI/bge-base-zh-v1.5`` HuggingFace model and indexed in Chroma.

    Args:
        pdf_path: Path of the PDF to index. Defaults to ``"book.pdf"``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # Load the document.
    pages = PyPDFLoader(pdf_path).load()

    # Split it into chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)

    # Embed the chunks and index them.
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-zh-v1.5")
    return Chroma.from_documents(documents=chunks, embedding=embeddings)
# 创建问答链
def create_qa_chain(vector_store):
    """Create a "stuff" RetrievalQA chain backed by *vector_store*.

    Args:
        vector_store: Vector store exposing ``as_retriever``; used to fetch
            the context passed to the LLM.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``.
    """
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
    # The top-k limit has to be supplied through ``search_kwargs``; a bare
    # ``k=3`` keyword is not one of the retriever's search settings.
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
# 主应用
# Script entry point: build the knowledge base once, then serve a question box.
# NOTE(review): the indentation of this snippet was lost; nesting has to be
# read from the if/else keywords.
if __name__ == "__main__":
st.title("读书笔记知识库")
# Initialise the knowledge base and wrap it in a QA chain.
# NOTE(review): Streamlit presumably re-executes the script on every widget
# interaction, which would rebuild the index each time — confirm and consider caching.
vector_store = init_knowledgebase()
qa_chain = create_qa_chain(vector_store)
# Question-answering UI.
query = st.text_input("请输入你的问题:")
if st.button("提交"):
# Only call the chain when the input box is non-empty.
if query:
# The question is passed under "query"; the answer is read from "result".
result = qa_chain({"query": query})
st.write(result["result"])
else:
st.warning("请输入问题")
8.2 文档问答助手
8.2.1 功能需求
- 支持多种文档格式(PDF、Word、Excel、Markdown)
- 批量文档处理
- 智能问答功能
- 文档分类和标签
- 搜索和过滤功能
8.2.2 实现方案
python
# 文档问答助手实现示例
import streamlit as st
import os
import tempfile
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
# 加载文档
def load_documents(uploaded_files):
    """Load uploaded files into a flat list of documents.

    Each supported upload is written to a temporary file (the loaders are
    given a file path), loaded with the loader matching its extension, and
    every resulting document gets the original file name stored under
    ``metadata["source"]``. The temporary file is always removed afterwards.
    Files with an unsupported extension trigger a Streamlit warning and are
    skipped.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``name``
            and ``getvalue()``.

    Returns:
        A list with the documents loaded from all supported files; empty
        when nothing was loaded.
    """
    documents = []
    for file in uploaded_files:
        # Match extensions case-insensitively so "REPORT.PDF" is handled
        # like "report.pdf".
        lowered_name = file.name.lower()
        if lowered_name.endswith(".pdf"):
            loader_cls = PyPDFLoader
        elif lowered_name.endswith(".docx"):
            loader_cls = Docx2txtLoader
        elif lowered_name.endswith((".txt", ".md")):
            # NOTE(review): no explicit encoding is passed, so decoding
            # presumably follows the platform default — confirm that UTF-8
            # files load correctly on every target platform.
            loader_cls = TextLoader
        else:
            # Reject unsupported types before anything is written to disk.
            st.warning(f"不支持的文件类型: {file.name}")
            continue

        suffix = os.path.splitext(file.name)[1]
        # delete=False: the file must outlive the ``with`` block so the
        # loader can reopen it by path; it is unlinked manually below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(file.getvalue())
            tmp_path = tmp.name

        try:
            docs = loader_cls(tmp_path).load()
        finally:
            os.unlink(tmp_path)

        # Report the uploaded name as the source, not the temp path.
        for doc in docs:
            doc.metadata["source"] = file.name
        documents.extend(docs)
    return documents
# 主应用
# Script entry point: upload documents, index them and answer questions.
# NOTE(review): the indentation of this snippet was lost; nesting has to be
# read from the if/else keywords.
if __name__ == "__main__":
st.title("文档问答助手")
# File upload: several files of the listed types may be selected at once.
uploaded_files = st.file_uploader("上传文档", type=["pdf", "docx", "txt", "md"], accept_multiple_files=True)
if uploaded_files:
# Load the uploaded files.
documents = load_documents(uploaded_files)
# NOTE(review): this reports the number of loaded document objects, which may
# differ from the number of uploaded files — confirm the intended wording.
st.success(f"成功加载 {len(documents)} 个文档")
# Split the documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)
# Embed the chunks and index them in Chroma.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-zh-v1.5")
vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings)
# Create the LLM used by the QA chain.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
# Build the retrieval QA chain and keep the retrieved documents in the result.
# The top-k limit has to be supplied through `search_kwargs`; a bare `k=3`
# keyword is not one of the retriever's search settings.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
)
# Question-answering UI.
query = st.text_input("请输入你的问题:")
if st.button("提交"):
# Only call the chain when the input box is non-empty.
if query:
# The question is passed under "query"; the answer is read from "result".
result = qa_chain({"query": query})
st.write(result["result"])
# Show the source documents, one line per entry of result["source_documents"].
st.write("来源文档:")
for doc in result["source_documents"]:
# Falls back to '未知' when a document carries no "source" metadata.
st.write(f"- {doc.metadata.get('source', '未知')}")
else:
st.warning("请输入问题")
8.3 本地私有化部署
8.3.1 部署方案
- 本地服务器部署:使用本地电脑或服务器部署
- Docker 容器化:使用 Docker 容器化部署
- 离线运行:确保在无网络环境下也能运行
- 数据安全:确保数据存储在本地,不泄露
8.3.2 部署步骤
准备环境:
```bash
# 安装依赖
pip install -r requirements.txt
```
构建 Docker 镜像:
```dockerfile
FROM python:3.10-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8501
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
```
运行容器:
```bash
docker build -t knowledgebase-app .
docker run -p 8501:8501 -v ./data:/app/data knowledgebase-app
```
访问应用:
- 本地访问:http://localhost:8501
- 局域网访问:http://<服务器IP>:8501
8.4 本章小结
- 学习了如何构建读书笔记知识库
- 掌握了文档问答助手的实现方法
- 了解了本地私有化部署的方案和步骤
- 实践了小型知识库的完整开发流程
