Skip to content

小型知识库实战

8.1 读书笔记知识库

8.1.1 功能需求

  • 支持上传电子书、PDF 文档
  • 自动提取章节结构
  • 智能问答功能
  • 笔记管理和标签系统
  • 阅读进度追踪

8.1.2 实现方案

python
# 读书笔记知识库实现示例
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# 初始化知识库
def init_knowledgebase(pdf_path="book.pdf", chunk_size=1000, chunk_overlap=100):
    """Build an in-memory Chroma vector store from a PDF document.

    Args:
        pdf_path: Path of the PDF to index. Defaults to "book.pdf" to
            preserve the original hard-coded behavior.
        chunk_size: Maximum characters per chunk fed to the embedder.
        chunk_overlap: Characters shared between consecutive chunks so
            context is not cut mid-sentence.

    Returns:
        A Chroma vector store containing the embedded document chunks.
    """
    # Load the document (one LangChain Document per PDF page).
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Split into overlapping chunks for retrieval-sized pieces.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)

    # Embed with a Chinese-optimized model and index in Chroma.
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-zh-v1.5")
    vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings)

    return vector_store

# 创建问答链
def create_qa_chain(vector_store):
    """Create a RetrievalQA chain over the given vector store.

    Args:
        vector_store: A LangChain vector store supporting ``as_retriever``.

    Returns:
        A RetrievalQA chain that stuffs the top-3 retrieved chunks into
        the prompt and answers with gpt-3.5-turbo.
    """
    # Low temperature keeps answers grounded in the retrieved text.
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        # Fix: top-k must go through search_kwargs — as_retriever(k=3)
        # does not configure how many chunks the retriever returns.
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    )
    return chain

# 主应用
# Main application entry point.
if __name__ == "__main__":
    st.title("读书笔记知识库")

    # Streamlit reruns this whole script on every user interaction, so
    # building the index inline would re-load and re-embed the PDF on
    # every question. Cache the expensive step across reruns.
    # (st.cache_resource requires Streamlit >= 1.18 — TODO confirm version.)
    vector_store = st.cache_resource(init_knowledgebase)()
    qa_chain = create_qa_chain(vector_store)

    # Q&A interface.
    query = st.text_input("请输入你的问题:")
    if st.button("提交"):
        if query:
            result = qa_chain({"query": query})
            st.write(result["result"])
        else:
            st.warning("请输入问题")

8.2 文档问答助手

8.2.1 功能需求

  • 支持多种文档格式(PDF、Word、Excel、Markdown)
  • 批量文档处理
  • 智能问答功能
  • 文档分类和标签
  • 搜索和过滤功能

8.2.2 实现方案

python
# 文档问答助手实现示例
import streamlit as st
import os
import tempfile
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# 加载文档
def load_documents(uploaded_files):
    """Convert Streamlit uploads into LangChain documents.

    Each upload is written to a temporary file (loaders need a real
    path), loaded with the loader matching its extension, and every
    resulting document is tagged with the original file name in its
    metadata. Unsupported extensions produce a warning and are skipped.
    The temporary file is always removed, even if loading fails.
    """
    collected = []
    for upload in uploaded_files:
        suffix = os.path.splitext(upload.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            handle.write(upload.getvalue())
            temp_path = handle.name

        try:
            if upload.name.endswith('.pdf'):
                doc_loader = PyPDFLoader(temp_path)
            elif upload.name.endswith('.docx'):
                doc_loader = Docx2txtLoader(temp_path)
            elif upload.name.endswith(('.txt', '.md')):
                doc_loader = TextLoader(temp_path)
            else:
                st.warning(f"不支持的文件类型: {upload.name}")
                continue

            loaded = doc_loader.load()
            for item in loaded:
                # Record the original upload name so answers can cite sources.
                item.metadata["source"] = upload.name
            collected.extend(loaded)
        finally:
            os.unlink(temp_path)

    return collected

# 主应用
# Main application entry point.
if __name__ == "__main__":
    st.title("文档问答助手")

    # File upload widget (multiple files allowed).
    uploaded_files = st.file_uploader("上传文档", type=["pdf", "docx", "txt", "md"], accept_multiple_files=True)

    if uploaded_files:
        # Load the uploaded documents into LangChain Document objects.
        documents = load_documents(uploaded_files)
        st.success(f"成功加载 {len(documents)} 个文档")

        # Split into overlapping chunks for retrieval.
        # NOTE(review): Streamlit reruns this script on every interaction,
        # so the corpus is re-embedded per question; for large corpora,
        # consider caching keyed on the uploaded file contents.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = text_splitter.split_documents(documents)

        # Embed with a Chinese-optimized model and index in Chroma.
        embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-zh-v1.5")
        vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings)

        # Build the QA chain.
        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            # Fix: top-k must go through search_kwargs — as_retriever(k=3)
            # does not configure how many chunks the retriever returns.
            retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )

        # Q&A interface.
        query = st.text_input("请输入你的问题:")
        if st.button("提交"):
            if query:
                result = qa_chain({"query": query})
                st.write(result["result"])

                # Show which uploaded files the answer was drawn from.
                st.write("来源文档:")
                for doc in result["source_documents"]:
                    st.write(f"- {doc.metadata.get('source', '未知')}")
            else:
                st.warning("请输入问题")

8.3 本地私有化部署

8.3.1 部署方案

  • 本地服务器部署:使用本地电脑或服务器部署
  • Docker 容器化:使用 Docker 容器化部署
  • 离线运行:确保在无网络环境下也能运行
  • 数据安全:确保数据存储在本地,不泄露

8.3.2 部署步骤

  1. 准备环境

    bash
    # 安装依赖
    pip install -r requirements.txt
  2. 构建 Docker 镜像

    dockerfile
    FROM python:3.10-slim
    
    WORKDIR /app
    
    COPY requirements.txt .
    RUN pip install --no-cache-dir -r requirements.txt
    
    COPY . .
    
    EXPOSE 8501
    
    CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
  3. 运行容器

    bash
    docker build -t knowledgebase-app .
    docker run -p 8501:8501 -v "$(pwd)/data:/app/data" knowledgebase-app
  4. 访问应用:在浏览器中打开 http://localhost:8501 即可使用

8.4 本章小结

  • 学习了如何构建读书笔记知识库
  • 掌握了文档问答助手的实现方法
  • 了解了本地私有化部署的方案和步骤
  • 实践了小型知识库的完整开发流程