2024工作周期安排
2024项目整体规划
沐曦测试(已完成)
沐曦性能测试研究
沐曦Benchmark相关测试
沐曦模型适配表
2024-02-26沐曦沟通报告
智能打标
数据打标服务Json样例
智能打标寒武纪大模型思路
业务层优先级排序
智能打标流程图
打标API接口文档
图像内容识别
其他
申报项目文本段落
研发链相关资料文档
国产GPU虚拟化培训介绍
RAG垂直落地
DB-GPT与TeleChat-7B搭建相关RAG知识库
ChatWithRTX
ChatRTX安装教程
ChatWithRTX 踩坑记录
ChatWithRTX 使用其他量化模型
ChatWithRTX介绍
RAG 相关资料
英伟达—大模型结合 RAG 构建客服场景自动问答
又一大模型技术开源!有道自研RAG引擎QAnything正式开放下载
收藏!RAG入门参考资料开源大总结:RAG综述、介绍、比较、预处理、RAG Embedding等
RAG调研
解决现代RAG实际生产问题
解决现代 RAG 系统中的生产问题-II
先进的Retriever技术来增强你的RAGs
高级RAG — 使用假设文档嵌入 (HyDE) 改进检索
提升 RAG:选择最佳嵌入和 Reranker 模型
LangGraph
增强型RAG:re-rank
大模型比赛
相关资料
智能填单_填单 启动命令
2024私人规划
ChatGPT API账号记录
公众号相关资料
基于 Docker 的深度学习环境:入门篇
ollama
-
+
首页
RAG调研
调研一下各类RAG是如何实现相关的富文本入向量数据库的: - DB-GPT: - 通过pypdf直接读取pdf文件然后传给langchain使用,其中没有办法读取表格和文档中的图片相关的数据,仅仅是提取里面相关的文字并进行处理。 ``` """PDF Knowledge.""" from typing import Any, List, Optional from dbgpt.rag.chunk import Document from dbgpt.rag.knowledge.base import ( ChunkStrategy, DocumentType, Knowledge, KnowledgeType, ) class PDFKnowledge(Knowledge): """PDF Knowledge.""" def __init__( self, file_path: Optional[str] = None, knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT, loader: Optional[Any] = None, language: Optional[str] = "zh", **kwargs: Any, ) -> None: """Create PDF Knowledge with Knowledge arguments. Args: file_path(str, optional): file path knowledge_type(KnowledgeType, optional): knowledge type loader(Any, optional): loader language(str, optional): language """ self._path = file_path self._type = knowledge_type self._loader = loader self._language = language def _load(self) -> List[Document]: """Load pdf document from loader.""" if self._loader: documents = self._loader.load() else: import pypdf pages = [] documents = [] if not self._path: raise ValueError("file path is required") with open(self._path, "rb") as file: reader = pypdf.PdfReader(file) for page_num in range(len(reader.pages)): _page = reader.pages[page_num] pages.append((_page.extract_text(), page_num)) # cleaned_pages = [] for page, page_num in pages: lines = page.splitlines() cleaned_lines = [] for line in lines: if self._language == "en": words = list(line) # noqa: F841 else: words = line.split() # noqa: F841 cleaned_lines.append(line) page = "\n".join(cleaned_lines) # cleaned_pages.append(page) metadata = {"source": self._path, "page": page_num} # text = "\f".join(cleaned_pages) document = Document(content=page, metadata=metadata) documents.append(document) return documents return [Document.langchain2doc(lc_document) for lc_document in documents] @classmethod def support_chunk_strategy(cls) -> List[ChunkStrategy]: """Return support chunk strategy.""" return [ ChunkStrategy.CHUNK_BY_SIZE, 
ChunkStrategy.CHUNK_BY_PAGE, ChunkStrategy.CHUNK_BY_SEPARATOR, ] @classmethod def default_chunk_strategy(cls) -> ChunkStrategy: """Return default chunk strategy.""" return ChunkStrategy.CHUNK_BY_SIZE @classmethod def type(cls) -> KnowledgeType: """Return knowledge type.""" return KnowledgeType.DOCUMENT @classmethod def document_type(cls) -> DocumentType: """Document type of PDF.""" return DocumentType.PDF ``` - anything-llm - 是通过VUE中的PDFLoader插件来读取PDF文档中的内容然后进行解读,也是只保留文字部分 ``` const { v4 } = require("uuid"); const { PDFLoader } = require("langchain/document_loaders/fs/pdf"); const { createdDate, trashFile, writeToServerDocuments, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); async function asPDF({ fullFilePath = "", filename = "" }) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); console.log(`-- Working ${filename} --`); const pageContent = []; const docs = await pdfLoader.load(); for (const doc of docs) { console.log( `-- Parsing content from pg ${ doc.metadata?.loc?.pageNumber || "unknown" } --` ); if (!doc.pageContent.length) continue; pageContent.push(doc.pageContent); } if (!pageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); return { success: false, reason: `No text content found in ${filename}.`, documents: [], }; } const content = pageContent.join(""); const data = { id: v4(), url: "file://" + fullFilePath, title: docs[0]?.metadata?.pdf?.info?.Title || filename, docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", description: "No description found.", docSource: "pdf file uploaded by the user.", chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, token_count_estimate: tokenizeString(content).length, }; const document = writeToServerDocuments( data, `${slugify(filename)}-${data.id}` );
trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; } module.exports = asPDF; ``` - Q-anything - Q-anything用的PDF结构解析是使用了OCR的方式进行的,和普通的pypdf解析不同,效果更好,而且能将图片中的文字也解析出来。但是解析文件速度估计会更慢 ``` """Loader that loads image files.""" from typing import List, Callable from langchain.document_loaders.unstructured import UnstructuredFileLoader from unstructured.partition.text import partition_text import os import fitz from tqdm import tqdm from typing import Union, Any import numpy as np import base64 class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): """Loader that uses unstructured to load image files, such as PNGs and JPGs.""" def __init__( self, file_path: Union[str, List[str]], ocr_engine: Callable, mode: str = "single", **unstructured_kwargs: Any, ): """Initialize with file path.""" self.ocr_engine = ocr_engine super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: def pdf_ocr_txt(filepath, dir_path="tmp_files"): full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) doc = fitz.open(filepath) txt_file_path = os.path.join(full_dir_path, "{}.txt".format(os.path.split(filepath)[-1])) img_name = os.path.join(full_dir_path, 'tmp.png') with open(txt_file_path, 'w', encoding='utf-8') as fout: for i in tqdm(range(doc.page_count)): page = doc.load_page(i) pix = page.get_pixmap() img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, pix.n)) img_data = {"img64": base64.b64encode(img).decode("utf-8"), "height": pix.h, "width": pix.w, "channels": pix.n} result = self.ocr_engine(img_data) result = [line for line in result if line] ocr_result = [i[1][0] for line in result for i in line] fout.write("\n".join(ocr_result)) if os.path.exists(img_name): os.remove(img_name) return txt_file_path txt_file_path =
pdf_ocr_txt(self.file_path) return partition_text(filename=txt_file_path, **self.unstructured_kwargs) ``` # 总结 QAnything背靠大厂,效果更佳,可与腾讯的金融大模型产品对标。 1. 可以在PaddleOCR之外选用更好的OCR产品来提高文档识别的准确率。 2. 是否可以先将pdf文档转换成docx文档后再进行读取。 3. 文档中的表格数据是否需要增强代码处理
yg9538
2024年3月14日 12:17
87
转发文档
收藏文档
上一篇
下一篇
手机扫码
复制链接
手机扫一扫转发分享
复制链接
Markdown文件
PDF文档
PDF文档(打印)
分享
链接
类型
密码
更新密码