2024工作周期安排
2024项目整体规划
沐曦测试(已完成)
沐曦性能测试研究
沐曦Benchmark相关测试
沐曦模型适配表
2024-02-26沐曦沟通报告
智能打标
数据打标服务Json样例
智能打标寒武纪大模型思路
业务层优先级排序
智能打标流程图
打标API接口文档
图像内容识别
其他
申报项目文本段落
研发链相关资料文档
国产GPU虚拟化培训介绍
RAG垂直落地
DB-GPT与TeleChat-7B搭建相关RAG知识库
ChatWithRTX
ChatRTX安装教程
ChatWithRTX 踩坑记录
ChatWithRTX 使用其他量化模型
ChatWithRTX介绍
RAG 相关资料
英伟达—大模型结合 RAG 构建客服场景自动问答
又一大模型技术开源!有道自研RAG引擎QAnything正式开放下载
收藏!RAG入门参考资料开源大总结:RAG综述、介绍、比较、预处理、RAG Embedding等
RAG调研
解决现代RAG实际生产问题
解决现代 RAG 系统中的生产问题-II
先进的Retriever技术来增强你的RAGs
高级RAG — 使用假设文档嵌入 (HyDE) 改进检索
提升 RAG:选择最佳嵌入和 Reranker 模型
LangGraph
增强型RAG:re-rank
大模型比赛
相关资料
智能填单_填单 启动命令
2024私人规划
ChatGPT API账号记录
公众号相关资料
基于 Docker 的深度学习环境:入门篇
ollama
-
+
首页
RAG调研
调研一下各类RAG是如何实现相关的富文本入向量数据库的: - DB-GPT: - 通过pypdf直接读取pdf文件然后传给langchain使用,其中没有办法读取表格和文档中的图片相关的数据,仅仅是提取里面相关的文字并进行处理。 ``` """PDF Knowledge.""" from typing import Any, List, Optional from dbgpt.rag.chunk import Document from dbgpt.rag.knowledge.base import ( ChunkStrategy, DocumentType, Knowledge, KnowledgeType, ) class PDFKnowledge(Knowledge): """PDF Knowledge.""" def __init__( self, file_path: Optional[str] = None, knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT, loader: Optional[Any] = None, language: Optional[str] = "zh", **kwargs: Any, ) -> None: """Create PDF Knowledge with Knowledge arguments. Args: file_path(str, optional): file path knowledge_type(KnowledgeType, optional): knowledge type loader(Any, optional): loader language(str, optional): language """ self._path = file_path self._type = knowledge_type self._loader = loader self._language = language def _load(self) -> List[Document]: """Load pdf document from loader.""" if self._loader: documents = self._loader.load() else: import pypdf pages = [] documents = [] if not self._path: raise ValueError("file path is required") with open(self._path, "rb") as file: reader = pypdf.PdfReader(file) for page_num in range(len(reader.pages)): _page = reader.pages[page_num] pages.append((_page.extract_text(), page_num)) # cleaned_pages = [] for page, page_num in pages: lines = page.splitlines() cleaned_lines = [] for line in lines: if self._language == "en": words = list(line) # noqa: F841 else: words = line.split() # noqa: F841 cleaned_lines.append(line) page = "\n".join(cleaned_lines) # cleaned_pages.append(page) metadata = {"source": self._path, "page": page_num} # text = "\f".join(cleaned_pages) document = Document(content=page, metadata=metadata) documents.append(document) return documents return [Document.langchain2doc(lc_document) for lc_document in documents] @classmethod def support_chunk_strategy(cls) -> List[ChunkStrategy]: """Return support chunk strategy.""" return [ ChunkStrategy.CHUNK_BY_SIZE, 
ChunkStrategy.CHUNK_BY_PAGE, ChunkStrategy.CHUNK_BY_SEPARATOR, ] @classmethod def default_chunk_strategy(cls) -> ChunkStrategy: """Return default chunk strategy.""" return ChunkStrategy.CHUNK_BY_SIZE @classmethod def type(cls) -> KnowledgeType: """Return knowledge type.""" return KnowledgeType.DOCUMENT @classmethod def document_type(cls) -> DocumentType: """Document type of PDF.""" return DocumentType.PDF ``` - anything-llm - 是通过VUE中的PDFLoader插件来读取PDF文档中的内容然后进行解读,也是只保留文字部分 ``` const { v4 } = require("uuid"); const { PDFLoader } = require("langchain/document_loaders/fs/pdf"); const { createdDate, trashFile, writeToServerDocuments, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); async function asPDF({ fullFilePath = "", filename = "" }) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); console.log(`-- Working ${filename} --`); const pageContent = []; const docs = await pdfLoader.load(); for (const doc of docs) { console.log( `-- Parsing content from pg ${ doc.metadata?.loc?.pageNumber || "unknown" } --` ); if (!doc.pageContent.length) continue; pageContent.push(doc.pageContent); } if (!pageContent.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); return { success: false, reason: `No text content found in ${filename}.`, documents: [], }; } const content = pageContent.join(""); const data = { id: v4(), url: "file://" + fullFilePath, title: docs[0]?.metadata?.pdf?.info?.Title || filename, docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", description: "No description found.", docSource: "pdf file uploaded by the user.", chunkSource: "", published: createdDate(fullFilePath), wordCount: content.split(" ").length, pageContent: content, token_count_estimate: tokenizeString(content).length, }; const document = writeToServerDocuments( data, `${slugify(filename)}-${data.id}` );
trashFile(fullFilePath); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); return { success: true, reason: null, documents: [document] }; } module.exports = asPDF; ``` - Q-anything - Q-anything用的PDF结构解析是使用了OCR的方式进行的,和普通的pypdf解析不同,效果更好,而且能将图片中的文字也解析出来。但是解析文件速度估计会更慢 ``` """Loader that loads image files.""" from typing import List, Callable from langchain.document_loaders.unstructured import UnstructuredFileLoader from unstructured.partition.text import partition_text import os import fitz from tqdm import tqdm from typing import Union, Any import numpy as np import base64 class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): """Loader that uses unstructured to load image files, such as PNGs and JPGs.""" def __init__( self, file_path: Union[str, List[str]], ocr_engine: Callable, mode: str = "single", **unstructured_kwargs: Any, ): """Initialize with file path.""" self.ocr_engine = ocr_engine super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: def pdf_ocr_txt(filepath, dir_path="tmp_files"): full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) doc = fitz.open(filepath) txt_file_path = os.path.join(full_dir_path, "{}.txt".format(os.path.split(filepath)[-1])) img_name = os.path.join(full_dir_path, 'tmp.png') with open(txt_file_path, 'w', encoding='utf-8') as fout: for i in tqdm(range(doc.page_count)): page = doc.load_page(i) pix = page.get_pixmap() img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, pix.n)) img_data = {"img64": base64.b64encode(img).decode("utf-8"), "height": pix.h, "width": pix.w, "channels": pix.n} result = self.ocr_engine(img_data) result = [line for line in result if line] ocr_result = [i[1][0] for line in result for i in line] fout.write("\n".join(ocr_result)) if os.path.exists(img_name): os.remove(img_name) return txt_file_path txt_file_path =
pdf_ocr_txt(self.file_path) return partition_text(filename=txt_file_path, **self.unstructured_kwargs) ``` # 总结 QAnything背靠大厂,效果更佳,可与腾讯的金融大模型产品对标。 1. 可以在PaddleOCR之外选用更好的OCR产品来提高文档识别的准确率。 2. 是否可以先将pdf文档转换成docx文档后再进行读取。 3. 文档中的表格数据是否需要增强代码处理
yg9538
2024年3月14日 12:17
87
转发文档
收藏文档
上一篇
下一篇
手机扫码
复制链接
手机扫一扫转发分享
复制链接
Markdown文件
PDF文档
PDF文档(打印)
分享
链接
类型
密码
更新密码