From b00466f025f424e253b20ae44d42ade1b0dd01ea Mon Sep 17 00:00:00 2001
From: LiuVaayne <10231735+vaayne@users.noreply.github.com>
Date: Fri, 12 Apr 2024 11:25:02 +0800
Subject: [PATCH] feat:api Add support for extracting EPUB files in ExtractProcessor (#3254)

Co-authored-by: crazywoola <427733928@qq.com>
---
Review notes:
- Both new branches compare against '.epub' (leading dot), matching the
  neighbouring '.xml' and '.csv' branches. With a bare 'epub' the branch
  would never match a dotted extension and EPUB files would fall through
  to the txt fallback extractor.
- Context-line whitespace was reconstructed and the blob ids on the
  `index` lines predate these edits; try `git apply --ignore-whitespace`
  if a hunk is rejected.

 api/core/rag/extractor/extract_processor.py   |  5 +++
 .../unstructured_epub_extractor.py            | 35 ++++++++++++++++++
 api/requirements.txt                          |  2 +-
 api/services/file_service.py                  |  2 +-
 4 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 api/core/rag/extractor/unstructured/unstructured_epub_extractor.py

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index 0de7065335..8bb884c2dd 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -16,6 +16,7 @@ from core.rag.extractor.pdf_extractor import PdfExtractor
 from core.rag.extractor.text_extractor import TextExtractor
 from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
 from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
+from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
 from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
 from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
@@ -106,6 +107,8 @@ class ExtractProcessor:
                         extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
                     elif file_extension == '.xml':
                         extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
+                    elif file_extension == '.epub':
+                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
                     else:
                         # txt
                         extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
@@ -123,6 +126,8 @@ class ExtractProcessor:
                         extractor = WordExtractor(file_path)
                     elif file_extension == '.csv':
                         extractor = CSVExtractor(file_path, autodetect_encoding=True)
+                    elif file_extension == '.epub':
+                        extractor = UnstructuredEpubExtractor(file_path)
                     else:
                         # txt
                         extractor = TextExtractor(file_path, autodetect_encoding=True)
diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
new file mode 100644
index 0000000000..44cf958ea2
--- /dev/null
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -0,0 +1,35 @@
+import logging
+
+from core.rag.extractor.extractor_base import BaseExtractor
+from core.rag.models.document import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredEpubExtractor(BaseExtractor):
+    """Load epub files.
+
+    Args:
+        file_path: Path to the file to load.
+        api_url: Unstructured API URL. Stored on the instance;
+            extract() does not read it.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str = None,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def extract(self) -> list[Document]:
+        """Partition the epub and return one Document per title-based chunk."""
+        # Imported here, so unstructured is only loaded when extract() runs.
+        from unstructured.chunking.title import chunk_by_title
+        from unstructured.partition.epub import partition_epub
+
+        elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
+        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
diff --git a/api/requirements.txt b/api/requirements.txt
index 19d8d4128f..f9ff2ee60d 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -65,7 +65,7 @@ qdrant-client==1.7.3
 cohere~=5.2.4
 pyyaml~=6.0.1
 numpy~=1.25.2
-unstructured[docx,pptx,msg,md,ppt]~=0.10.27
+unstructured[docx,pptx,msg,md,ppt,epub]~=0.10.27
 bs4~=0.0.1
 markdown~=3.5.1
 httpx[socks]~=0.24.1
diff --git a/api/services/file_service.py b/api/services/file_service.py
index 53dd090236..39f31098ae 100644
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@@ -22,7 +22,7 @@ IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
 
 ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
 UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
-                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
 
 PREVIEW_WORDS_LIMIT = 3000
 