diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index c2f51ad1e5..aacee94095 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -6,12 +6,14 @@ import docx import pandas as pd import pypdfium2 import yaml +from unstructured.partition.api import partition_via_api from unstructured.partition.email import partition_email from unstructured.partition.epub import partition_epub from unstructured.partition.msg import partition_msg from unstructured.partition.ppt import partition_ppt from unstructured.partition.pptx import partition_pptx +from configs import dify_config from core.file import File, FileTransferMethod, file_manager from core.helper import ssrf_proxy from core.variables import ArrayFileSegment @@ -263,7 +265,14 @@ def _extract_text_from_ppt(file_content: bytes) -> str: def _extract_text_from_pptx(file_content: bytes) -> str: try: with io.BytesIO(file_content) as file: - elements = partition_pptx(file=file) + if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: + elements = partition_via_api( + file=file, + api_url=dify_config.UNSTRUCTURED_API_URL, + api_key=dify_config.UNSTRUCTURED_API_KEY, + ) + else: + elements = partition_pptx(file=file) return "\n".join([getattr(element, "text", "") for element in elements]) except Exception as e: raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e