mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
feat(document_extractor): integrate unstructured API for PPTX extraction (#10180)
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Some checks are pending
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
This commit is contained in:
parent
86739bea8b
commit
53a7cb0e9d
|
@ -6,12 +6,14 @@ import docx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2
|
import pypdfium2
|
||||||
import yaml
|
import yaml
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.ppt import partition_ppt
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
|
|
||||||
|
from configs import dify_config
|
||||||
from core.file import File, FileTransferMethod, file_manager
|
from core.file import File, FileTransferMethod, file_manager
|
||||||
from core.helper import ssrf_proxy
|
from core.helper import ssrf_proxy
|
||||||
from core.variables import ArrayFileSegment
|
from core.variables import ArrayFileSegment
|
||||||
|
@ -263,6 +265,13 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
|
||||||
def _extract_text_from_pptx(file_content: bytes) -> str:
|
def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||||
try:
|
try:
|
||||||
with io.BytesIO(file_content) as file:
|
with io.BytesIO(file_content) as file:
|
||||||
|
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
|
||||||
|
elements = partition_via_api(
|
||||||
|
file=file,
|
||||||
|
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||||
|
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
||||||
|
)
|
||||||
|
else:
|
||||||
elements = partition_pptx(file=file)
|
elements = partition_pptx(file=file)
|
||||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user