From 87137ad97a08ec1eaa0f201eb73b2567399eef80 Mon Sep 17 00:00:00 2001
From: -LAN-
Date: Thu, 14 Nov 2024 21:22:19 +0800
Subject: [PATCH] feat: add support for document, video, and audio content

Expanded the prompt message and model entities to cover document content and
added document, video, and audio model features. Adjusted the prompt message
logic in the LLM node so that non-text content is only passed through when the
model declares the matching feature. Added a catch-all error handler in
`LLMNode` so unexpected failures are reported as failed node runs instead of
crashing the workflow. Updated the gpt-4o-audio-preview YAML configuration and
unit tests to reflect these changes.
---
 .../entities/message_entities.py              |  1 +
 .../model_runtime/entities/model_entities.py  |  3 +
 .../openai/llm/gpt-4o-audio-preview.yaml      |  1 +
 api/core/workflow/nodes/llm/node.py           | 59 ++++++++++++++-----
 .../core/workflow/nodes/llm/test_node.py      | 26 ++++++++
 5 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/api/core/model_runtime/entities/message_entities.py b/api/core/model_runtime/entities/message_entities.py
index fc37227bc9..d4d56a42a4 100644
--- a/api/core/model_runtime/entities/message_entities.py
+++ b/api/core/model_runtime/entities/message_entities.py
@@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
     IMAGE = "image"
     AUDIO = "audio"
     VIDEO = "video"
+    DOCUMENT = "document"
 
 
 class PromptMessageContent(BaseModel):
diff --git a/api/core/model_runtime/entities/model_entities.py b/api/core/model_runtime/entities/model_entities.py
index 52ea787c3a..4e1ce17533 100644
--- a/api/core/model_runtime/entities/model_entities.py
+++ b/api/core/model_runtime/entities/model_entities.py
@@ -87,6 +87,9 @@ class ModelFeature(Enum):
     AGENT_THOUGHT = "agent-thought"
     VISION = "vision"
     STREAM_TOOL_CALL = "stream-tool-call"
+    DOCUMENT = "document"
+    VIDEO = "video"
+    AUDIO = "audio"
 
 
 class DefaultParameterName(str, Enum):
diff --git a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
index 256e87edbe..5a14bfc47f 100644
--- a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
+++ b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
@@ -8,6 +8,7 @@ features:
   - agent-thought
   - stream-tool-call
   - vision
+  - audio
 model_properties:
   mode: chat
   context_size: 128000
diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py
index f0b8830eb5..a5620dbc01 100644
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )
             )
             return
+        except Exception as e:
+            logger.exception(f"Node {self.node_id} failed to run: {e}")
+            yield RunCompletedEvent(
+                run_result=NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.FAILED,
+                    error=str(e),
+                    inputs=node_inputs,
+                    process_data=process_data,
+                )
+            )
+            return
 
         outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
 
@@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
             if isinstance(prompt_message.content, list):
                 prompt_message_content = []
                 for content_item in prompt_message.content:
-                    # Skip image if vision is disabled or model doesn't support vision
-                    if content_item.type == PromptMessageContentType.IMAGE and (
-                        not vision_enabled
-                        or not model_config.model_schema.features
-                        or ModelFeature.VISION not in model_config.model_schema.features
+                    # Skip content if features are not defined
+                    if not model_config.model_schema.features:
+                        if content_item.type != PromptMessageContentType.TEXT:
+                            continue
+                        prompt_message_content.append(content_item)
+                        continue
+
+                    # Skip content if corresponding feature is not supported
+                    if (
+                        (
+                            content_item.type == PromptMessageContentType.IMAGE
+                            and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.DOCUMENT
+                            and ModelFeature.DOCUMENT not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.VIDEO
+                            and ModelFeature.VIDEO not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.AUDIO
+                            and ModelFeature.AUDIO not in model_config.model_schema.features
+                        )
                     ):
                         continue
                     prompt_message_content.append(content_item)
@@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
         )
 
         # Process segments for images
-        image_contents = []
+        file_contents = []
         for segment in segment_group.value:
             if isinstance(segment, ArrayFileSegment):
                 for file in segment.value:
-                    if file.type == FileType.IMAGE:
-                        image_content = file_manager.to_prompt_message_content(
+                    if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                        file_content = file_manager.to_prompt_message_content(
                             file, image_detail_config=self.node_data.vision.configs.detail
                         )
-                        image_contents.append(image_content)
+                        file_contents.append(file_content)
             if isinstance(segment, FileSegment):
                 file = segment.value
-                if file.type == FileType.IMAGE:
-                    image_content = file_manager.to_prompt_message_content(
+                if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                    file_content = file_manager.to_prompt_message_content(
                         file, image_detail_config=self.node_data.vision.configs.detail
                     )
-                    image_contents.append(image_content)
+                    file_contents.append(file_content)
 
         # Create message with text from all segments
         plain_text = segment_group.text
@@ -877,9 +908,9 @@ class LLMNode(BaseNode[LLMNodeData]):
             prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
             prompt_messages.append(prompt_message)
 
-        if image_contents:
+        if file_contents:
             # Create message with image contents
-            prompt_message = UserPromptMessage(content=image_contents)
+            prompt_message = UserPromptMessage(content=file_contents)
             prompt_messages.append(prompt_message)
 
         return prompt_messages
diff --git a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
index da21710832..6ec219aa8d 100644
--- a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
@@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
+        LLMNodeTestScenario(
+            description="Prompt template with variable selector of File with video file and vision feature",
+            user_query=fake_query,
+            user_files=[],
+            vision_enabled=True,
+            vision_detail=fake_vision_detail,
+            features=[ModelFeature.VISION],
+            window_size=fake_window_size,
+            prompt_template=[
+                LLMNodeChatModelMessage(
+                    text="{{#input.image#}}",
+                    role=PromptMessageRole.USER,
+                    edition_type="basic",
+                ),
+            ],
+            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
+            file_variables={
+                "input.image": File(
+                    tenant_id="test",
+                    type=FileType.VIDEO,
+                    filename="test1.jpg",
+                    transfer_method=FileTransferMethod.REMOTE_URL,
+                    remote_url=fake_remote_url,
+                )
+            },
+        ),
     ]
 
     for scenario in test_scenarios:
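
Reviewer note (not part of the patch): the gate added to node.py boils down to
"each non-text content type needs a matching model feature, and images also
need vision enabled on the node; a model with no declared features gets text
only". The snippet below is a minimal, self-contained sketch of that behaviour.
The trimmed enums and the filter_content helper are illustrative stand-ins,
not the project's actual imports or method signature.

from enum import Enum


class PromptMessageContentType(Enum):
    # Stand-in, trimmed from api/core/model_runtime/entities/message_entities.py
    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    DOCUMENT = "document"


class ModelFeature(Enum):
    # Stand-in, trimmed from api/core/model_runtime/entities/model_entities.py
    VISION = "vision"
    DOCUMENT = "document"
    VIDEO = "video"
    AUDIO = "audio"


# Each non-text content type may only be forwarded when the model declares the
# matching feature; images additionally require vision to be enabled on the node.
REQUIRED_FEATURE = {
    PromptMessageContentType.IMAGE: ModelFeature.VISION,
    PromptMessageContentType.DOCUMENT: ModelFeature.DOCUMENT,
    PromptMessageContentType.VIDEO: ModelFeature.VIDEO,
    PromptMessageContentType.AUDIO: ModelFeature.AUDIO,
}


def filter_content(items, features, vision_enabled=True):
    """Drop content items the target model cannot accept (hypothetical helper).

    Mirrors the patched branch in LLMNode: text always passes; if the model
    declares no features at all, every non-text item is dropped.
    """
    kept = []
    for item_type in items:
        if item_type == PromptMessageContentType.TEXT:
            kept.append(item_type)
            continue
        if not features:
            continue
        if item_type == PromptMessageContentType.IMAGE and not vision_enabled:
            continue
        if REQUIRED_FEATURE[item_type] in features:
            kept.append(item_type)
    return kept


if __name__ == "__main__":
    items = [
        PromptMessageContentType.TEXT,
        PromptMessageContentType.IMAGE,
        PromptMessageContentType.VIDEO,
    ]
    # A vision-only model keeps the text and the image; the video is dropped,
    # which matches the new unit-test scenario added above.
    print(filter_content(items, features={ModelFeature.VISION}))

Because the gate lives in the workflow node, a provider only has to declare the
relevant features in its model YAML (as gpt-4o-audio-preview.yaml now declares
audio) for the corresponding content types to be passed through.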