Mirror of https://github.com/langgenius/dify.git, synced 2024-11-16 11:42:29 +08:00
feat: add support for document, video, and audio content
Expanded the system to handle document types across different modules and introduced video and audio content handling in model features. Adjusted the prompt message logic to conditionally process content based on available features, enhancing flexibility in media processing. Added comprehensive error handling in `LLMNode` for better runtime resilience. Updated YAML configuration and unit tests to reflect these changes.
This commit is contained in:
parent 14205a7756
commit 87137ad97a
@@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
     IMAGE = "image"
     AUDIO = "audio"
     VIDEO = "video"
+    DOCUMENT = "document"


 class PromptMessageContent(BaseModel):
@@ -87,6 +87,9 @@ class ModelFeature(Enum):
     AGENT_THOUGHT = "agent-thought"
     VISION = "vision"
     STREAM_TOOL_CALL = "stream-tool-call"
+    DOCUMENT = "document"
+    VIDEO = "video"
+    AUDIO = "audio"


 class DefaultParameterName(str, Enum):
@@ -8,6 +8,7 @@ features:
   - agent-thought
   - stream-tool-call
   - vision
+  - audio
 model_properties:
   mode: chat
   context_size: 128000
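This config change is what switches the new behavior on per model: the prompt-filtering logic in LLMNode (later in this diff) keys off model_schema.features, so a model starts receiving audio content only once its YAML declares the audio feature.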
@@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )
             )
             return
+        except Exception as e:
+            logger.exception(f"Node {self.node_id} failed to run: {e}")
+            yield RunCompletedEvent(
+                run_result=NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.FAILED,
+                    error=str(e),
+                    inputs=node_inputs,
+                    process_data=process_data,
+                )
+            )
+            return

         outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
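The shape of this error handling is worth calling out: _run is a generator, so an uncaught exception would abort the event stream mid-flight. Catching broadly and yielding a terminal failure event keeps the stream well-formed. Below is a minimal, self-contained sketch of that pattern; RunResult and run_node are stand-ins invented for illustration, not dify's actual RunCompletedEvent/NodeRunResult API.

import logging
from dataclasses import dataclass
from typing import Any, Callable, Iterator, Optional

logger = logging.getLogger(__name__)

@dataclass
class RunResult:
    status: str                  # "succeeded" or "failed"
    error: Optional[str] = None
    outputs: Any = None

def run_node(node_id: str, work: Callable[[], Any]) -> Iterator[RunResult]:
    # Yield a terminal failure event instead of raising, so consumers of
    # the event stream always see the node reach a final state.
    try:
        outputs = work()
    except Exception as e:
        logger.exception(f"Node {node_id} failed to run: {e}")
        yield RunResult(status="failed", error=str(e))
        return
    yield RunResult(status="succeeded", outputs=outputs)

# Success and failure arrive through the same channel:
for event in run_node("llm-1", lambda: 1 / 0):
    print(event.status, event.error)  # failed division by zero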
@@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
         if isinstance(prompt_message.content, list):
             prompt_message_content = []
             for content_item in prompt_message.content:
-                # Skip image if vision is disabled or model doesn't support vision
-                if content_item.type == PromptMessageContentType.IMAGE and (
-                    not vision_enabled
-                    or not model_config.model_schema.features
-                    or ModelFeature.VISION not in model_config.model_schema.features
-                ):
-                    continue
-                prompt_message_content.append(content_item)
+                # Skip content if features are not defined
+                if not model_config.model_schema.features:
+                    if content_item.type != PromptMessageContentType.TEXT:
+                        continue
+                    prompt_message_content.append(content_item)
+                    continue
+
+                # Skip content if corresponding feature is not supported
+                if (
+                    (
+                        content_item.type == PromptMessageContentType.IMAGE
+                        and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.DOCUMENT
+                        and ModelFeature.DOCUMENT not in model_config.model_schema.features
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.VIDEO
+                        and ModelFeature.VIDEO not in model_config.model_schema.features
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.AUDIO
+                        and ModelFeature.AUDIO not in model_config.model_schema.features
+                    )
+                ):
+                    continue
+                prompt_message_content.append(content_item)
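Restated outside the node machinery, the new rule is: no declared features means only plain text survives; otherwise each media type must be matched by its feature, and images additionally require vision to be enabled on the node. A minimal sketch of that rule, assuming simplified stand-ins for PromptMessageContentType and ModelFeature (the real content objects carry more than a type tag):

from enum import Enum

class ContentType(Enum):
    TEXT = "text"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"

class Feature(Enum):
    VISION = "vision"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"

# Feature a model must declare to receive each media type.
REQUIRED = {
    ContentType.IMAGE: Feature.VISION,
    ContentType.VIDEO: Feature.VIDEO,
    ContentType.AUDIO: Feature.AUDIO,
    ContentType.DOCUMENT: Feature.DOCUMENT,
}

def filter_content(items, features, vision_enabled=True):
    # No feature list at all: only plain text survives.
    if not features:
        return [(t, p) for t, p in items if t is ContentType.TEXT]
    kept = []
    for content_type, payload in items:
        required = REQUIRED.get(content_type)
        if required is not None and required not in features:
            continue
        if content_type is ContentType.IMAGE and not vision_enabled:
            continue
        kept.append((content_type, payload))
    return kept

# A vision-only model keeps text and images but drops the video clip.
items = [(ContentType.TEXT, "hi"), (ContentType.IMAGE, "img"), (ContentType.VIDEO, "clip")]
assert filter_content(items, {Feature.VISION}) == items[:2]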
@@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )

             # Process segments for images
-            image_contents = []
+            file_contents = []
             for segment in segment_group.value:
                 if isinstance(segment, ArrayFileSegment):
                     for file in segment.value:
-                        if file.type == FileType.IMAGE:
-                            image_content = file_manager.to_prompt_message_content(
+                        if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                            file_content = file_manager.to_prompt_message_content(
                                 file, image_detail_config=self.node_data.vision.configs.detail
                             )
-                            image_contents.append(image_content)
+                            file_contents.append(file_content)
                 if isinstance(segment, FileSegment):
                     file = segment.value
-                    if file.type == FileType.IMAGE:
-                        image_content = file_manager.to_prompt_message_content(
+                    if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                        file_content = file_manager.to_prompt_message_content(
                             file, image_detail_config=self.node_data.vision.configs.detail
                         )
-                        image_contents.append(image_content)
+                        file_contents.append(file_content)

             # Create message with text from all segments
             plain_text = segment_group.text
@@ -877,9 +908,9 @@ class LLMNode(BaseNode[LLMNodeData]):
                 prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
                 prompt_messages.append(prompt_message)

-            if image_contents:
+            if file_contents:
                 # Create message with image contents
-                prompt_message = UserPromptMessage(content=image_contents)
+                prompt_message = UserPromptMessage(content=file_contents)
                 prompt_messages.append(prompt_message)

     return prompt_messages
@@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
+        LLMNodeTestScenario(
+            description="Prompt template with variable selector of File with video file and vision feature",
+            user_query=fake_query,
+            user_files=[],
+            vision_enabled=True,
+            vision_detail=fake_vision_detail,
+            features=[ModelFeature.VISION],
+            window_size=fake_window_size,
+            prompt_template=[
+                LLMNodeChatModelMessage(
+                    text="{{#input.image#}}",
+                    role=PromptMessageRole.USER,
+                    edition_type="basic",
+                ),
+            ],
+            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
+            file_variables={
+                "input.image": File(
+                    tenant_id="test",
+                    type=FileType.VIDEO,
+                    filename="test1.jpg",
+                    transfer_method=FileTransferMethod.REMOTE_URL,
+                    remote_url=fake_remote_url,
+                )
+            },
+        ),
     ]

     for scenario in test_scenarios:
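Note what the new scenario pins down: the file variable resolves to a video, but the model declares only the vision feature, so the video content is filtered out by the gating above and expected_messages ends with a bare UserPromptMessage(content=fake_query).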