From 87137ad97a08ec1eaa0f201eb73b2567399eef80 Mon Sep 17 00:00:00 2001
From: -LAN-
Date: Thu, 14 Nov 2024 21:22:19 +0800
Subject: [PATCH] feat: add support for document, video, and audio content

Expanded the prompt message and model entities to cover document content and
added document, video, and audio model features. Adjusted the prompt message
logic in the LLM node so that non-text content is only passed through when the
model declares the matching feature. Added a catch-all error handler in
`LLMNode` so unexpected failures are reported as failed node runs instead of
crashing the workflow. Updated the gpt-4o-audio-preview YAML configuration and
unit tests to reflect these changes.
---
 .../entities/message_entities.py              |  1 +
 .../model_runtime/entities/model_entities.py  |  3 +
 .../openai/llm/gpt-4o-audio-preview.yaml      |  1 +
 api/core/workflow/nodes/llm/node.py           | 59 ++++++++++++++-----
 .../core/workflow/nodes/llm/test_node.py      | 26 ++++++++
 5 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/api/core/model_runtime/entities/message_entities.py b/api/core/model_runtime/entities/message_entities.py
index fc37227bc9..d4d56a42a4 100644
--- a/api/core/model_runtime/entities/message_entities.py
+++ b/api/core/model_runtime/entities/message_entities.py
@@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
     IMAGE = "image"
     AUDIO = "audio"
     VIDEO = "video"
+    DOCUMENT = "document"
 
 
 class PromptMessageContent(BaseModel):
diff --git a/api/core/model_runtime/entities/model_entities.py b/api/core/model_runtime/entities/model_entities.py
index 52ea787c3a..4e1ce17533 100644
--- a/api/core/model_runtime/entities/model_entities.py
+++ b/api/core/model_runtime/entities/model_entities.py
@@ -87,6 +87,9 @@ class ModelFeature(Enum):
     AGENT_THOUGHT = "agent-thought"
     VISION = "vision"
     STREAM_TOOL_CALL = "stream-tool-call"
+    DOCUMENT = "document"
+    VIDEO = "video"
+    AUDIO = "audio"
 
 
 class DefaultParameterName(str, Enum):
diff --git a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
index 256e87edbe..5a14bfc47f 100644
--- a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
+++ b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
@@ -8,6 +8,7 @@ features:
   - agent-thought
   - stream-tool-call
   - vision
+  - audio
 model_properties:
   mode: chat
   context_size: 128000
diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py
index f0b8830eb5..a5620dbc01 100644
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )
             )
             return
+        except Exception as e:
+            logger.exception(f"Node {self.node_id} failed to run: {e}")
+            yield RunCompletedEvent(
+                run_result=NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.FAILED,
+                    error=str(e),
+                    inputs=node_inputs,
+                    process_data=process_data,
+                )
+            )
+            return
 
         outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
 
@@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
             if isinstance(prompt_message.content, list):
                 prompt_message_content = []
                 for content_item in prompt_message.content:
-                    # Skip image if vision is disabled or model doesn't support vision
-                    if content_item.type == PromptMessageContentType.IMAGE and (
-                        not vision_enabled
-                        or not model_config.model_schema.features
-                        or ModelFeature.VISION not in model_config.model_schema.features
+                    # Skip content if features are not defined
+                    if not model_config.model_schema.features:
+                        if content_item.type != PromptMessageContentType.TEXT:
+                            continue
+                        prompt_message_content.append(content_item)
+                        continue
+
+                    # Skip content if corresponding feature is not supported
+                    if (
+                        (
+                            content_item.type == PromptMessageContentType.IMAGE
+                            and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.DOCUMENT
+                            and ModelFeature.DOCUMENT not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.VIDEO
+                            and ModelFeature.VIDEO not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.AUDIO
+                            and ModelFeature.AUDIO not in model_config.model_schema.features
+                        )
                     ):
                         continue
                     prompt_message_content.append(content_item)
@@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
         )
 
         # Process segments for images
-        image_contents = []
+        file_contents = []
         for segment in segment_group.value:
             if isinstance(segment, ArrayFileSegment):
                 for file in segment.value:
-                    if file.type == FileType.IMAGE:
-                        image_content = file_manager.to_prompt_message_content(
+                    if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                        file_content = file_manager.to_prompt_message_content(
                             file, image_detail_config=self.node_data.vision.configs.detail
                         )
-                        image_contents.append(image_content)
+                        file_contents.append(file_content)
             if isinstance(segment, FileSegment):
                 file = segment.value
-                if file.type == FileType.IMAGE:
-                    image_content = file_manager.to_prompt_message_content(
+                if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                    file_content = file_manager.to_prompt_message_content(
                         file, image_detail_config=self.node_data.vision.configs.detail
                     )
-                    image_contents.append(image_content)
+                    file_contents.append(file_content)
 
         # Create message with text from all segments
         plain_text = segment_group.text
@@ -877,9 +908,9 @@ class LLMNode(BaseNode[LLMNodeData]):
             prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
             prompt_messages.append(prompt_message)
 
-        if image_contents:
+        if file_contents:
             # Create message with image contents
-            prompt_message = UserPromptMessage(content=image_contents)
+            prompt_message = UserPromptMessage(content=file_contents)
             prompt_messages.append(prompt_message)
 
         return prompt_messages
diff --git a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
index da21710832..6ec219aa8d 100644
--- a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
@@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
+        LLMNodeTestScenario(
+            description="Prompt template with variable selector of File with video file and vision feature",
+            user_query=fake_query,
+            user_files=[],
+            vision_enabled=True,
+            vision_detail=fake_vision_detail,
+            features=[ModelFeature.VISION],
+            window_size=fake_window_size,
+            prompt_template=[
+                LLMNodeChatModelMessage(
+                    text="{{#input.image#}}",
+                    role=PromptMessageRole.USER,
+                    edition_type="basic",
+                ),
+            ],
+            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
+            file_variables={
+                "input.image": File(
+                    tenant_id="test",
+                    type=FileType.VIDEO,
+                    filename="test1.jpg",
+                    transfer_method=FileTransferMethod.REMOTE_URL,
+                    remote_url=fake_remote_url,
+                )
+            },
+        ),
     ]
 
     for scenario in test_scenarios:
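
Reviewer note (not part of the patch): the gate added to node.py boils down to
"each non-text content type needs a matching model feature, and images also
need vision enabled on the node; a model with no declared features gets text
only". The snippet below is a minimal, self-contained sketch of that behaviour.
The trimmed enums and the filter_content helper are illustrative stand-ins,
not the project's actual imports or method signature.

from enum import Enum


class PromptMessageContentType(Enum):
    # Stand-in, trimmed from api/core/model_runtime/entities/message_entities.py
    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    DOCUMENT = "document"


class ModelFeature(Enum):
    # Stand-in, trimmed from api/core/model_runtime/entities/model_entities.py
    VISION = "vision"
    DOCUMENT = "document"
    VIDEO = "video"
    AUDIO = "audio"


# Each non-text content type may only be forwarded when the model declares the
# matching feature; images additionally require vision to be enabled on the node.
REQUIRED_FEATURE = {
    PromptMessageContentType.IMAGE: ModelFeature.VISION,
    PromptMessageContentType.DOCUMENT: ModelFeature.DOCUMENT,
    PromptMessageContentType.VIDEO: ModelFeature.VIDEO,
    PromptMessageContentType.AUDIO: ModelFeature.AUDIO,
}


def filter_content(items, features, vision_enabled=True):
    """Drop content items the target model cannot accept (hypothetical helper).

    Mirrors the patched branch in LLMNode: text always passes; if the model
    declares no features at all, every non-text item is dropped.
    """
    kept = []
    for item_type in items:
        if item_type == PromptMessageContentType.TEXT:
            kept.append(item_type)
            continue
        if not features:
            continue
        if item_type == PromptMessageContentType.IMAGE and not vision_enabled:
            continue
        if REQUIRED_FEATURE[item_type] in features:
            kept.append(item_type)
    return kept


if __name__ == "__main__":
    items = [
        PromptMessageContentType.TEXT,
        PromptMessageContentType.IMAGE,
        PromptMessageContentType.VIDEO,
    ]
    # A vision-only model keeps the text and the image; the video is dropped,
    # which matches the new unit-test scenario added above.
    print(filter_content(items, features={ModelFeature.VISION}))

Because the gate lives in the workflow node, a provider only has to declare the
relevant features in its model YAML (as gpt-4o-audio-preview.yaml now declares
audio) for the corresponding content types to be passed through.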