Mirror of https://github.com/langgenius/dify.git, synced 2024-11-16 11:42:29 +08:00
feat: add support for document, video, and audio content
Expanded the system to handle document types across different modules and introduced video and audio content handling in model features. Adjusted the prompt message logic to conditionally process content based on available features, enhancing flexibility in media processing. Added comprehensive error handling in `LLMNode` for better runtime resilience. Updated YAML configuration and unit tests to reflect these changes.
This commit is contained in:
parent 14205a7756
commit 87137ad97a
@@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
     IMAGE = "image"
     AUDIO = "audio"
     VIDEO = "video"
+    DOCUMENT = "document"


 class PromptMessageContent(BaseModel):
@@ -87,6 +87,9 @@ class ModelFeature(Enum):
     AGENT_THOUGHT = "agent-thought"
     VISION = "vision"
     STREAM_TOOL_CALL = "stream-tool-call"
+    DOCUMENT = "document"
+    VIDEO = "video"
+    AUDIO = "audio"


 class DefaultParameterName(str, Enum):
@@ -8,6 +8,7 @@ features:
   - agent-thought
   - stream-tool-call
   - vision
+  - audio
 model_properties:
   mode: chat
   context_size: 128000
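This config change is what switches the new behavior on per model: the prompt-filtering logic in LLMNode (later in this diff) keys off model_schema.features, so a model starts receiving audio content only once its YAML declares the audio feature.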
@@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )
             )
             return
+        except Exception as e:
+            logger.exception(f"Node {self.node_id} failed to run: {e}")
+            yield RunCompletedEvent(
+                run_result=NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.FAILED,
+                    error=str(e),
+                    inputs=node_inputs,
+                    process_data=process_data,
+                )
+            )
+            return

         outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
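The shape of this error handling is worth calling out: _run is a generator, so an uncaught exception would abort the event stream mid-flight. Catching broadly and yielding a terminal failure event keeps the stream well-formed. Below is a minimal, self-contained sketch of that pattern; RunResult and run_node are stand-ins invented for illustration, not dify's actual RunCompletedEvent/NodeRunResult API.

import logging
from dataclasses import dataclass
from typing import Any, Callable, Iterator, Optional

logger = logging.getLogger(__name__)

@dataclass
class RunResult:
    status: str                  # "succeeded" or "failed"
    error: Optional[str] = None
    outputs: Any = None

def run_node(node_id: str, work: Callable[[], Any]) -> Iterator[RunResult]:
    # Yield a terminal failure event instead of raising, so consumers of
    # the event stream always see the node reach a final state.
    try:
        outputs = work()
    except Exception as e:
        logger.exception(f"Node {node_id} failed to run: {e}")
        yield RunResult(status="failed", error=str(e))
        return
    yield RunResult(status="succeeded", outputs=outputs)

# Success and failure arrive through the same channel:
for event in run_node("llm-1", lambda: 1 / 0):
    print(event.status, event.error)  # failed division by zero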
@@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
         if isinstance(prompt_message.content, list):
             prompt_message_content = []
             for content_item in prompt_message.content:
-                # Skip image if vision is disabled or model doesn't support vision
-                if content_item.type == PromptMessageContentType.IMAGE and (
-                    not vision_enabled
-                    or not model_config.model_schema.features
-                    or ModelFeature.VISION not in model_config.model_schema.features
-                ):
-                    continue
-                prompt_message_content.append(content_item)
+                # Skip content if features are not defined
+                if not model_config.model_schema.features:
+                    if content_item.type != PromptMessageContentType.TEXT:
+                        continue
+                    prompt_message_content.append(content_item)
+                    continue
+
+                # Skip content if corresponding feature is not supported
+                if (
+                    (
+                        content_item.type == PromptMessageContentType.IMAGE
+                        and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.DOCUMENT
+                        and ModelFeature.DOCUMENT not in model_config.model_schema.features
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.VIDEO
+                        and ModelFeature.VIDEO not in model_config.model_schema.features
+                    )
+                    or (
+                        content_item.type == PromptMessageContentType.AUDIO
+                        and ModelFeature.AUDIO not in model_config.model_schema.features
+                    )
+                ):
+                    continue
+                prompt_message_content.append(content_item)
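Restated outside the node machinery, the new rule is: no declared features means only plain text survives; otherwise each media type must be matched by its feature, and images additionally require vision to be enabled on the node. A minimal sketch of that rule, assuming simplified stand-ins for PromptMessageContentType and ModelFeature (the real content objects carry more than a type tag):

from enum import Enum

class ContentType(Enum):
    TEXT = "text"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"

class Feature(Enum):
    VISION = "vision"
    VIDEO = "video"
    AUDIO = "audio"
    DOCUMENT = "document"

# Feature a model must declare to receive each media type.
REQUIRED = {
    ContentType.IMAGE: Feature.VISION,
    ContentType.VIDEO: Feature.VIDEO,
    ContentType.AUDIO: Feature.AUDIO,
    ContentType.DOCUMENT: Feature.DOCUMENT,
}

def filter_content(items, features, vision_enabled=True):
    # No feature list at all: only plain text survives.
    if not features:
        return [(t, p) for t, p in items if t is ContentType.TEXT]
    kept = []
    for content_type, payload in items:
        required = REQUIRED.get(content_type)
        if required is not None and required not in features:
            continue
        if content_type is ContentType.IMAGE and not vision_enabled:
            continue
        kept.append((content_type, payload))
    return kept

# A vision-only model keeps text and images but drops the video clip.
items = [(ContentType.TEXT, "hi"), (ContentType.IMAGE, "img"), (ContentType.VIDEO, "clip")]
assert filter_content(items, {Feature.VISION}) == items[:2]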
@@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )

             # Process segments for images
-            image_contents = []
+            file_contents = []
             for segment in segment_group.value:
                 if isinstance(segment, ArrayFileSegment):
                     for file in segment.value:
-                        if file.type == FileType.IMAGE:
-                            image_content = file_manager.to_prompt_message_content(
+                        if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                            file_content = file_manager.to_prompt_message_content(
                                 file, image_detail_config=self.node_data.vision.configs.detail
                             )
-                            image_contents.append(image_content)
+                            file_contents.append(file_content)
                 if isinstance(segment, FileSegment):
                     file = segment.value
-                    if file.type == FileType.IMAGE:
-                        image_content = file_manager.to_prompt_message_content(
+                    if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                        file_content = file_manager.to_prompt_message_content(
                             file, image_detail_config=self.node_data.vision.configs.detail
                         )
-                        image_contents.append(image_content)
+                        file_contents.append(file_content)

             # Create message with text from all segments
             plain_text = segment_group.text
@@ -877,9 +908,9 @@ class LLMNode(BaseNode[LLMNodeData]):
                 prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
                 prompt_messages.append(prompt_message)

-            if image_contents:
+            if file_contents:
                 # Create message with image contents
-                prompt_message = UserPromptMessage(content=image_contents)
+                prompt_message = UserPromptMessage(content=file_contents)
                 prompt_messages.append(prompt_message)

     return prompt_messages
@@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
+        LLMNodeTestScenario(
+            description="Prompt template with variable selector of File with video file and vision feature",
+            user_query=fake_query,
+            user_files=[],
+            vision_enabled=True,
+            vision_detail=fake_vision_detail,
+            features=[ModelFeature.VISION],
+            window_size=fake_window_size,
+            prompt_template=[
+                LLMNodeChatModelMessage(
+                    text="{{#input.image#}}",
+                    role=PromptMessageRole.USER,
+                    edition_type="basic",
+                ),
+            ],
+            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
+            file_variables={
+                "input.image": File(
+                    tenant_id="test",
+                    type=FileType.VIDEO,
+                    filename="test1.jpg",
+                    transfer_method=FileTransferMethod.REMOTE_URL,
+                    remote_url=fake_remote_url,
+                )
+            },
+        ),
     ]

     for scenario in test_scenarios:
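Note what the new scenario pins down: the file variable resolves to a video, but the model declares only the vision feature, so the video content is filtered out by the gating above and expected_messages ends with a bare UserPromptMessage(content=fake_query).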