import re


def get_image_upload_file_ids(content):
    """Extract upload-file ids from markdown image links embedded in *content*.

    Only links of the exact form ``![image](<url>)`` are considered, where
    ``<url>`` starts with ``http://`` or ``https://`` and ends with
    ``file-preview`` or ``image-preview``. The id is the path segment that
    sits between ``files/`` and the preview suffix, e.g.
    ``https://host/files/<id>/file-preview``.

    Args:
        content: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of id strings in order of appearance. Duplicates are kept.
    """
    if not content:
        return []

    # ``https?`` accepts both schemes. The previous ``http?`` made the final
    # "p" optional instead, so HTTPS links were never matched.
    link_pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)"

    image_upload_file_ids = []
    for url, preview_kind in re.findall(link_pattern, content):
        # Search only for the suffix kind that terminated this URL, so the id
        # always comes from a ``files/<id>/<kind>`` segment of that same kind.
        id_match = re.search(rf"files/([^/]+)/{preview_kind}", url)
        if id_match:
            image_upload_file_ids.append(id_match.group(1))
    return image_upload_file_ids
core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.web_reader_tool import get_image_upload_file_ids +from core.tools.utils.rag_web_reader import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import ( diff --git a/api/tasks/clean_document_task.py b/api/tasks/clean_document_task.py index 54c89450c9..1adc3ba5c3 100644 --- a/api/tasks/clean_document_task.py +++ b/api/tasks/clean_document_task.py @@ -6,7 +6,7 @@ import click from celery import shared_task from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.web_reader_tool import get_image_upload_file_ids +from core.tools.utils.rag_web_reader import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import Dataset, DocumentSegment