From 6300e506fbb3b52cde3869060418dad3f1425633 Mon Sep 17 00:00:00 2001 From: Yeuoly Date: Fri, 15 Nov 2024 15:54:14 +0800 Subject: [PATCH] fix: rag --- api/core/tools/entities/api_entities.py | 1 - api/core/tools/utils/rag_web_reader.py | 17 +++++++++++++++++ api/core/workflow/nodes/tool/tool_node.py | 2 -- api/tasks/clean_dataset_task.py | 2 +- api/tasks/clean_document_task.py | 2 +- 5 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 api/core/tools/utils/rag_web_reader.py diff --git a/api/core/tools/entities/api_entities.py b/api/core/tools/entities/api_entities.py index 724f35f1d9..2d8af8cabe 100644 --- a/api/core/tools/entities/api_entities.py +++ b/api/core/tools/entities/api_entities.py @@ -1,6 +1,5 @@ from typing import Literal, Optional -from pydantic import BaseModel, Field from pydantic import BaseModel, Field, field_validator from core.model_runtime.utils.encoders import jsonable_encoder diff --git a/api/core/tools/utils/rag_web_reader.py b/api/core/tools/utils/rag_web_reader.py new file mode 100644 index 0000000000..22c47fa814 --- /dev/null +++ b/api/core/tools/utils/rag_web_reader.py @@ -0,0 +1,17 @@ +import re + + +def get_image_upload_file_ids(content): + pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)" + matches = re.findall(pattern, content) + image_upload_file_ids = [] + for match in matches: + if match[1] == "file-preview": + content_pattern = r"files/([^/]+)/file-preview" + else: + content_pattern = r"files/([^/]+)/image-preview" + content_match = re.search(content_pattern, match[0]) + if content_match: + image_upload_file_id = content_match.group(1) + image_upload_file_ids.append(image_upload_file_id) + return image_upload_file_ids diff --git a/api/core/workflow/nodes/tool/tool_node.py b/api/core/workflow/nodes/tool/tool_node.py index 68014940b7..bcf92074f7 100644 --- a/api/core/workflow/nodes/tool/tool_node.py +++ b/api/core/workflow/nodes/tool/tool_node.py @@ -1,8 +1,6 @@ from collections.abc import 
Generator, Mapping, Sequence from os import path from typing import Any, cast -from collections.abc import Mapping, Sequence -from typing import Any from sqlalchemy import select from sqlalchemy.orm import Session diff --git a/api/tasks/clean_dataset_task.py b/api/tasks/clean_dataset_task.py index 4d45df4d2a..2e4fe8abdb 100644 --- a/api/tasks/clean_dataset_task.py +++ b/api/tasks/clean_dataset_task.py @@ -5,7 +5,7 @@ import click from celery import shared_task from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.web_reader_tool import get_image_upload_file_ids +from core.tools.utils.rag_web_reader import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import ( diff --git a/api/tasks/clean_document_task.py b/api/tasks/clean_document_task.py index 54c89450c9..1adc3ba5c3 100644 --- a/api/tasks/clean_document_task.py +++ b/api/tasks/clean_document_task.py @@ -6,7 +6,7 @@ import click from celery import shared_task from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.web_reader_tool import get_image_upload_file_ids +from core.tools.utils.rag_web_reader import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import Dataset, DocumentSegment