refactor: improve handling of leading punctuation removal

This commit is contained in:
Zan 2024-11-16 03:22:50 +08:00
parent d05fee1182
commit 4318d18402
3 changed files with 7 additions and 6 deletions

View File

@@ -500,8 +500,8 @@ class IndexingRunner:
document_node.metadata["doc_hash"] = hash
# delete Splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith(""):
page_content = page_content[1:]
if re.match(r"^[\p{P}\p{S}]+", page_content, re.UNICODE):
page_content = re.sub(r"^[\p{P}\p{S}]+", "", page_content)
else:
page_content = page_content
document_node.page_content = page_content

View File

@@ -1,6 +1,7 @@
"""Paragraph index processor."""
import uuid
import re
from typing import Optional
from core.rag.cleaner.clean_processor import CleanProcessor
@@ -44,8 +45,8 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
document_node.metadata["doc_hash"] = hash
# delete Splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith(""):
page_content = page_content[1:].strip()
if re.match(r"^[\p{P}\p{S}]+", page_content, re.UNICODE):
page_content = re.sub(r"^[\p{P}\p{S}]+", "", page_content).strip()
else:
page_content = page_content
if len(page_content) > 0:

View File

@@ -53,8 +53,8 @@ class QAIndexProcessor(BaseIndexProcessor):
document_node.metadata["doc_hash"] = hash
# delete Splitter character
page_content = document_node.page_content
if page_content.startswith(".") or page_content.startswith(""):
page_content = page_content[1:]
if re.match(r"^[\p{P}\p{S}]+", page_content, re.UNICODE):
page_content = re.sub(r"^[\p{P}\p{S}]+", "", page_content)
else:
page_content = page_content
document_node.page_content = page_content