mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 03:32:23 +08:00
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
parent
fbb9c1c249
commit
722964667f
|
@ -143,14 +143,14 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||
|
||||
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
||||
try:
|
||||
return file_content.decode("utf-8")
|
||||
return file_content.decode("utf-8", "ignore")
|
||||
except UnicodeDecodeError as e:
|
||||
raise TextExtractionError("Failed to decode plain text file") from e
|
||||
|
||||
|
||||
def _extract_text_from_json(file_content: bytes) -> str:
|
||||
try:
|
||||
json_data = json.loads(file_content.decode("utf-8"))
|
||||
json_data = json.loads(file_content.decode("utf-8", "ignore"))
|
||||
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
||||
except (UnicodeDecodeError, json.JSONDecodeError) as e:
|
||||
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
||||
|
@ -159,7 +159,7 @@ def _extract_text_from_json(file_content: bytes) -> str:
|
|||
def _extract_text_from_yaml(file_content: bytes) -> str:
|
||||
"""Extract the content from yaml file"""
|
||||
try:
|
||||
yaml_data = yaml.safe_load_all(file_content.decode("utf-8"))
|
||||
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
|
||||
return yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)
|
||||
except (UnicodeDecodeError, yaml.YAMLError) as e:
|
||||
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
|
||||
|
@ -217,7 +217,7 @@ def _extract_text_from_file(file: File):
|
|||
|
||||
def _extract_text_from_csv(file_content: bytes) -> str:
|
||||
try:
|
||||
csv_file = io.StringIO(file_content.decode("utf-8"))
|
||||
csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
|
||||
csv_reader = csv.reader(csv_file)
|
||||
rows = list(csv_reader)
|
||||
|
||||
|
|
|
@ -140,6 +140,17 @@ def test_extract_text_from_plain_text():
|
|||
assert text == "Hello, world!"
|
||||
|
||||
|
||||
def tet_extract_text_from_plain_text_non_utf8():
|
||||
import tempfile
|
||||
|
||||
non_utf8_content = b"Hello world\xa9." # \xA9 represents © in Latin-1
|
||||
with tempfile.NamedTemporaryFile(delete=True) as temp_file:
|
||||
temp_file.write(non_utf8_content)
|
||||
temp_file.seek(0)
|
||||
text = _extract_text_from_plain_text(temp_file.read())
|
||||
assert text == "Hello, world."
|
||||
|
||||
|
||||
@patch("pypdfium2.PdfDocument")
|
||||
def test_extract_text_from_pdf(mock_pdf_document):
|
||||
mock_page = Mock()
|
||||
|
|
Loading…
Reference in New Issue
Block a user