From cf258b7a677d42ba83c739d7f7581ee79f5ac62c Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Fri, 26 Jul 2024 19:26:52 +0800
Subject: [PATCH] add xlsx support hyperlink extract (#6722)

---
 api/core/rag/extractor/excel_extractor.py | 51 ++++++++++++++++-------
 api/poetry.lock                           |  2 +-
 api/pyproject.toml                        |  1 +
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py
index 2b16275dc8..f0c302a619 100644
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -3,6 +3,7 @@ import os
 from typing import Optional
 
 import pandas as pd
+from openpyxl import load_workbook
 
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
@@ -28,26 +29,48 @@ class ExcelExtractor(BaseExtractor):
         self._autodetect_encoding = autodetect_encoding
 
     def extract(self) -> list[Document]:
-        """ Load from Excel file in xls or xlsx format using Pandas."""
+        """ Load from Excel file in xls or xlsx format using Pandas and openpyxl."""
         documents = []
-        # Determine the file extension
         file_extension = os.path.splitext(self._file_path)[-1].lower()
-        # Read each worksheet of an Excel file using Pandas
+
         if file_extension == '.xlsx':
-            excel_file = pd.ExcelFile(self._file_path, engine='openpyxl')
+            wb = load_workbook(self._file_path, data_only=True)
+            for sheet_name in wb.sheetnames:
+                sheet = wb[sheet_name]
+                data = sheet.values
+                cols = next(data)
+                df = pd.DataFrame(data, columns=cols)
+
+                df.dropna(how='all', inplace=True)
+
+                for index, row in df.iterrows():
+                    page_content = []
+                    for col_index, (k, v) in enumerate(row.items()):
+                        if pd.notna(v):
+                            cell = sheet.cell(row=index + 2,
+                                              column=col_index + 1) # +2 to account for header and 1-based index
+                            if cell.hyperlink:
+                                value = f"[{v}]({cell.hyperlink.target})"
+                                page_content.append(f'"{k}":"{value}"')
+                            else:
+                                page_content.append(f'"{k}":"{v}"')
+                    documents.append(Document(page_content=';'.join(page_content),
+                                              metadata={'source': self._file_path}))
+
         elif file_extension == '.xls':
             excel_file = pd.ExcelFile(self._file_path, engine='xlrd')
+            for sheet_name in excel_file.sheet_names:
+                df = excel_file.parse(sheet_name=sheet_name)
+                df.dropna(how='all', inplace=True)
+
+                for _, row in df.iterrows():
+                    page_content = []
+                    for k, v in row.items():
+                        if pd.notna(v):
+                            page_content.append(f'"{k}":"{v}"')
+                    documents.append(Document(page_content=';'.join(page_content),
+                                              metadata={'source': self._file_path}))
         else:
             raise ValueError(f"Unsupported file extension: {file_extension}")
-        for sheet_name in excel_file.sheet_names:
-            df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
-
-            # filter out rows with all NaN values
-            df.dropna(how='all', inplace=True)
-
-            # transform each row into a Document
-            documents += [Document(page_content=';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)),
-                                   metadata={'source': self._file_path},
-                                   ) for _, row in df.iterrows()]
 
         return documents
diff --git a/api/poetry.lock b/api/poetry.lock
index 2a277dac2d..0abd2d2175 100644
--- a/api/poetry.lock
+++ b/api/poetry.lock
@@ -9543,4 +9543,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "6b7d8b1333ae9c71ba2e1c5800eecf1535ed3945cd55ebb1e253b7a29ba09559"
+content-hash = "9619ddabdd67710981c13dcfa3ddae0a48497c9f694afc81b820e882440c1265"
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 7be3c7af64..430e3d79c4 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -177,6 +177,7 @@ xinference-client = "0.9.4"
 yarl = "~1.9.4"
 zhipuai = "1.0.7"
 rank-bm25 = "~0.2.2"
+openpyxl = "^3.1.5"
 ############################################################
 # Tool dependencies required by tool implementations
 ############################################################