Fix a bug in header-row handling when parsing xls files, and tune the xls/xlsx parsing results to be more structured (#3600)

This commit is contained in:
YC 2024-06-05 15:28:43 +08:00 committed by GitHub
parent 80a87f36ea
commit 9f8ca75a81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
documents = []
# loop over all sheets
for sheet in wb.sheets():
for row_index, row in enumerate(sheet.get_rows(), start=1):
row_header = None
row_header = None
for row_index, row in enumerate(sheet.get_rows(), start=1):
if self.is_blank_row(row):
continue
if row_header is None:
@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
item_arr = []
for index, cell in enumerate(row):
txt_value = str(cell.value)
item_arr.append(f'{row_header[index].value}:{txt_value}')
item_str = "\n".join(item_arr)
item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
item_str = ",".join(item_arr)
document = Document(page_content=item_str, metadata={'source': self._file_path})
documents.append(document)
return documents
@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
# transform each row into a Document
for _, row in df.iterrows():
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
document = Document(page_content=item, metadata={'source': self._file_path})
data.append(document)
return data