mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
Fix a bug in header-row handling when parsing xls files, and tune the xls/xlsx parsing output to be more structured (#3600)
This commit is contained in:
parent
80a87f36ea
commit
9f8ca75a81
|
@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
|
|||
documents = []
|
||||
# loop over all sheets
|
||||
for sheet in wb.sheets():
|
||||
for row_index, row in enumerate(sheet.get_rows(), start=1):
|
||||
row_header = None
|
||||
row_header = None
|
||||
for row_index, row in enumerate(sheet.get_rows(), start=1):
|
||||
if self.is_blank_row(row):
|
||||
continue
|
||||
if row_header is None:
|
||||
|
@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
|
|||
item_arr = []
|
||||
for index, cell in enumerate(row):
|
||||
txt_value = str(cell.value)
|
||||
item_arr.append(f'{row_header[index].value}:{txt_value}')
|
||||
item_str = "\n".join(item_arr)
|
||||
item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
|
||||
item_str = ",".join(item_arr)
|
||||
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
||||
documents.append(document)
|
||||
return documents
|
||||
|
@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
|
|||
|
||||
# transform each row into a Document
|
||||
for _, row in df.iterrows():
|
||||
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
||||
item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
|
||||
document = Document(page_content=item, metadata={'source': self._file_path})
|
||||
data.append(document)
|
||||
return data
|
||||
|
|
Loading…
Reference in New Issue
Block a user