adding unit tests and bugfixing

This commit is contained in:
rafaelsideguide 2024-04-17 14:54:54 -03:00
parent d23a7ae591
commit b375ce3e39
2 changed files with 107 additions and 5 deletions

View File

@@ -0,0 +1,97 @@
import { WebScraperDataProvider } from '../index';
// Unit tests for WebScraperDataProvider.replaceImgPathsWithAbsolutePaths.
describe('WebScraperDataProvider', () => {
  describe('replaceImgPathsWithAbsolutePaths', () => {
    // One entry per test: `input` is handed to the method under test and
    // `expected` is what the returned documents must deep-equal.
    const scenarios = [
      {
        title: 'should replace image paths with absolute paths',
        // Root-relative and dot-relative image paths are expected to end up
        // under https://example.com; the data: URI must come back untouched.
        input: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: '![alt text](/image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: '![another alt text](./another-image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/data-image' },
            content: '![data image](data:image/png;base64,...)',
          },
        ],
        expected: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: '![alt text](https://example.com/image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: '![another alt text](https://example.com/another-image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/data-image' },
            content: '![data image](data:image/png;base64,...)',
          },
        ],
      },
      {
        title: 'should handle absolute URLs without modification',
        // Images that already carry an http(s) URL, same host or not, must be
        // returned exactly as they were given.
        input: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: '![alt text](https://example.com/image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: '![another alt text](http://anotherexample.com/another-image.png)',
          },
        ],
        expected: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: '![alt text](https://example.com/image.png)',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: '![another alt text](http://anotherexample.com/another-image.png)',
          },
        ],
      },
      {
        title: 'should not replace non-image content within the documents',
        // Surrounding prose, plain links and bold markup must survive; only
        // the image target inside each document is expected to change.
        input: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: 'This is a test. ![alt text](/image.png) Here is a link: [Example](https://example.com).',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: 'Another test. ![another alt text](./another-image.png) Here is some **bold text**.',
          },
        ],
        expected: [
          {
            metadata: { sourceURL: 'https://example.com/page' },
            content: 'This is a test. ![alt text](https://example.com/image.png) Here is a link: [Example](https://example.com).',
          },
          {
            metadata: { sourceURL: 'https://example.com/another-page' },
            content: 'Another test. ![another alt text](https://example.com/another-image.png) Here is some **bold text**.',
          },
        ],
      },
    ];

    // Register the tests in the listed order, each with its own fresh provider.
    for (const { title, input, expected } of scenarios) {
      it(title, () => {
        const provider = new WebScraperDataProvider();
        const actual = provider.replaceImgPathsWithAbsolutePaths(input);
        expect(actual).toEqual(expected);
      });
    }
  });
});

View File

@@ -325,19 +325,24 @@ export class WebScraperDataProvider {
documents.forEach(document => {
const baseUrl = new URL(document.metadata.sourceURL).origin;
const images = document.content.match(/!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g) || [];
images.forEach(image => {
let imageUrl = image.match(/\(([^)]+)\)/)[1];
let altText = image.match(/\[(.*?)\]/)[1];
if (!imageUrl.startsWith("data:image")) {
imageUrl = baseUrl + imageUrl;
if (!imageUrl.startsWith("http")) {
if (imageUrl.startsWith("/")) {
imageUrl = imageUrl.substring(1);
}
imageUrl = new URL(imageUrl, baseUrl).toString();
}
}
document.content = document.content.replace(image, `![${altText}](${imageUrl})`);
});
});
return documents;
}
}