Merge remote-tracking branch 'origin/main'

This commit is contained in:
Han Xiao 2024-04-13 11:42:21 -07:00
commit e050a5bffa
5 changed files with 357 additions and 13 deletions

View File

@ -27,6 +27,8 @@
"minio": "^7.1.3", "minio": "^7.1.3",
"openai": "^4.20.0", "openai": "^4.20.0",
"puppeteer": "^22.6.3", "puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"stripe": "^11.11.0", "stripe": "^11.11.0",
"tiktoken": "^1.0.10", "tiktoken": "^1.0.10",
"turndown": "^7.1.3", "turndown": "^7.1.3",
@ -2526,6 +2528,14 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/debug": {
"version": "4.1.12",
"resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
"integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==",
"dependencies": {
"@types/ms": "*"
}
},
"node_modules/@types/express": { "node_modules/@types/express": {
"version": "4.17.3", "version": "4.17.3",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz", "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz",
@ -2673,6 +2683,11 @@
"integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==", "integrity": "sha512-K0VQKziLUWkVKiRVrx4a40iPaxTUefQmjtkQofBkYRcoaaL/8rhwDWww9qWbrgicNOgnpIsMxyNIUM4+n6dUIA==",
"optional": true "optional": true
}, },
"node_modules/@types/ms": {
"version": "0.7.34",
"resolved": "https://registry.npmjs.org/@types/ms/-/ms-0.7.34.tgz",
"integrity": "sha512-nG96G3Wp6acyAgJqGasjODb+acrI7KltPiRxzHPXnP3NgI28bpQDRv53olbqGXbfcgF5aiiHmO3xpwEpS5Ld9g=="
},
"node_modules/@types/node": { "node_modules/@types/node": {
"version": "18.19.31", "version": "18.19.31",
"resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.31.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.31.tgz",
@ -3234,6 +3249,14 @@
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q=="
}, },
"node_modules/arr-union": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
"integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/array-buffer-byte-length": { "node_modules/array-buffer-byte-length": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz", "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz",
@ -4076,6 +4099,21 @@
"node": ">=12" "node": ">=12"
} }
}, },
"node_modules/clone-deep": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz",
"integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==",
"dependencies": {
"for-own": "^0.1.3",
"is-plain-object": "^2.0.1",
"kind-of": "^3.0.2",
"lazy-cache": "^1.0.3",
"shallow-clone": "^0.1.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/co": { "node_modules/co": {
"version": "4.6.0", "version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@ -4466,8 +4504,6 @@
"version": "4.3.1", "version": "4.3.1",
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
"dev": true,
"peer": true,
"engines": { "engines": {
"node": ">=0.10.0" "node": ">=0.10.0"
} }
@ -5739,6 +5775,25 @@
"is-callable": "^1.1.3" "is-callable": "^1.1.3"
} }
}, },
"node_modules/for-in": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
"integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/for-own": {
"version": "0.1.5",
"resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz",
"integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==",
"dependencies": {
"for-in": "^1.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/form-data": { "node_modules/form-data": {
"version": "4.0.0", "version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
@ -6786,6 +6841,11 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/is-buffer": {
"version": "1.1.6",
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
"integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
},
"node_modules/is-callable": { "node_modules/is-callable": {
"version": "1.2.7", "version": "1.2.7",
"resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz",
@ -6839,6 +6899,14 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/is-extendable": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
"integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/is-extglob": { "node_modules/is-extglob": {
"version": "2.1.1", "version": "2.1.1",
"resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
@ -6948,6 +7016,17 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/is-plain-object": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
"dependencies": {
"isobject": "^3.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/is-regex": { "node_modules/is-regex": {
"version": "1.1.4", "version": "1.1.4",
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz",
@ -7064,6 +7143,14 @@
"integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
"dev": true "dev": true
}, },
"node_modules/isobject": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
"integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/istanbul-lib-coverage": { "node_modules/istanbul-lib-coverage": {
"version": "3.2.2", "version": "3.2.2",
"resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz",
@ -8049,6 +8136,17 @@
"json-buffer": "3.0.1" "json-buffer": "3.0.1"
} }
}, },
"node_modules/kind-of": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
"integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
"dependencies": {
"is-buffer": "^1.1.5"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/klaw": { "node_modules/klaw": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/klaw/-/klaw-3.0.0.tgz", "resolved": "https://registry.npmjs.org/klaw/-/klaw-3.0.0.tgz",
@ -8184,6 +8282,14 @@
"unicode-9.0.0": "0.7.0" "unicode-9.0.0": "0.7.0"
} }
}, },
"node_modules/lazy-cache": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
"integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/lazystream": { "node_modules/lazystream": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz", "resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz",
@ -8504,6 +8610,19 @@
"optional": true, "optional": true,
"peer": true "peer": true
}, },
"node_modules/merge-deep": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz",
"integrity": "sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==",
"dependencies": {
"arr-union": "^3.1.0",
"clone-deep": "^0.2.4",
"kind-of": "^3.0.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/merge-descriptors": { "node_modules/merge-descriptors": {
"version": "1.0.1", "version": "1.0.1",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
@ -8672,6 +8791,26 @@
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==" "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw=="
}, },
"node_modules/mixin-object": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz",
"integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==",
"dependencies": {
"for-in": "^0.1.3",
"is-extendable": "^0.1.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/mixin-object/node_modules/for-in": {
"version": "0.1.8",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz",
"integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/mkdirp": { "node_modules/mkdirp": {
"version": "1.0.4", "version": "1.0.4",
"resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz",
@ -9719,6 +9858,150 @@
"node": ">=18" "node": ">=18"
} }
}, },
"node_modules/puppeteer-extra": {
"version": "3.3.6",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz",
"integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==",
"dependencies": {
"@types/debug": "^4.1.0",
"debug": "^4.1.1",
"deepmerge": "^4.2.2"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"@types/puppeteer": "*",
"puppeteer": "*",
"puppeteer-core": "*"
},
"peerDependenciesMeta": {
"@types/puppeteer": {
"optional": true
},
"puppeteer": {
"optional": true
},
"puppeteer-core": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin": {
"version": "3.2.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz",
"integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==",
"dependencies": {
"@types/debug": "^4.1.0",
"debug": "^4.1.1",
"merge-deep": "^3.0.1"
},
"engines": {
"node": ">=9.11.2"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-stealth": {
"version": "2.11.2",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
"integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==",
"dependencies": {
"debug": "^4.1.1",
"puppeteer-extra-plugin": "^3.2.3",
"puppeteer-extra-plugin-user-preferences": "^2.4.1"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-user-data-dir": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz",
"integrity": "sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==",
"dependencies": {
"debug": "^4.1.1",
"fs-extra": "^10.0.0",
"puppeteer-extra-plugin": "^3.2.3",
"rimraf": "^3.0.2"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-user-data-dir/node_modules/fs-extra": {
"version": "10.1.0",
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz",
"integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==",
"dependencies": {
"graceful-fs": "^4.2.0",
"jsonfile": "^6.0.1",
"universalify": "^2.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/puppeteer-extra-plugin-user-preferences": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz",
"integrity": "sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==",
"dependencies": {
"debug": "^4.1.1",
"deepmerge": "^4.2.2",
"puppeteer-extra-plugin": "^3.2.3",
"puppeteer-extra-plugin-user-data-dir": "^2.4.1"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/pure-rand": { "node_modules/pure-rand": {
"version": "6.1.0", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz",
@ -10314,6 +10597,39 @@
"resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz",
"integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==" "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw=="
}, },
"node_modules/shallow-clone": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
"integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==",
"dependencies": {
"is-extendable": "^0.1.1",
"kind-of": "^2.0.1",
"lazy-cache": "^0.2.3",
"mixin-object": "^2.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shallow-clone/node_modules/kind-of": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz",
"integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==",
"dependencies": {
"is-buffer": "^1.0.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shallow-clone/node_modules/lazy-cache": {
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz",
"integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shebang-command": { "node_modules/shebang-command": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",

View File

@ -47,6 +47,8 @@
"minio": "^7.1.3", "minio": "^7.1.3",
"openai": "^4.20.0", "openai": "^4.20.0",
"puppeteer": "^22.6.3", "puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"stripe": "^11.11.0", "stripe": "^11.11.0",
"tiktoken": "^1.0.10", "tiktoken": "^1.0.10",
"turndown": "^7.1.3", "turndown": "^7.1.3",

View File

@ -1,4 +1,4 @@
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError } from 'civkit'; import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash'; import _ from 'lodash';
@ -32,11 +32,11 @@ export class CrawlerHost extends RPCHost {
const toBeTurnedToMd = snapshot.parsed?.content; const toBeTurnedToMd = snapshot.parsed?.content;
const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : ''; const turnedDown = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd).trim() : '';
const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text.trim(); const contentText = turnedDown && !(turnedDown.startsWith('<') && turnedDown.endsWith('>')) ? turnedDown : snapshot.text?.trim();
const formatted = { const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: snapshot.href.trim(), url: snapshot.href?.trim(),
content: contentText.trim(), content: contentText.trim(),
toString() { toString() {
@ -80,7 +80,15 @@ ${this.content}
}, },
) { ) {
const noSlashURL = ctx.req.url.slice(1); const noSlashURL = ctx.req.url.slice(1);
const urlToCrawl = new URL(normalizeUrl(noSlashURL)); let urlToCrawl;
try {
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim()));
} catch (err) {
throw new ParamValidationError({
message: `${err}`,
path: 'url'
});
}
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']); const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
const noCache = Boolean(ctx.req.headers['x-no-cache']); const noCache = Boolean(ctx.req.headers['x-no-cache']);
@ -125,7 +133,7 @@ ${this.content}
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (!scrapped?.parsed?.content) { if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue; continue;
} }
@ -143,7 +151,7 @@ ${this.content}
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (!scrapped?.parsed?.content) { if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue; continue;
} }

View File

@ -21,6 +21,7 @@ initializeApp();
import { loadModulesDynamically, registry } from './shared'; import { loadModulesDynamically, registry } from './shared';
import path from 'path'; import path from 'path';
import { ApplicationError } from 'civkit';
loadModulesDynamically(path.resolve(__dirname, 'cloud-functions')); loadModulesDynamically(path.resolve(__dirname, 'cloud-functions'));
Object.assign(exports, registry.exportAll()); Object.assign(exports, registry.exportAll());
@ -31,4 +32,12 @@ Object.assign(exports, registry.exportGrouped({
registry.title = 'reader'; registry.title = 'reader';
registry.version = '0.1.0'; registry.version = '0.1.0';
process.on('unhandledRejection', () => 'no big deal'); process.on('unhandledRejection', (err) => {
// Walk around Firebase runtime bug.
if (err instanceof ApplicationError) {
// Application error shall not crash the process;
return;
}
throw err;
});

View File

@ -1,11 +1,13 @@
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit'; import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import puppeteer, { Browser } from 'puppeteer'; import type { Browser } from 'puppeteer';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
import genericPool from 'generic-pool'; import genericPool from 'generic-pool';
import os from 'os'; import os from 'os';
import fs from 'fs'; import fs from 'fs';
import { Crawled } from '../db/crawled'; import { Crawled } from '../db/crawled';
import puppeteer from 'puppeteer-extra';
import puppeteerStealth from 'puppeteer-extra-plugin-stealth';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@ -31,6 +33,12 @@ export interface PageSnapshot {
} }
const md5Hasher = new HashManager('md5', 'hex'); const md5Hasher = new HashManager('md5', 'hex');
puppeteer.use(puppeteerStealth());
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
// puppeteer.use(puppeteerUAOverride({
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
// }))
@singleton() @singleton()
export class PuppeteerControl extends AsyncService { export class PuppeteerControl extends AsyncService {
@ -77,9 +85,10 @@ export class PuppeteerControl extends AsyncService {
headless: true, headless: true,
timeout: 10_000 timeout: 10_000
}).catch((err) => { }).catch((err) => {
this.logger.error(`Unknown firebase issue, just die fast, quitting process.`, { err }); this.logger.error(`Unknown firebase issue, just die fast.`, { err });
process.nextTick(() => { process.nextTick(() => {
process.exit(1); this.emit('error', err);
// process.exit(1);
}); });
return Promise.reject(err); return Promise.reject(err);
}); });
@ -100,7 +109,7 @@ export class PuppeteerControl extends AsyncService {
const preparations = []; const preparations = [];
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true)); preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1920, height: 1080 })); preparations.push(page.setViewport({ width: 1920, height: 1080 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {