From 2fa25cb992e748acc10c6e42b13659bbf649953e Mon Sep 17 00:00:00 2001 From: Dmitriy Vasilyuk <111213624+reasonmethis@users.noreply.github.com> Date: Mon, 4 Nov 2024 23:33:39 -0800 Subject: [PATCH 1/2] [Fix] Prevent Python Firecrawl logger from interfering with loggers in client applications (#613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Prevent firecrawl logger from interfering with other loggers * Always check if logger has handlers --------- Co-authored-by: Gergő Móricz --- apps/python-sdk/firecrawl/__init__.py | 40 +++++++++++++++++++++------ 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 82c73348..23d22cf4 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/ import logging import os -from .firecrawl import FirecrawlApp +from .firecrawl import FirecrawlApp # noqa __version__ = "1.4.0" @@ -19,24 +19,46 @@ __version__ = "1.4.0" logger: logging.Logger = logging.getLogger("firecrawl") -def _basic_config() -> None: - """Set up basic configuration for logging with a specific format and date format.""" +def _configure_logger() -> None: + """ + Configure the firecrawl logger for console output. + + The function attaches a handler for console output with a specific format and date + format to the firecrawl logger. + """ try: - logging.basicConfig( - format="[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", + # Create the formatter + formatter = logging.Formatter( + "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) + + # Create the console handler and set the formatter + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + + # Add the console handler to the firecrawl logger + logger.addHandler(console_handler) except Exception as e: logger.error("Failed to configure logging: %s", e) def setup_logging() -> None: """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable.""" - env = os.environ.get( - "FIRECRAWL_LOGGING_LEVEL", "INFO" - ).upper() # Default to 'INFO' level - _basic_config() + # Check if the firecrawl logger already has a handler + if logger.hasHandlers(): + return # To prevent duplicate logging + # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set + if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()): + # Attach a no-op handler to prevent warnings about no handlers + logger.addHandler(logging.NullHandler()) + return + + # Attach the console handler to the firecrawl logger + _configure_logger() + + # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable if env == "DEBUG": logger.setLevel(logging.DEBUG) elif env == "INFO": From f07bbef78e4af19fe44003ea82a6a056254e1168 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:11:49 -0300 Subject: [PATCH 2/2] added trycatch and removed redundancy --- .../api/src/scraper/WebScraper/utils/utils.ts | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 872adc6e..849d209d 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -29,27 +29,28 @@ export function extractLinks(html: string, baseUrl: string): string[] { const $ = cheerio.load(html); const links: string[] = []; - // Parse the base URL to get the origin - const urlObject = new URL(baseUrl); - const origin = urlObject.origin; - $('a').each((_, element) => { const href = $(element).attr('href'); if (href) { - if (href.startsWith('http://') || href.startsWith('https://')) { - // Absolute URL, add as is - links.push(href); - } else if (href.startsWith('/')) { - // Relative URL starting with '/', append to origin - links.push(new URL(href, baseUrl).href); - } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { - // Relative URL not starting with '/', append to base URL - links.push(new URL(href, baseUrl).href); - } else if (href.startsWith('mailto:')) { - // mailto: links, add as is - links.push(href); + try { + if (href.startsWith('http://') || href.startsWith('https://')) { + // Absolute URL, add as is + links.push(href); + } else if (href.startsWith('/')) { + // Relative URL starting with '/', append to base URL + links.push(new URL(href, baseUrl).href); + } else if (!href.startsWith('#') && !href.startsWith('mailto:')) { + // Relative URL not starting with '/', append to base URL + links.push(new URL(href, baseUrl).href); + } else if (href.startsWith('mailto:')) { + // mailto: links, add as is + links.push(href); + } + // Fragment-only links (#) are ignored + } catch (error) { + // Log the error and continue + console.error(`Failed to construct URL for href: ${href} with base: ${baseUrl}`, error); } - // Fragment-only links (#) are ignored } });