Log final site URL in addition to logging domain
ghostwords committed Jan 8, 2025
1 parent d029d68 commit 4e4d5f2
Showing 2 changed files with 13 additions and 6 deletions.
8 changes: 5 additions & 3 deletions crawler.py
@@ -183,7 +183,7 @@ def get_recently_failed_domains(since_date):
 
 error_pattern = re.compile("(?:Error loading|Exception on) ([^: ]+):")
 num_scans = len(revisions)
-timeout_pattern = re.compile("Timed out loading ([^ ]+)$")
+timeout_pattern = re.compile("Timed out loading ([^ ]+)(?: on |$)")
 timeout_counts = {}
 logs = []
 
@@ -1036,7 +1036,8 @@ def crawl(self, domains):
 domain, CHROME_URL_PREFIX)
 continue
 
-self.logger.info("Visited %s", curl or domain)
+self.logger.info("Visited %s%s",
+                 domain, (" on " + curl if curl else ""))
 num_visited += 1
 
 except (MaxRetryError, ProtocolError, ReadTimeoutError) as ex:
@@ -1047,7 +1048,8 @@ def crawl(self, domains):
 curl = self.get_current_url()
 if curl and curl.startswith((FF_URL_PREFIX, CHROME_URL_PREFIX)):
 curl = None
-self.logger.warning("Timed out loading %s", curl or domain)
+self.logger.warning("Timed out loading %s%s",
+                    domain, (" on " + curl if curl else ""))
 
 except WebDriverException as ex:
 self.logger.error("%s on %s: %s",
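With this commit the crawler logs the original domain first and appends the final URL only when one could be read, and the timeout regex in get_recently_failed_domains accepts both the old and the new message formats. A minimal sketch of both sides of the change (standalone Python; the per-line matching and the example URL are assumptions for illustration):

```python
import re

# The new logger calls put the domain first and append the final URL only
# when the crawler could read one (curl may be None):
domain = "example.club"
curl = "https://example.club/landing"  # hypothetical final URL after redirects
message = "Timed out loading %s%s" % (domain, (" on " + curl if curl else ""))
# -> "Timed out loading example.club on https://example.club/landing"

# Updated pattern from this commit: the domain is captured up to the first
# space, terminated either by " on " or by the end of the line.
timeout_pattern = re.compile("Timed out loading ([^ ]+)(?: on |$)")

for line in ("Timed out loading example.club", message):
    m = timeout_pattern.search(line)
    if m:
        print(m.group(1))  # both formats yield "example.club"
```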
11 changes: 8 additions & 3 deletions tests/sitelist_test.py
@@ -76,22 +76,27 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument
 "Visiting 4: example.website",
 "Timed out loading example.website",
 "Visiting 5: example.net",
-"InsecureCertificateException on example.net: ZZZ"])
+"InsecureCertificateException on example.net: ZZZ",
+"Visiting 4: example.club",
+"Timed out loading example.club"])
 
 if cmd == "git show klmno:log.txt":
 return "\n".join(["Visiting 1: example.website",
 "Timed out loading example.website",
 "Visiting 2: example.com",
 "Error loading extension page (JavascriptException):",
 "Visiting 3: example.us",
-"Error loading example.us:"])
+"Error loading example.us:",
+"Visiting 4: example.club",
+"Timed out loading example.club on https://example.club"])
 
 return ""
 
 monkeypatch.setattr(crawler, "run", mock_run)
 
 expected_domains_set = set(["example.com", "example.net",
 "example.org", "example.co.uk",
-"example.website", "example.us"])
+"example.website", "example.us",
+"example.club"])
 
 assert crawler.get_recently_failed_domains("1 week ago") == expected_domains_set
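The updated fixtures exercise both timeout message formats, and example.club should be reported either way. A simplified, per-line sketch of how the two patterns pull domains out of the mocked klmno log above; the real get_recently_failed_domains also tracks timeout counts across scans, so this only illustrates the pattern matching, not the full selection logic:

```python
import re

# Patterns as they appear in crawler.py after this commit.
error_pattern = re.compile("(?:Error loading|Exception on) ([^: ]+):")
timeout_pattern = re.compile("Timed out loading ([^ ]+)(?: on |$)")

# The mocked "git show klmno:log.txt" output from the test above.
log = "\n".join([
    "Visiting 1: example.website",
    "Timed out loading example.website",
    "Visiting 2: example.com",
    "Error loading extension page (JavascriptException):",
    "Visiting 3: example.us",
    "Error loading example.us:",
    "Visiting 4: example.club",
    "Timed out loading example.club on https://example.club"])

domains = set()
for line in log.splitlines():
    for pattern in (error_pattern, timeout_pattern):
        m = pattern.search(line)
        if m:
            domains.add(m.group(1))

print(sorted(domains))
# ['example.club', 'example.us', 'example.website']
```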
