Skip to content

Commit

Permalink
Update _leak_lookup.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
usmancout committed Feb 24, 2025
1 parent f80bdcc commit b76d82c
Showing 1 changed file with 15 additions and 8 deletions.
23 changes: 15 additions & 8 deletions shared_collector/scripts/_leak_lookup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -57,8 +57,15 @@ def parse_leak_data(self, page: Page):
if not link_element:
continue

site_name = link_element.inner_text()
site_name = link_element.inner_text().strip()
site_url = link_element.get_attribute("href")

breach_size_element = row.query_selector("td.d-xl-table-cell:nth-of-type(2)")
breach_size = breach_size_element.inner_text().strip() if breach_size_element else "Unknown"

date_indexed_element = row.query_selector("td.d-xl-table-cell:nth-of-type(3)")
date_indexed = date_indexed_element.inner_text().strip() if date_indexed_element else "Unknown"

dropdown_button = row.query_selector("td .dropdown a")
if dropdown_button:
dropdown_button.click()
Expand All @@ -74,17 +81,17 @@ def parse_leak_data(self, page: Page):
modal_content = page.query_selector("#breachModal .modal-body")
modal_text = modal_content.inner_text() if modal_content else "No data available"

modal_text_cleaned = "\n".join([line.strip() for line in modal_text.split("\n") if line.strip()])

self._card_data.append(card_extraction_model(
m_title=site_name,
m_url=site_url,
m_base_url=self.base_url,
m_content=modal_text,
m_content=modal_text_cleaned,
m_network=helper_method.get_network_type(self.base_url),
m_important_content=modal_text,
m_weblink=[],
m_dumplink=[],
m_email_addresses=helper_method.extract_emails(modal_text),
m_phone_numbers=helper_method.extract_phone_numbers(modal_text),
m_important_content=modal_text_cleaned,
m_data_size=breach_size,
m_leak_date=date_indexed,
m_content_type="leaks",
))

Expand All @@ -99,4 +106,4 @@ def parse_leak_data(self, page: Page):
page.wait_for_selector("table tr", timeout=5000)
page.wait_for_timeout(3000)
else:
break
break

0 comments on commit b76d82c

Please sign in to comment.