from abc import ABC
from typing import List

from playwright.sync_api import Page

from crawler.crawler_instance.local_interface_model.leak_extractor_interface import leak_extractor_interface
from crawler.crawler_instance.local_shared_model.card_extraction_model import card_extraction_model
from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig
from crawler.crawler_services.redis_manager.redis_controller import redis_controller
from crawler.crawler_services.redis_manager.redis_enums import REDIS_COMMANDS, CUSTOM_SCRIPT_REDIS_KEYS
from crawler.crawler_services.shared.helper_method import helper_method


class _bianlianlbc5an4kgnay3opdemgcryg2kpfcbgczopmm3dnbz3uaunad(leak_extractor_interface, ABC):
    """Leak extractor for the BianLian .onion leak site.

    Implemented as a singleton: repeated instantiation returns the same
    object so cards collected by parse_leak_data accumulate across calls.
    """

    _instance = None  # cached singleton instance

    def __new__(cls):
        # Cache one instance per process; __init__ guards itself so the
        # cached instance is not re-initialized (see below).
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # BUG FIX: __init__ runs on *every* `Cls()` call, even when __new__
        # returns the cached singleton. The original version therefore wiped
        # _card_data and rebuilt the redis controller on each instantiation.
        if getattr(self, "_initialized", False):
            return
        self._card_data = []                     # collected card_extraction_model objects
        self._redis_instance = redis_controller()
        self._initialized = True

    @property
    def seed_url(self) -> str:
        # Entry-point URL for the crawl.
        return "http://bianlianlbc5an4kgnay3opdemgcryg2kpfcbgczopmm3dnbz3uaunad.onion"

    @property
    def base_url(self) -> str:
        # The site has no separate base path; it is identical to the seed.
        return self.seed_url

    @property
    def rule_config(self) -> RuleModel:
        # Fetch through TOR with the Selenium-based fetch pipeline.
        return RuleModel(m_fetch_proxy=FetchProxy.TOR, m_fetch_config=FetchConfig.SELENIUM)

    @property
    def card_data(self) -> List[card_extraction_model]:
        return self._card_data

    def invoke_db(self, command: REDIS_COMMANDS, key: CUSTOM_SCRIPT_REDIS_KEYS, default_value) -> None:
        # Redis keys are namespaced with the class name so per-script state
        # does not collide with other collectors.
        return self._redis_instance.invoke_trigger(command, [key.value + self.__class__.__name__, default_value])

    def contact_page(self) -> str:
        return "swikipedia@onionmail.org"

    def parse_leak_data(self, page: Page):
        """Extract one card per 'read more' entry on every listing page.

        Clicks into each entry, scrapes title/description/metadata, appends a
        card_extraction_model to self._card_data, navigates back, then follows
        the 'Next' pagination link until none remains. A failure on a single
        entry is logged and skipped; a failure outside the per-entry loop
        aborts the parse with a logged message.
        """
        try:
            while True:
                page.wait_for_load_state("networkidle")

                read_more_count = page.locator('a.readmore').count()
                if read_more_count == 0:
                    # Nothing to extract on this page; stop entirely.
                    return

                for i in range(read_more_count):
                    try:
                        # Re-query by index each iteration: going back re-renders
                        # the listing, invalidating previously held locators.
                        read_more_link = page.locator('a.readmore').nth(i)
                        read_more_link.click()
                        page.wait_for_load_state("networkidle")

                        title_elem = page.locator('article div.title h1.title')
                        title = title_elem.text_content().strip() if title_elem.count() > 0 else "No title found"

                        description_elem = page.locator('section.body p').first
                        description = description_elem.text_content().strip() if description_elem.count() > 0 else ""

                        weblink_elem = page.locator('section.body a[href^="https://"]').first
                        weblink = [weblink_elem.get_attribute('href')] if weblink_elem.count() > 0 else []

                        # "Revenue: $X" / "Data Volume: Y" -> keep text after the colon.
                        revenue_elem = page.locator('section.body p:has-text("Revenue")').first
                        revenue = revenue_elem.text_content().split(':')[-1].strip() if revenue_elem.count() > 0 else None

                        data_size_elem = page.locator('section.body p:has-text("Data Volume")').first
                        data_size = data_size_elem.text_content().split(':')[-1].strip() if data_size_elem.count() > 0 else None

                        # Download links are site-relative ("/dl/..."); prefix the base URL.
                        dump_links = [f"{self.base_url}{link.get_attribute('href')}"
                                      for link in page.locator('section.body a[href^="/dl/"]').all()]

                        # BUG FIX: get_attribute('src') may return None, which the
                        # original passed straight to .startswith() and crashed.
                        # Skip images without a src; absolutize relative ones.
                        images = []
                        for img in page.locator('section.body img').all():
                            src = img.get_attribute('src')
                            if not src:
                                continue
                            images.append(src if src.startswith('http') else f"{self.base_url}{src}")

                        card_data = card_extraction_model(
                            m_title=title,
                            m_url=page.url,
                            m_base_url=self.base_url,
                            m_content=description,
                            m_network=helper_method.get_network_type(page.url),
                            m_important_content=description,
                            m_weblink=weblink,
                            m_logo_or_images=images,
                            m_dumplink=dump_links,
                            m_email_addresses=helper_method.extract_emails(description),
                            m_phone_numbers=helper_method.extract_phone_numbers(description),
                            m_content_type="leaks",
                            m_revenue=revenue,
                            m_data_size=data_size
                        )

                        self._card_data.append(card_data)

                        page.go_back()
                        page.wait_for_load_state("networkidle")

                    except Exception as link_ex:
                        # Best effort: log, try to return to the listing, move on.
                        print(f"Error processing entry {i}: {str(link_ex)}")
                        page.go_back()
                        page.wait_for_load_state("networkidle")
                        continue

                next_button = page.locator('a:has-text("Next")').first
                if next_button.count() > 0:
                    next_button.click()
                    page.wait_for_load_state("networkidle")
                else:
                    break

        except Exception as e:
            print(f"Error parsing leak data: {str(e)}")