From 76bc32e2e0a51cff993534734a3beb83b48fffb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D8=B9=D8=A8=D8=AF=D8=A7=D9=84=D9=85=D9=86=D8=A7=D9=86?= Date: Sat, 8 Feb 2025 22:53:06 +0500 Subject: [PATCH] code optimizations --- crawler/constants/enums.py | 48 +++++- .../card_extraction_model.py | 44 +++--- .../local_shared_model/rule_model.py | 2 +- .../crawler_services/shared/helper_method.py | 22 ++- crawler/request_manager.py | 46 ++++-- shared_collector/main.py | 7 +- ...sqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.py | 5 +- shared_collector/scripts/_darkfeed.py | 142 ++++++++++-------- shared_collector/scripts/_ddosecrets.py | 10 +- ...bvrjqtyro2hmhkmh6vkyfyjjzfllm3ix72aqaid.py | 10 +- shared_collector/scripts/_handala_hack.py | 5 +- shared_collector/scripts/_mirror_h.py | 8 +- ...yol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.py | 8 +- ...bisc42o2q2i54vdulyvtqqbudqousisjgc7j7yd.py | 3 +- ...5626k2ib6dds6zizjwuuashz67usjps2wehz4id.py | 11 +- shared_collector/scripts/_ransomwiki.py | 46 +----- ...pqu6bpzwztryeflq3s23tegbmnhkbpqz637f2yd.py | 5 +- shared_collector/scripts/_zone_xsec.py | 7 +- 18 files changed, 238 insertions(+), 191 deletions(-) diff --git a/crawler/constants/enums.py b/crawler/constants/enums.py index f06abb0..609998c 100644 --- a/crawler/constants/enums.py +++ b/crawler/constants/enums.py @@ -1,8 +1,42 @@ -from enum import Enum +VALID_NETWORK_TYPES = [ + "clearnet", + "i2p", + "onion", + "invalid" +] - -class network_type(Enum): - CLEARNET = "clearnet" - I2P = "i2p" - ONION = "onion" - INVALID = "invalid" +VALID_CONTENT_TYPES = [ + "general", + "forums", + "news", + "stolen", + "drugs", + "hacking", + "marketplaces", + "cryptocurrency", + "leaks", + "adult", + "carding", + "scams", + "ransomware", + "databases", + "money_laundering", + "counterfeit", + "malware", + "botnets", + "exploits", + "spam", + "chemicals", + "weapons", + "human_trafficking", + "csam", + "doxing", + "extortion", + "espionage", + "propaganda", + "terrorism", + "government_leaks", + "c2_panels", + "ddos", + "apt" +] diff --git a/crawler/crawler_instance/local_shared_model/card_extraction_model.py b/crawler/crawler_instance/local_shared_model/card_extraction_model.py index 2565d29..160ef9a 100644 --- a/crawler/crawler_instance/local_shared_model/card_extraction_model.py +++ b/crawler/crawler_instance/local_shared_model/card_extraction_model.py @@ -1,35 +1,43 @@ from dataclasses import dataclass, field from typing import List, Optional +from crawler.constants.enums import VALID_NETWORK_TYPES, VALID_CONTENT_TYPES + + @dataclass class card_extraction_model: - m_title: str = "" - m_url: str = "" - m_content: str = "" - m_base_url: str = "" - m_network: str = "" - m_important_content: str = "" - m_content_type: str = "general" + m_title: str + m_url: str + m_base_url: str + m_content: str + m_important_content: str + m_network: str + m_content_type: str m_weblink: List[str] = field(default_factory=list) m_dumplink: List[str] = field(default_factory=list) - m_extra_tags: List[str] = field(default_factory=list) - m_sections: List[str] = field(default_factory=list) m_name: str = "" m_email_addresses: List[str] = field(default_factory=list) + m_industry: Optional[str] = None m_phone_numbers: List[str] = field(default_factory=list) m_addresses: List[str] = field(default_factory=list) m_social_media_profiles: List[str] = field(default_factory=list) m_websites: List[str] = field(default_factory=list) m_company_name: Optional[str] = None - m_industry: Optional[str] = None - m_job_title_or_position: Optional[str] = None - 
m_associated_entities: List[str] = field(default_factory=list) - m_aliases_or_alternate_names: List[str] = field(default_factory=list) m_logo_or_images: List[str] = field(default_factory=list) - m_business_categories: List[str] = field(default_factory=list) - m_services_or_products: List[str] = field(default_factory=list) - m_public_records: List[str] = field(default_factory=list) - m_online_activity: List[str] = field(default_factory=list) m_leak_date: Optional[str] = None - m_last_updated: Optional[str] = None + m_data_size: Optional[str] = None + m_country_name: Optional[str] = None + m_revenue: Optional[str] = None + + def __post_init__(self): + required_fields = ["m_title", "m_url", "m_content", "m_base_url", "m_important_content"] + + for field_name in required_fields: + if getattr(self, field_name) is None: + raise ValueError(f"The field '{field_name}' is required and cannot be None.") + + if self.m_network not in VALID_NETWORK_TYPES: + raise ValueError(f"Invalid network type provided: {self.m_network}. Must be one of {', '.join(VALID_NETWORK_TYPES)}.") + if self.m_content_type not in VALID_CONTENT_TYPES: + raise ValueError(f"Invalid content type provided: {self.m_content_type}. Must be one of {', '.join(VALID_CONTENT_TYPES)}.") diff --git a/crawler/crawler_instance/local_shared_model/rule_model.py b/crawler/crawler_instance/local_shared_model/rule_model.py index 2c7bae6..1cc9f9a 100644 --- a/crawler/crawler_instance/local_shared_model/rule_model.py +++ b/crawler/crawler_instance/local_shared_model/rule_model.py @@ -9,7 +9,7 @@ class FetchProxy(str, Enum): NONE = "none" class RuleModel: - def __init__(self, m_timeout: int = 7200, m_fetch_config: FetchConfig = FetchConfig.SELENIUM, m_fetch_proxy: FetchProxy = FetchProxy.NONE): + def __init__(self, m_timeout: int = 17200, m_fetch_config: FetchConfig = FetchConfig.SELENIUM, m_fetch_proxy: FetchProxy = FetchProxy.NONE): self.m_timeout = m_timeout self.m_fetch_config = m_fetch_config self.m_fetch_proxy = m_fetch_proxy diff --git a/crawler/crawler_services/shared/helper_method.py b/crawler/crawler_services/shared/helper_method.py index 6f83248..f7a201b 100644 --- a/crawler/crawler_services/shared/helper_method.py +++ b/crawler/crawler_services/shared/helper_method.py @@ -1,7 +1,7 @@ # Local Imports import re from urllib.parse import urlparse -from crawler.constants.enums import network_type +from bs4 import BeautifulSoup class helper_method: @@ -12,20 +12,20 @@ def clean_text(text: str) -> str: return text @staticmethod - def get_network_type(url:str): + def get_network_type(url: str): try: if not url.startswith("http"): url = "http://" + url parsed_url = urlparse(url) if not parsed_url.scheme or not parsed_url.netloc: - return network_type.INVALID + return "invalid" if re.search(r"\.onion$", parsed_url.netloc, re.IGNORECASE): - return network_type.ONION + return "onion" if re.search(r"\.i2p$", parsed_url.netloc, re.IGNORECASE): - return network_type.I2P - return network_type.CLEARNET + return "i2p" + return "clearnet" except Exception: - return network_type.INVALID + return "invalid" @staticmethod def extract_emails(text: str) -> list: @@ -41,3 +41,11 @@ def extract_phone_numbers(text: str) -> list: phone_numbers = re.findall(phone_pattern, text) return phone_numbers + @staticmethod + def extract_text_from_html(html: str) -> str: + """ + Extracts and cleans text from an HTML string using BeautifulSoup. 
+ """ + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text(separator=' ') + return helper_method.clean_text(text) diff --git a/crawler/request_manager.py b/crawler/request_manager.py index f881fda..99d497c 100644 --- a/crawler/request_manager.py +++ b/crawler/request_manager.py @@ -1,4 +1,5 @@ import sys +import traceback from threading import Timer import redis @@ -34,7 +35,8 @@ def check_services_status(): print(f"Error: Redis server is not running or accessible. Details: {ex}") sys.exit(1) -def parse_leak_data(proxy: dict, model:leak_extractor_interface) -> tuple: + +def parse_leak_data(blocked_media, proxy: dict, model: leak_extractor_interface) -> tuple: default_data_model = leak_data_model( cards_data=[], contact_link=model.contact_page(), @@ -42,6 +44,15 @@ def parse_leak_data(proxy: dict, model:leak_extractor_interface) -> tuple: content_type=["leak"] ) + def get_block_resources(route): + request_url = route.request.url.lower() + + if any(request_url.startswith(scheme) for scheme in ["data:image", "data:video", "data:audio"]) or \ + route.request.resource_type in ["image", "media", "font", "stylesheet"]: + route.abort() + else: + route.continue_() + raw_parse_mapping = {} timeout_flag = {"value": False} browser = None @@ -52,6 +63,7 @@ def terminate_browser(): if browser: try: print("Timeout reached. Closing browser and terminating tasks.") + browser.close() except Exception: pass @@ -63,21 +75,23 @@ def terminate_browser(): browser = p.chromium.launch(proxy=proxy, headless=False) context = browser.new_context() - context.set_default_timeout(60000) - context.set_default_navigation_timeout(60000) + context.set_default_timeout(600000) + context.set_default_navigation_timeout(600000) timeout_timer = Timer(model.rule_config.m_timeout, terminate_browser) timeout_timer.start() try: page = context.new_page() + if blocked_media: + page.route("**/*", get_block_resources) + def capture_response(response): if response.request.resource_type == "document" and response.ok: try: - cc = response.text() raw_parse_mapping[response.url] = response.text() - print("parsed : " + response.url) - except Exception as ex: + print("Parsed:", response.url) + except Exception: pass page.on("response", capture_response) @@ -86,21 +100,33 @@ def capture_response(response): if timeout_flag["value"]: raise TimeoutException("Timeout occurred during navigation.") + page.evaluate(""" + document.querySelectorAll('*').forEach(el => { + if (el.src && el.src.startsWith('data:image')) el.remove(); + if (el.src && el.src.startsWith('data:video')) el.remove(); + if (el.src && el.src.startsWith('data:audio')) el.remove(); + if (el.href && el.href.startsWith('data:')) el.remove(); + if (el.innerHTML.includes('data:image') || el.innerHTML.includes('data:video')) el.remove(); + }); + """) + model.soup = BeautifulSoup(page.content(), 'html.parser') raw_parse_mapping[page.url] = page.content() + model.parse_leak_data(page) except Exception as e: - pass + error_traceback = traceback.format_exc() + print(f"TRACEBACK: {error_traceback}") finally: timeout_timer.cancel() - except Exception as e: - print(f"Unexpected Error: {e}") + except Exception as _: + error_traceback = traceback.format_exc() + print(f"TRACEBACK: {error_traceback}") default_data_model.cards_data = model.card_data return default_data_model, raw_parse_mapping - async def get_proxy(use_proxy=True) -> Dict[str, str]: if use_proxy: proxies = {"server": "socks5://127.0.0.1:9150"} diff --git a/shared_collector/main.py b/shared_collector/main.py index 
84e327c..649f087 100644 --- a/shared_collector/main.py +++ b/shared_collector/main.py @@ -1,10 +1,9 @@ from crawler.request_manager import check_services_status, parse_leak_data -from shared_collector.scripts._nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad import \ - _nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad +from shared_collector.scripts._zone_xsec import _zone_xsec check_services_status() if __name__ == "__main__": - parse_sample = _nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad() - parsed_data, raw_parse_mapping = parse_leak_data({"server": "socks5://127.0.0.1:9150"}, parse_sample) + parse_sample = _zone_xsec() + parsed_data, raw_parse_mapping = parse_leak_data(blocked_media=True, proxy={"server": "socks5://127.0.0.1:9150"}, model=parse_sample) print(parsed_data) diff --git a/shared_collector/scripts/_3ev4metjirohtdpshsqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.py b/shared_collector/scripts/_3ev4metjirohtdpshsqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.py index 9d8259b..cf88d94 100644 --- a/shared_collector/scripts/_3ev4metjirohtdpshsqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.py +++ b/shared_collector/scripts/_3ev4metjirohtdpshsqlkrqcmxq6zu3d7obrdhglpy5jpbr7whmlfgqd.py @@ -76,14 +76,13 @@ def parse_leak_data(self, page: Page): m_url=self.seed_url, m_base_url=self.base_url, m_content=body_text, - m_network=helper_method.get_network_type(self.base_url).value, + m_network=helper_method.get_network_type(self.base_url), m_important_content=body_text, m_weblink=[self.seed_url], m_dumplink=dump_links, m_email_addresses=helper_method.extract_emails(body_text), m_phone_numbers=helper_method.extract_phone_numbers(body_text), - m_extra_tags=[], - m_content_type="organization", + m_content_type="leaks", ) self._card_data.append(card_data) diff --git a/shared_collector/scripts/_darkfeed.py b/shared_collector/scripts/_darkfeed.py index dc7abad..0e40118 100644 --- a/shared_collector/scripts/_darkfeed.py +++ b/shared_collector/scripts/_darkfeed.py @@ -1,67 +1,81 @@ from abc import ABC -from typing import List, Tuple, Set +from datetime import datetime +from typing import List from bs4 import BeautifulSoup -from crawler.crawler_instance.local_interface_model.collector_interface import collector_interface -from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig -from crawler.crawler_instance.local_shared_model.leak_data_model import leak_data_model +from playwright.sync_api import Page +from crawler.crawler_instance.local_interface_model.leak_extractor_interface import leak_extractor_interface from crawler.crawler_instance.local_shared_model.card_extraction_model import card_extraction_model -from datetime import datetime - - -class _darkfeed(collector_interface, ABC): - _instance = None - - def __init__(self): - self.soup = None - self.extracted_data: List[card_extraction_model] = [] - - def __new__(cls): - if cls._instance is None: - cls._instance = super(_darkfeed, cls).__new__(cls) - return cls._instance - - @property - def base_url(self) -> str: - return "https://darkfeed.io" - - @property - def rule_config(self) -> RuleModel: - return RuleModel(m_fetch_proxy=FetchProxy.NONE, m_fetch_config=FetchConfig.SELENIUM) - - def parse_leak_data(self, html_content: str, p_data_url: str) -> Tuple[leak_data_model, Set[str]]: - self.soup = BeautifulSoup(html_content, 'html.parser') - data_model = leak_data_model(cards_data=[], contact_link=self.contact_page(), base_url=self.base_url, content_type=["leak"]) - - today_date = 
datetime.today().strftime('%Y-%m-%d') - allowed_tags = {"tag-data-breach", "tag-ransomware-intelligence"} - - for article in self.soup.find_all("article", class_="elementor-post"): - classes = set(article.get("class", [])) # Convert class list to set for easy comparison - - if not allowed_tags.intersection(classes): - continue - - title_link = article.find("h3", class_="elementor-post__title").find("a") - url = title_link['href'] if title_link else None - title = title_link.get_text(strip=True) if title_link else None - - date_element = article.find("span", class_="elementor-post-date") - posted_date = date_element.get_text(strip=True) if date_element else None - - image_element = article.find("img", class_="attachment-large") - image_url = image_element['src'] if image_element else None - - if url and title and posted_date: - content_message = f"{title}, To visit or explore more visit the website: {url}" - - important_content = title - - card = card_extraction_model(m_title=title, m_url=url, m_base_url=self.base_url, m_content=content_message, m_content_type="leak", m_logo_or_images=[image_url] if image_url else [], m_last_updated=today_date, m_important_content=important_content # Add title here - ) - self.extracted_data.append(card) - data_model.cards_data.append(card) - - return data_model, set() - - def contact_page(self) -> str: - return "https://www.linkedin.com/company/darkfeed/" +from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig +from crawler.crawler_services.redis_manager.redis_controller import redis_controller +from crawler.crawler_services.redis_manager.redis_enums import REDIS_COMMANDS, CUSTOM_SCRIPT_REDIS_KEYS +from crawler.crawler_services.shared.helper_method import helper_method + +class _darkfeed(leak_extractor_interface, ABC): + _instance = None + + def __init__(self): + self._card_data = [] + self.soup = None + self._initialized = None + self._redis_instance = redis_controller() + + def __new__(cls): + if cls._instance is None: + cls._instance = super(_darkfeed, cls).__new__(cls) + cls._instance._initialized = False + return cls._instance + + @property + def seed_url(self) -> str: + return "https://darkfeed.io/threat-intelligence/" + + @property + def base_url(self) -> str: + return "https://darkfeed.io" + + @property + def rule_config(self) -> RuleModel: + return RuleModel(m_fetch_proxy=FetchProxy.NONE, m_fetch_config=FetchConfig.SELENIUM) + + @property + def card_data(self) -> List[card_extraction_model]: + return self._card_data + + def invoke_db(self, command: REDIS_COMMANDS, key: CUSTOM_SCRIPT_REDIS_KEYS, default_value) -> None: + return self._redis_instance.invoke_trigger(command, [key.value + self.__class__.__name__, default_value]) + + def contact_page(self) -> str: + return "https://darkfeed.io/aboutus/" + + def parse_leak_data(self, page: Page): + try: + self.soup = BeautifulSoup(page.content(), 'html.parser') + today_date = datetime.today().strftime('%Y-%m-%d') + + for article in self.soup.find_all("article", class_="elementor-post"): + title_link = article.find("h3", class_="elementor-post__title").find("a") + url = title_link['href'] if title_link else None + title = title_link.get_text(strip=True) if title_link else None + + date_element = article.find("span", class_="elementor-post-date") + posted_date = date_element.get_text(strip=True) if date_element else None + + if url and title and posted_date: + content_message = f"{title}, To visit or explore more visit the website: {url}" + + card_data = 
card_extraction_model( + m_title=title, + m_url=url, + m_base_url=self.base_url, + m_content=content_message, + m_network=helper_method.get_network_type(self.base_url), + m_important_content=content_message, + m_email_addresses=helper_method.extract_emails(content_message), + m_phone_numbers=helper_method.extract_phone_numbers(content_message), + m_content_type="leaks", + m_leak_date=today_date + ) + + self._card_data.append(card_data) + except Exception as ex: + print(ex) \ No newline at end of file diff --git a/shared_collector/scripts/_ddosecrets.py b/shared_collector/scripts/_ddosecrets.py index 2178756..8408b6c 100644 --- a/shared_collector/scripts/_ddosecrets.py +++ b/shared_collector/scripts/_ddosecrets.py @@ -9,6 +9,8 @@ from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig from crawler.crawler_services.redis_manager.redis_controller import redis_controller from crawler.crawler_services.redis_manager.redis_enums import REDIS_COMMANDS, CUSTOM_SCRIPT_REDIS_KEYS +from crawler.crawler_services.shared.helper_method import helper_method + class _ddosecrets(leak_extractor_interface, ABC): _instance = None @@ -120,15 +122,15 @@ def parse_leak_data(self, page: Page): m_url=article_url, m_base_url=self.base_url, m_content=content_text, - m_content_type="leak", + m_content_type="leaks", m_important_content=content_text, m_weblink=weblinks, + m_network=helper_method.get_network_type(self.base_url), m_dumplink=dumplinks, - m_extra_tags=types, - m_last_updated=published_date, + m_leak_date=published_date, m_company_name=source, m_addresses=countries, - m_services_or_products=[download_size], + m_data_size=download_size, ) self._card_data.append(card) diff --git a/shared_collector/scripts/_ebhmkoohccl45qesdbvrjqtyro2hmhkmh6vkyfyjjzfllm3ix72aqaid.py b/shared_collector/scripts/_ebhmkoohccl45qesdbvrjqtyro2hmhkmh6vkyfyjjzfllm3ix72aqaid.py index d05684f..27e0b6d 100644 --- a/shared_collector/scripts/_ebhmkoohccl45qesdbvrjqtyro2hmhkmh6vkyfyjjzfllm3ix72aqaid.py +++ b/shared_collector/scripts/_ebhmkoohccl45qesdbvrjqtyro2hmhkmh6vkyfyjjzfllm3ix72aqaid.py @@ -1,9 +1,8 @@ from abc import ABC from datetime import datetime from typing import List -import time from bs4 import BeautifulSoup -from playwright.sync_api import Page, sync_playwright +from playwright.sync_api import Page from crawler.crawler_instance.local_interface_model.leak_extractor_interface import leak_extractor_interface from crawler.crawler_instance.local_shared_model.card_extraction_model import card_extraction_model from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig @@ -119,12 +118,11 @@ def parse_leak_data(self, page: Page): m_base_url=self.base_url, m_content=content, m_websites=[], - m_important_content="", - m_content_type="leak", - m_online_activity="", + m_important_content=content, + m_network=helper_method.get_network_type(self.base_url), + m_content_type="leaks", m_email_addresses=[], m_phone_numbers=[], - m_last_updated=today_date, m_leak_date=today_date ) diff --git a/shared_collector/scripts/_handala_hack.py b/shared_collector/scripts/_handala_hack.py index 05ca661..3df6911 100644 --- a/shared_collector/scripts/_handala_hack.py +++ b/shared_collector/scripts/_handala_hack.py @@ -106,18 +106,17 @@ def parse_leak_data(self, page: Page): card_data = card_extraction_model( m_title=title, - m_network=helper_method.get_network_type(self.base_url).value, m_weblink=external_links, m_dumplink=dump_links, m_url=link, m_base_url=self.base_url, 
m_content=content, m_logo_or_images=image_urls, + m_network=helper_method.get_network_type(self.base_url), m_important_content=important_content, - m_content_type="leak", + m_content_type="leaks", m_email_addresses=helper_method.extract_emails(content), m_phone_numbers=helper_method.extract_phone_numbers(content), - m_last_updated=today_date, m_leak_date=date_time ) diff --git a/shared_collector/scripts/_mirror_h.py b/shared_collector/scripts/_mirror_h.py index f3010b8..1b82dc9 100644 --- a/shared_collector/scripts/_mirror_h.py +++ b/shared_collector/scripts/_mirror_h.py @@ -2,7 +2,7 @@ from datetime import datetime from typing import List from bs4 import BeautifulSoup -from playwright.sync_api import Page, sync_playwright +from playwright.sync_api import Page from crawler.crawler_instance.local_interface_model.leak_extractor_interface import leak_extractor_interface from crawler.crawler_instance.local_shared_model.card_extraction_model import card_extraction_model from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig @@ -38,7 +38,7 @@ def base_url(self) -> str: @property def rule_config(self) -> RuleModel: - return RuleModel(m_fetch_proxy=FetchProxy.NONE, m_fetch_config=FetchConfig.SELENIUM) + return RuleModel(m_fetch_proxy=FetchProxy.TOR, m_fetch_config=FetchConfig.SELENIUM) @property def card_data(self) -> List[card_extraction_model]: @@ -124,11 +124,9 @@ def parse_leak_data(self, page: Page): m_content=m_content_container, m_websites=[web_server] if web_server else [], m_important_content=m_important_content_container if m_important_content_container else "", - m_content_type="leak", - m_online_activity=total, + m_content_type="leaks", m_email_addresses=helper_method.extract_emails(m_content_container), m_phone_numbers=helper_method.extract_phone_numbers(m_content_container), - m_last_updated=today_date, m_leak_date=date ) diff --git a/shared_collector/scripts/_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.py b/shared_collector/scripts/_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.py index 9d49e35..d9f2c64 100644 --- a/shared_collector/scripts/_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.py +++ b/shared_collector/scripts/_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.py @@ -72,7 +72,7 @@ def parse_leak_data(self, page: Page): visited_cards.add(card_text) card.click() - page.wait_for_selector('.text-block', timeout=5000) + page.wait_for_selector('.text-block', timeout=15000) detail_html = page.content() detail_soup = BeautifulSoup(detail_html, 'html.parser') @@ -119,17 +119,17 @@ def parse_leak_data(self, page: Page): m_addresses=[address] if address != "N/A" else [], m_logo_or_images=image_urls, m_phone_numbers=[phone_number] if phone_number != "N/A" else [], - m_extra_tags=[revenue], + m_revenue = revenue, m_leak_date=date_time, m_url=page.url, m_base_url=self.base_url, m_company_name=title, - m_network=helper_method.get_network_type(self.base_url).value, + m_network=helper_method.get_network_type(self.base_url), m_important_content=content, m_dumplink=dumplinks, m_email_addresses=helper_method.extract_emails(detail_soup.text), m_industry=industry, - m_content_type="Leak", + m_content_type="leaks", )) page.go_back() diff --git a/shared_collector/scripts/_omegalock5zxwbhswbisc42o2q2i54vdulyvtqqbudqousisjgc7j7yd.py b/shared_collector/scripts/_omegalock5zxwbhswbisc42o2q2i54vdulyvtqqbudqousisjgc7j7yd.py index 6aaa221..c927abf 100644 --- 
a/shared_collector/scripts/_omegalock5zxwbhswbisc42o2q2i54vdulyvtqqbudqousisjgc7j7yd.py +++ b/shared_collector/scripts/_omegalock5zxwbhswbisc42o2q2i54vdulyvtqqbudqousisjgc7j7yd.py @@ -91,8 +91,7 @@ def parse_leak_data(self, page: Page): m_dumplink=dump_links, m_email_addresses=helper_method.extract_emails(content), m_phone_numbers=helper_method.extract_phone_numbers(content), - m_extra_tags=[], - m_content_type="organization", + m_content_type="leaks", ) self._card_data.append(card_data) diff --git a/shared_collector/scripts/_orca66hwnpciepupe5626k2ib6dds6zizjwuuashz67usjps2wehz4id.py b/shared_collector/scripts/_orca66hwnpciepupe5626k2ib6dds6zizjwuuashz67usjps2wehz4id.py index ca1923b..bd96193 100644 --- a/shared_collector/scripts/_orca66hwnpciepupe5626k2ib6dds6zizjwuuashz67usjps2wehz4id.py +++ b/shared_collector/scripts/_orca66hwnpciepupe5626k2ib6dds6zizjwuuashz67usjps2wehz4id.py @@ -122,19 +122,20 @@ def parse_leak_data(self, page: Page): card_data = card_extraction_model( m_company_name=card_title, + m_title=card_title, + m_url=self.base_url, m_weblink=[company_url] if company_url else [], m_dumplink=download_url, - m_network=helper_method.get_network_type(self.base_url).value, + m_network=helper_method.get_network_type(self.base_url), m_base_url=self.base_url, m_content=description, + m_important_content = description, m_logo_or_images=image_urls, - m_content_type="leak", - m_public_records=[number_of_files], + m_content_type="leaks", + m_data_size=number_of_files, m_email_addresses=helper_method.extract_emails(description) if description else [], m_phone_numbers=helper_method.extract_phone_numbers(description) if description else [], - m_last_updated=today_date, m_leak_date=date_of_publication, - m_extra_tags=[file_size] ) diff --git a/shared_collector/scripts/_ransomwiki.py b/shared_collector/scripts/_ransomwiki.py index 9f98225..0ac2553 100644 --- a/shared_collector/scripts/_ransomwiki.py +++ b/shared_collector/scripts/_ransomwiki.py @@ -16,20 +16,12 @@ class _ransomwiki(leak_extractor_interface, ABC): _instance = None def __init__(self): - """ - Initialize the _shared_sample class instance. - Sets up attributes for storing card data, parsing content, and interacting with Redis. - """ self._card_data = [] self.soup = None self._initialized = None self._redis_instance = redis_controller() def __new__(cls): - """ - Create a singleton instance of the _shared_sample class. - Ensures only one instance of the class is created during runtime. - """ if cls._instance is None: cls._instance = super(_ransomwiki, cls).__new__(cls) cls._instance._initialized = False @@ -37,55 +29,24 @@ def __new__(cls): @property def seed_url(self) -> str: - """ - Returns the seed URL for the data extraction process. - This is the starting point for parsing the required content. - """ return "https://ransom.wiki/" @property def base_url(self) -> str: - """ - Returns the base URL for relative URL resolution. - Used to create absolute URLs during parsing. - """ return "https://ransom.wiki/" @property def rule_config(self) -> RuleModel: - """ - Returns the configuration rules for data fetching. - Specifies the use of TOR as the proxy and Selenium as the fetching mechanism. - """ return RuleModel(m_fetch_proxy=FetchProxy.TOR, m_fetch_config=FetchConfig.SELENIUM) @property def card_data(self) -> List[card_extraction_model]: - """ - Returns the list of extracted card data models. - Stores all parsed information from the leak extraction process. 
- """ return self._card_data def invoke_db(self, command: REDIS_COMMANDS, key: CUSTOM_SCRIPT_REDIS_KEYS, default_value) -> None: - """ - Interacts with the Redis database to perform a specified command. - - Args: - command (REDIS_COMMANDS): The Redis command to execute (e.g., GET, SET). - key (CUSTOM_SCRIPT_REDIS_KEYS): The key for the operation. - default_value: The default value to use if the key is not found. - - Returns: - None - """ return self._redis_instance.invoke_trigger(command, [key.value + self.__class__.__name__, default_value]) def contact_page(self) -> str: - """ - Returns the URL of the contact page for the shared sample data source. - Useful for referencing or navigating to the contact page. - """ return "https://www.linkedin.com/in/soufianetahiri/" def parse_leak_data(self, page: Page): @@ -149,13 +110,15 @@ def parse_leak_data(self, page: Page): country = line.split(":")[-1].strip() data["Country"] = country + if victim is None: + continue self._card_data.append(card_extraction_model( m_title=victim, m_url=post_url, m_base_url=self.base_url, m_content=description, m_company_name=group, - m_network=helper_method.get_network_type(self.base_url).value, + m_network=helper_method.get_network_type(self.base_url), m_important_content=description, m_weblink=[website], m_leak_date=published, @@ -163,7 +126,6 @@ def parse_leak_data(self, page: Page): m_dumplink=[], m_email_addresses=helper_method.extract_emails(soup.text), m_phone_numbers=helper_method.extract_phone_numbers(soup.text), - m_extra_tags=[], - m_content_type="Leak", + m_content_type="leaks", )) diff --git a/shared_collector/scripts/_weg7sdx54bevnvulapqu6bpzwztryeflq3s23tegbmnhkbpqz637f2yd.py b/shared_collector/scripts/_weg7sdx54bevnvulapqu6bpzwztryeflq3s23tegbmnhkbpqz637f2yd.py index b3fada4..7560b27 100644 --- a/shared_collector/scripts/_weg7sdx54bevnvulapqu6bpzwztryeflq3s23tegbmnhkbpqz637f2yd.py +++ b/shared_collector/scripts/_weg7sdx54bevnvulapqu6bpzwztryeflq3s23tegbmnhkbpqz637f2yd.py @@ -73,13 +73,12 @@ def parse_leak_data(self, page:Page ): m_url=page_url, m_base_url=self.base_url, m_content=content, - m_network=helper_method.get_network_type(self.base_url).value, + m_network=helper_method.get_network_type(self.base_url), m_important_content=content, m_weblink=weblinks, m_dumplink=dumplinks, m_email_addresses= helper_method.extract_emails(content), m_phone_numbers= helper_method.extract_phone_numbers(content), - m_extra_tags=extra_tags, - m_content_type="organization" + m_content_type="leaks" ) self._card_data.append(card_data) diff --git a/shared_collector/scripts/_zone_xsec.py b/shared_collector/scripts/_zone_xsec.py index 7a73539..cf1200c 100644 --- a/shared_collector/scripts/_zone_xsec.py +++ b/shared_collector/scripts/_zone_xsec.py @@ -10,6 +10,8 @@ from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchProxy, FetchConfig from crawler.crawler_services.redis_manager.redis_controller import redis_controller from crawler.crawler_services.redis_manager.redis_enums import REDIS_COMMANDS, CUSTOM_SCRIPT_REDIS_KEYS +from crawler.crawler_services.shared.helper_method import helper_method + class _zone_xsec(leak_extractor_interface, ABC): _instance = None @@ -139,10 +141,9 @@ def parse_leak_data(self, page: Page): m_base_url=self.base_url, m_content=m_content_container, m_websites=[web_server] if web_server else [], + m_network=helper_method.get_network_type(self.base_url), m_important_content=m_important_content_container if m_important_content_container else "", - m_content_type="leak", - 
m_online_activity=[team], - m_last_updated=today_date, + m_content_type="leaks", m_leak_date=date )
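
Reviewer note: the net effect of the card_extraction_model changes is that m_title, m_url, m_base_url, m_content, m_important_content, m_network, and m_content_type become required, and the last two are validated in __post_init__ against VALID_NETWORK_TYPES / VALID_CONTENT_TYPES from crawler/constants/enums.py. This is why every collector in this patch switches from "leak" / "organization" to "leaks". Below is a minimal sketch of the post-patch behaviour; the field values are placeholders for illustration, not crawler output.

    from crawler.crawler_instance.local_shared_model.card_extraction_model import card_extraction_model

    # Valid card: network and content type both appear in the allowed lists.
    card = card_extraction_model(
        m_title="Example post",
        m_url="http://example.onion/post/1",
        m_base_url="http://example.onion",
        m_content="Full body text of the post",
        m_important_content="Short summary of the post",
        m_network="onion",          # must be one of VALID_NETWORK_TYPES
        m_content_type="leaks",     # must be one of VALID_CONTENT_TYPES
    )

    # The old singular value is now rejected by __post_init__.
    try:
        card_extraction_model(
            m_title="t", m_url="u", m_base_url="b",
            m_content="c", m_important_content="i",
            m_network="onion",
            m_content_type="leak",
        )
    except ValueError as err:
        print(err)

The call site in shared_collector/main.py changes accordingly: parse_leak_data now takes blocked_media as its first argument, followed by the proxy dict and the collector instance.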