code optimizations
msmannan00 committed Feb 8, 2025
1 parent 5094d3e commit 76bc32e
Showing 18 changed files with 238 additions and 191 deletions.
48 changes: 41 additions & 7 deletions crawler/constants/enums.py
@@ -1,8 +1,42 @@
from enum import Enum
VALID_NETWORK_TYPES = [
"clearnet",
"i2p",
"onion",
"invalid"
]


class network_type(Enum):
CLEARNET = "clearnet"
I2P = "i2p"
ONION = "onion"
INVALID = "invalid"
VALID_CONTENT_TYPES = [
"general",
"forums",
"news",
"stolen",
"drugs",
"hacking",
"marketplaces",
"cryptocurrency",
"leaks",
"adult",
"carding",
"scams",
"ransomware",
"databases",
"money_laundering",
"counterfeit",
"malware",
"botnets",
"exploits",
"spam",
"chemicals",
"weapons",
"human_trafficking",
"csam",
"doxing",
"extortion",
"espionage",
"propaganda",
"terrorism",
"government_leaks",
"c2_panels",
"ddos",
"apt"
]
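
The enum-based network_type is replaced by plain module-level lists, so callers can validate values with simple membership checks instead of enum lookups. A minimal sketch of how such checks might look; the validate_network and is_known_content_type helpers below are illustrative and not part of the commit:

from crawler.constants.enums import VALID_NETWORK_TYPES, VALID_CONTENT_TYPES

# Hypothetical helpers showing how the lists could back validation.
def validate_network(value: str) -> str:
    # Fall back to "invalid" rather than raising, mirroring the string-based style.
    return value if value in VALID_NETWORK_TYPES else "invalid"

def is_known_content_type(value: str) -> bool:
    return value in VALID_CONTENT_TYPES

print(validate_network("onion"))       # onion
print(validate_network("ipfs"))        # invalid
print(is_known_content_type("leaks"))  # True
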
@@ -1,35 +1,43 @@
from dataclasses import dataclass, field
from typing import List, Optional

from crawler.constants.enums import VALID_NETWORK_TYPES, VALID_CONTENT_TYPES


@dataclass
class card_extraction_model:
m_title: str = ""
m_url: str = ""
m_content: str = ""
m_base_url: str = ""
m_network: str = ""
m_important_content: str = ""
m_content_type: str = "general"
m_title: str
m_url: str
m_base_url: str
m_content: str
m_important_content: str
m_network: str
m_content_type: str
m_weblink: List[str] = field(default_factory=list)
m_dumplink: List[str] = field(default_factory=list)
m_extra_tags: List[str] = field(default_factory=list)
m_sections: List[str] = field(default_factory=list)
m_name: str = ""
m_email_addresses: List[str] = field(default_factory=list)
m_industry: Optional[str] = None
m_phone_numbers: List[str] = field(default_factory=list)
m_addresses: List[str] = field(default_factory=list)
m_social_media_profiles: List[str] = field(default_factory=list)
m_websites: List[str] = field(default_factory=list)
m_company_name: Optional[str] = None
m_industry: Optional[str] = None
m_job_title_or_position: Optional[str] = None
m_associated_entities: List[str] = field(default_factory=list)
m_aliases_or_alternate_names: List[str] = field(default_factory=list)
m_logo_or_images: List[str] = field(default_factory=list)
m_business_categories: List[str] = field(default_factory=list)
m_services_or_products: List[str] = field(default_factory=list)
m_public_records: List[str] = field(default_factory=list)
m_online_activity: List[str] = field(default_factory=list)
m_leak_date: Optional[str] = None
m_last_updated: Optional[str] = None
m_data_size: Optional[str] = None
m_country_name: Optional[str] = None
m_revenue: Optional[str] = None

def __post_init__(self):
required_fields = ["m_title", "m_url", "m_content", "m_base_url", "m_important_content"]

for field_name in required_fields:
if getattr(self, field_name) is None:
raise ValueError(f"The field '{field_name}' is required and cannot be None.")

if self.m_network not in VALID_NETWORK_TYPES:
raise ValueError(f"Invalid network type provided: {self.m_network}. Must be one of {', '.join(VALID_NETWORK_TYPES)}.")

if self.m_content_type not in VALID_CONTENT_TYPES:
raise ValueError(f"Invalid content type provided: {self.m_content_type}. Must be one of {', '.join(VALID_CONTENT_TYPES)}.")
2 changes: 1 addition & 1 deletion crawler/crawler_instance/local_shared_model/rule_model.py
@@ -9,7 +9,7 @@ class FetchProxy(str, Enum):
NONE = "none"

class RuleModel:
def __init__(self, m_timeout: int = 7200, m_fetch_config: FetchConfig = FetchConfig.SELENIUM, m_fetch_proxy: FetchProxy = FetchProxy.NONE):
def __init__(self, m_timeout: int = 17200, m_fetch_config: FetchConfig = FetchConfig.SELENIUM, m_fetch_proxy: FetchProxy = FetchProxy.NONE):
self.m_timeout = m_timeout
self.m_fetch_config = m_fetch_config
self.m_fetch_proxy = m_fetch_proxy
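
The default timeout grows from 7200 to 17200 seconds; collectors needing a different budget can still pass their own value. A small sketch using only the members visible in this diff, and assuming FetchConfig is defined alongside FetchProxy in rule_model.py:

from crawler.crawler_instance.local_shared_model.rule_model import RuleModel, FetchConfig, FetchProxy

# Relies on the new default m_timeout of 17200 seconds.
default_rules = RuleModel()

# Explicit override for a faster-failing crawl.
quick_rules = RuleModel(m_timeout=600, m_fetch_config=FetchConfig.SELENIUM, m_fetch_proxy=FetchProxy.NONE)
print(default_rules.m_timeout, quick_rules.m_timeout)  # 17200 600
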
22 changes: 15 additions & 7 deletions crawler/crawler_services/shared/helper_method.py
@@ -1,7 +1,7 @@
# Local Imports
import re
from urllib.parse import urlparse
from crawler.constants.enums import network_type
from bs4 import BeautifulSoup

class helper_method:

@@ -12,20 +12,20 @@ def clean_text(text: str) -> str:
return text

@staticmethod
def get_network_type(url:str):
def get_network_type(url: str):
try:
if not url.startswith("http"):
url = "http://" + url
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
return network_type.INVALID
return "invalid"
if re.search(r"\.onion$", parsed_url.netloc, re.IGNORECASE):
return network_type.ONION
return "onion"
if re.search(r"\.i2p$", parsed_url.netloc, re.IGNORECASE):
return network_type.I2P
return network_type.CLEARNET
return "i2p"
return "clearnet"
except Exception:
return network_type.INVALID
return "invalid"

@staticmethod
def extract_emails(text: str) -> list:
@@ -41,3 +41,11 @@ def extract_phone_numbers(text: str) -> list:
phone_numbers = re.findall(phone_pattern, text)
return phone_numbers

@staticmethod
def extract_text_from_html(html: str) -> str:
"""
Extracts and cleans text from an HTML string using BeautifulSoup.
"""
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(separator=' ')
return helper_method.clean_text(text)
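
get_network_type now returns plain strings ("onion", "i2p", "clearnet", "invalid") instead of network_type enum members, and the new extract_text_from_html wraps BeautifulSoup plus clean_text. A minimal sketch, assuming bs4 is installed and clean_text leaves ordinary text intact:

from crawler.crawler_services.shared.helper_method import helper_method

print(helper_method.get_network_type("http://example.onion/page"))  # onion
print(helper_method.get_network_type("forum.i2p"))                  # i2p
print(helper_method.get_network_type("https://example.com"))        # clearnet

html = "<html><body><h1>Leak</h1><p>contact: admin@example.com</p></body></html>"
text = helper_method.extract_text_from_html(html)
# Addresses survive extraction provided clean_text does not strip them.
print(helper_method.extract_emails(text))
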
46 changes: 36 additions & 10 deletions crawler/request_manager.py
@@ -1,4 +1,5 @@
import sys
import traceback
from threading import Timer

import redis
@@ -34,14 +35,24 @@ def check_services_status():
print(f"Error: Redis server is not running or accessible. Details: {ex}")
sys.exit(1)

def parse_leak_data(proxy: dict, model:leak_extractor_interface) -> tuple:

def parse_leak_data(blocked_media, proxy: dict, model: leak_extractor_interface) -> tuple:
default_data_model = leak_data_model(
cards_data=[],
contact_link=model.contact_page(),
base_url=model.base_url,
content_type=["leak"]
)

def get_block_resources(route):
request_url = route.request.url.lower()

if any(request_url.startswith(scheme) for scheme in ["data:image", "data:video", "data:audio"]) or \
route.request.resource_type in ["image", "media", "font", "stylesheet"]:
route.abort()
else:
route.continue_()

raw_parse_mapping = {}
timeout_flag = {"value": False}
browser = None
@@ -52,6 +63,7 @@ def terminate_browser():
if browser:
try:
print("Timeout reached. Closing browser and terminating tasks.")
browser.close()
except Exception:
pass

@@ -63,21 +75,23 @@ def terminate_browser():
browser = p.chromium.launch(proxy=proxy, headless=False)

context = browser.new_context()
context.set_default_timeout(60000)
context.set_default_navigation_timeout(60000)
context.set_default_timeout(600000)
context.set_default_navigation_timeout(600000)
timeout_timer = Timer(model.rule_config.m_timeout, terminate_browser)
timeout_timer.start()

try:
page = context.new_page()

if blocked_media:
page.route("**/*", get_block_resources)

def capture_response(response):
if response.request.resource_type == "document" and response.ok:
try:
cc = response.text()
raw_parse_mapping[response.url] = response.text()
print("parsed : " + response.url)
except Exception as ex:
print("Parsed:", response.url)
except Exception:
pass

page.on("response", capture_response)
@@ -86,21 +100,33 @@ def capture_response(response):
if timeout_flag["value"]:
raise TimeoutException("Timeout occurred during navigation.")

page.evaluate("""
document.querySelectorAll('*').forEach(el => {
if (el.src && el.src.startsWith('data:image')) el.remove();
if (el.src && el.src.startsWith('data:video')) el.remove();
if (el.src && el.src.startsWith('data:audio')) el.remove();
if (el.href && el.href.startsWith('data:')) el.remove();
if (el.innerHTML.includes('data:image') || el.innerHTML.includes('data:video')) el.remove();
});
""")

model.soup = BeautifulSoup(page.content(), 'html.parser')
raw_parse_mapping[page.url] = page.content()

model.parse_leak_data(page)
except Exception as e:
pass
error_traceback = traceback.format_exc()
print(f"TRACEBACK: {error_traceback}")
finally:
timeout_timer.cancel()

except Exception as e:
print(f"Unexpected Error: {e}")
except Exception as _:
error_traceback = traceback.format_exc()
print(f"TRACEBACK: {error_traceback}")

default_data_model.cards_data = model.card_data
return default_data_model, raw_parse_mapping


async def get_proxy(use_proxy=True) -> Dict[str, str]:
if use_proxy:
proxies = {"server": "socks5://127.0.0.1:9150"}
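
The new blocked_media path registers get_block_resources on every request via page.route, aborting images, media, fonts, stylesheets, and data: URLs before they load. A standalone sketch of the same Playwright pattern outside the crawler; the URL is a placeholder:

from playwright.sync_api import sync_playwright

BLOCKED_TYPES = {"image", "media", "font", "stylesheet"}

def block_heavy_resources(route):
    request_url = route.request.url.lower()
    if request_url.startswith(("data:image", "data:video", "data:audio")) or \
            route.request.resource_type in BLOCKED_TYPES:
        route.abort()
    else:
        route.continue_()

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_context().new_page()
    # Same hook the crawler installs when blocked_media is set.
    page.route("**/*", block_heavy_resources)
    page.goto("https://example.com")  # placeholder URL
    print(len(page.content()))
    browser.close()
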
7 changes: 3 additions & 4 deletions shared_collector/main.py
@@ -1,10 +1,9 @@
from crawler.request_manager import check_services_status, parse_leak_data
from shared_collector.scripts._nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad import \
_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad
from shared_collector.scripts._zone_xsec import _zone_xsec

check_services_status()

if __name__ == "__main__":
parse_sample = _nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad()
parsed_data, raw_parse_mapping = parse_leak_data({"server": "socks5://127.0.0.1:9150"}, parse_sample)
parse_sample = _zone_xsec()
parsed_data, raw_parse_mapping = parse_leak_data(blocked_media=True, proxy={"server": "socks5://127.0.0.1:9150"}, model=parse_sample)
print(parsed_data)
@@ -76,14 +76,13 @@ def parse_leak_data(self, page: Page):
m_url=self.seed_url,
m_base_url=self.base_url,
m_content=body_text,
m_network=helper_method.get_network_type(self.base_url).value,
m_network=helper_method.get_network_type(self.base_url),
m_important_content=body_text,
m_weblink=[self.seed_url],
m_dumplink=dump_links,
m_email_addresses=helper_method.extract_emails(body_text),
m_phone_numbers=helper_method.extract_phone_numbers(body_text),
m_extra_tags=[],
m_content_type="organization",
m_content_type="leaks",
)

self._card_data.append(card_data)