Commit
Merge remote-tracking branch 'origin/_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad'
Showing 1 changed file with 13 additions and 44 deletions.
@@ -16,80 +16,42 @@ class _nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad(leak_extractor_i
    _instance = None

    def __init__(self):
        """
        Initialize the _shared_sample class instance.
        Sets up attributes for storing card data, parsing content, and interacting with Redis.
        """
        self._card_data = []
        self.soup = None
        self._initialized = None
        self._redis_instance = redis_controller()

    def __new__(cls):
        """
        Create a singleton instance of the _shared_sample class.
        Ensures only one instance of the class is created during runtime.
        """
        if cls._instance is None:
            cls._instance = super(_nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    @property
    def seed_url(self) -> str:
        """
        Returns the seed URL for the data extraction process.
        This is the starting point for parsing the required content.
        """
-       return "http://nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.onion/"
+       return "http://nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.onion"

    @property
    def base_url(self) -> str:
        """
        Returns the base URL for relative URL resolution.
        Used to create absolute URLs during parsing.
        """
-       return "http://nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.onion/"
+       return "http://nerqnacjmdy3obvevyol7qhazkwkv57dwqvye5v46k5bcujtfa6sduad.onion"

    @property
    def rule_config(self) -> RuleModel:
        """
        Returns the configuration rules for data fetching.
        Specifies the use of TOR as the proxy and Selenium as the fetching mechanism.
        """
        return RuleModel(m_fetch_proxy=FetchProxy.TOR, m_fetch_config=FetchConfig.SELENIUM)

    @property
    def card_data(self) -> List[card_extraction_model]:
        """
        Returns the list of extracted card data models.
        Stores all parsed information from the leak extraction process.
        """
        return self._card_data

    def invoke_db(self, command: REDIS_COMMANDS, key: CUSTOM_SCRIPT_REDIS_KEYS, default_value) -> None:
        """
        Interacts with the Redis database to perform a specified command.
        Args:
            command (REDIS_COMMANDS): The Redis command to execute (e.g., GET, SET).
            key (CUSTOM_SCRIPT_REDIS_KEYS): The key for the operation.
            default_value: The default value to use if the key is not found.
        Returns:
            None
        """
        return self._redis_instance.invoke_trigger(command, [key.value + self.__class__.__name__, default_value])

    def contact_page(self) -> str:
        """
        Returns the URL of the contact page for the shared sample data source.
        Useful for referencing or navigating to the contact page.
        """
        return "[email protected]"

    def parse_leak_data(self, page: Page):
        visited_pages = set()
        visited_cards = set()

        while True:
            current_url = page.url
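The `__new__` override in the hunk above implements the singleton its docstring describes: constructing the class repeatedly hands back one shared object, with `_initialized` set aside as a flag for one-time setup. A minimal standalone sketch of the same pattern, including the guard such a flag usually supports (the class and attribute names below are illustrative, not from this repository):

class singleton_extractor:
    """Illustrative singleton: the class only ever hands out one instance."""
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if self._initialized:          # already set up by an earlier construction
            return
        self._card_data = []
        self._initialized = True


first = singleton_extractor()
second = singleton_extractor()
assert first is second                 # both names refer to the same shared object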
@@ -102,6 +64,12 @@ def parse_leak_data(self, page: Page):
            card_links = page.locator('.card').all()

            for card in card_links:
                card_text = card.inner_text()

                if card_text in visited_cards:
                    continue

                visited_cards.add(card_text)
                card.click()

                page.wait_for_selector('.text-block', timeout=5000)
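Review note on this hunk: each `.card` locator is keyed by its visible text so the same listing is never processed twice, then clicked through to a detail view that is awaited via `.text-block`. Roughly the same Playwright pattern, pulled out of the class as a self-contained sketch (the URL is a placeholder and the Tor proxying configured by `rule_config` is omitted):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com/listing")         # placeholder URL, not the real seed

    visited_cards = set()
    for card in page.locator(".card").all():
        card_text = card.inner_text()
        if card_text in visited_cards:               # skip listings already handled
            continue
        visited_cards.add(card_text)

        card.click()                                 # open the detail view
        page.wait_for_selector(".text-block", timeout=5000)
        # ... detail parsing would happen here ...
        page.go_back()                               # return to the listing before the next card
        page.wait_for_selector(".card", timeout=5000)

    browser.close()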
@@ -110,9 +78,7 @@ def parse_leak_data(self, page: Page):
                detail_soup = BeautifulSoup(detail_html, 'html.parser')

                title = detail_soup.select_one('.title').text.strip() if detail_soup.select_one('.title') else "N/A"

                content = detail_soup.select_one('.desc').text.strip() if detail_soup.select_one('.desc') else "N/A"

                website_elem = detail_soup.select_one('.desc a')
                website = website_elem['href'].strip() if website_elem else "N/A"
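The `select_one(...) ... else "N/A"` conditionals in this hunk all perform the same null-safe lookup; a small helper (the name `text_or_default` is illustrative, not part of the repository) makes the idiom explicit:

from bs4 import BeautifulSoup

def text_or_default(soup: BeautifulSoup, selector: str, default: str = "N/A") -> str:
    """Return the stripped text of the first match, or a default when nothing matches."""
    node = soup.select_one(selector)
    return node.text.strip() if node else default

detail_soup = BeautifulSoup("<div class='title'> Acme Corp </div>", "html.parser")
print(text_or_default(detail_soup, ".title"))   # Acme Corp
print(text_or_default(detail_soup, ".desc"))    # N/A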
@@ -144,6 +110,8 @@ def parse_leak_data(self, page: Page):

                date_time = detail_soup.select_one('.date').text.strip() if detail_soup.select_one('.date') else "N/A"

+               dumplinks = [a['href'].strip() for a in detail_soup.find_all('a', href=True) if ".onion" in a['href']]
+
                self._card_data.append(card_extraction_model(
                    m_title=title,
                    m_content=content,
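The added `dumplinks` comprehension is the substantive change here: it collects every anchor whose href contains `.onion`, and the list is wired into `m_dumplink` in the next hunk. A self-contained sketch of the same extraction over made-up markup:

from bs4 import BeautifulSoup

html = """
<div class="desc">
  <a href="http://example-victim.com">victim site</a>
  <a href="http://dumpsiteexample1.onion/part1.zip">part 1</a>
  <a href="http://dumpsiteexample2.onion/part2.zip">part 2</a>
</div>
"""
detail_soup = BeautifulSoup(html, "html.parser")

# Same filter as the added line: keep only hrefs that point at a .onion address.
dumplinks = [a['href'].strip() for a in detail_soup.find_all('a', href=True) if ".onion" in a['href']]
print(dumplinks)
# ['http://dumpsiteexample1.onion/part1.zip', 'http://dumpsiteexample2.onion/part2.zip']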
@@ -158,7 +126,7 @@ def parse_leak_data(self, page: Page):
                    m_company_name=title,
                    m_network=helper_method.get_network_type(self.base_url).value,
                    m_important_content=content,
-                   m_dumplink=[],
+                   m_dumplink=dumplinks,
                    m_email_addresses=helper_method.extract_emails(detail_soup.text),
                    m_industry=industry,
                    m_content_type="Leak",
@@ -174,3 +142,4 @@ def parse_leak_data(self, page: Page):
                page.wait_for_selector('.card', timeout=5000)
            else:
                break
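One further note on `invoke_db`, which is unchanged by this commit but visible in the first hunk: the Redis key is built as `key.value + self.__class__.__name__`, so each extractor class writes into its own namespace under a shared prefix. `redis_controller` and `invoke_trigger` are project internals not shown in this diff; the sketch below reproduces only the namespacing idea with the plain `redis` client, so every name and call here is an assumption rather than the repository's actual API:

import redis

class example_extractor:
    """Hypothetical extractor used only to show the per-class key namespacing."""

    def __init__(self):
        # Assumes a local Redis instance; the real project routes this through redis_controller.
        self._redis = redis.Redis(host="localhost", port=6379, decode_responses=True)

    def save_state(self, key_prefix: str, value: str) -> None:
        # Key = prefix + class name (e.g. "last_url_example_extractor"): one slot per extractor class.
        self._redis.set(key_prefix + self.__class__.__name__, value)

    def load_state(self, key_prefix: str, default_value: str = "") -> str:
        stored = self._redis.get(key_prefix + self.__class__.__name__)
        return stored if stored is not None else default_value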