From 009f139dfc3505e6d7eaa2a1653917419be6acba Mon Sep 17 00:00:00 2001 From: alphadeveloper12 <64380267+alphadeveloper12@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:38:36 +0500 Subject: [PATCH 1/2] Update _darkfeed.py --- shared_collector/scripts/_darkfeed.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/shared_collector/scripts/_darkfeed.py b/shared_collector/scripts/_darkfeed.py index f651e22..fa75fab 100644 --- a/shared_collector/scripts/_darkfeed.py +++ b/shared_collector/scripts/_darkfeed.py @@ -33,8 +33,14 @@ def parse_leak_data(self, html_content: str, p_data_url: str) -> Tuple[leak_data data_model = leak_data_model(cards_data=[], contact_link=self.contact_page(), base_url=self.base_url, content_type=["leak"]) today_date = datetime.today().strftime('%Y-%m-%d') + allowed_tags = { "tag-data-breach", "tag-ransomware-intelligence"} for article in self.soup.find_all("article", class_="elementor-post"): + classes = set(article.get("class", [])) # Convert class list to set for easy comparison + + if not allowed_tags.intersection(classes): + continue + title_link = article.find("h3", class_="elementor-post__title").find("a") url = title_link['href'] if title_link else None title = title_link.get_text(strip=True) if title_link else None @@ -48,7 +54,7 @@ def parse_leak_data(self, html_content: str, p_data_url: str) -> Tuple[leak_data if url and title and posted_date: content_message = f"{title}, To visit or explore more visit the website: {url}" - card = card_extraction_model(m_title=title, m_url=url, m_base_url=self.base_url, m_content=content_message, m_content_type="leak", m_logo_or_images=[image_url] if image_url else [], m_last_updated=today_date) + card = card_extraction_model(m_title=title, m_url=url, m_base_url=self.base_url, m_content=content_message, m_content_type="leak", m_logo_or_images=[image_url] if image_url else [], m_last_updated=today_date, ) self.extracted_data.append(card) data_model.cards_data.append(card) From ec14f74b4366ca19092fdb1e98fc28ab32a6e8b2 Mon Sep 17 00:00:00 2001 From: alphadeveloper12 <64380267+alphadeveloper12@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:06:42 +0500 Subject: [PATCH 2/2] Update _darkfeed.py --- shared_collector/scripts/_darkfeed.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/shared_collector/scripts/_darkfeed.py b/shared_collector/scripts/_darkfeed.py index fa75fab..dc7abad 100644 --- a/shared_collector/scripts/_darkfeed.py +++ b/shared_collector/scripts/_darkfeed.py @@ -33,7 +33,7 @@ def parse_leak_data(self, html_content: str, p_data_url: str) -> Tuple[leak_data data_model = leak_data_model(cards_data=[], contact_link=self.contact_page(), base_url=self.base_url, content_type=["leak"]) today_date = datetime.today().strftime('%Y-%m-%d') - allowed_tags = { "tag-data-breach", "tag-ransomware-intelligence"} + allowed_tags = {"tag-data-breach", "tag-ransomware-intelligence"} for article in self.soup.find_all("article", class_="elementor-post"): classes = set(article.get("class", [])) # Convert class list to set for easy comparison @@ -54,7 +54,10 @@ def parse_leak_data(self, html_content: str, p_data_url: str) -> Tuple[leak_data if url and title and posted_date: content_message = f"{title}, To visit or explore more visit the website: {url}" - card = card_extraction_model(m_title=title, m_url=url, m_base_url=self.base_url, m_content=content_message, m_content_type="leak", m_logo_or_images=[image_url] if image_url else [], m_last_updated=today_date, ) + important_content = title + + card = card_extraction_model(m_title=title, m_url=url, m_base_url=self.base_url, m_content=content_message, m_content_type="leak", m_logo_or_images=[image_url] if image_url else [], m_last_updated=today_date, m_important_content=important_content # Add title here + ) self.extracted_data.append(card) data_model.cards_data.append(card)