-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Preetham Kamidi <[email protected]>
- Loading branch information
Showing
7 changed files
with
106 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,54 @@ | ||
from re import search | ||
from datetime import timezone | ||
from dateutil.parser import parse | ||
|
||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
|
||
from app.logger.logger import logger | ||
from app.config.config import app_config | ||
|
||
|
||
count_vectorizer = CountVectorizer() | ||
|
||
def get_entities(text: str):
    """Pull the author handle, tweet body and timestamp out of processed text.

    The text is expected to contain an ``@handle`` and a timestamp of the
    form ``H:MM AM - D Month YYYY``; the tweet body is taken from the span
    between the two.

    Args:
        text: Processed text to scan.

    Returns:
        ``{}`` when ``text`` is empty. Otherwise a dict with the keys
        ``'user_id'`` (handle without the leading ``@``), ``'tweet'`` and
        ``'date'`` (a timezone-aware ``datetime`` stamped as UTC). When the
        handle or timestamp cannot be found or parsed, every value is
        ``None``; that result also carries the legacy ``'datetime'`` key so
        existing readers of either key keep working.
    """
    if not text:
        return {}
    logger.info('Parsing data out of processed text...')

    # Same keys as the success result, plus the historical 'datetime' key.
    not_found = {
        'user_id': None,
        'tweet': None,
        'datetime': None,
        'date': None,
    }

    username_match = search(r'@(\w{1,15})\b', text)
    datetime_match = search(
        r'((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))\s-\s\d{1,2}\s\w+\s\d{4}', text)
    if not username_match or not datetime_match:
        return not_found

    # Drop the dash separating the time from the date before parsing.
    date_str = datetime_match.group().replace('-', '')
    try:
        # The matched text carries no zone information; it is stamped as UTC.
        processed_datetime = parse(date_str).replace(tzinfo=timezone.utc)
    except (ValueError, OverflowError):
        # The regex accepts any word as the month, so parsing can still fail.
        return not_found

    # Group 1 is the handle without the leading '@'.
    user_id = username_match.group(1)
    username_end_index = username_match.end()
    date_start_index = datetime_match.start()
    # NOTE(review): the +5 offset skips a fixed number of characters after
    # the handle; its origin is not visible here -- confirm against the
    # text layout produced upstream.
    tweet = text[username_end_index + 5:date_start_index].strip()
    return {
        'user_id': user_id,
        'tweet': tweet,
        'date': processed_datetime,
    }
|
||
|
||
def get_similarity(processed_tweet: str, same_day_tweets: list):
    """Tell whether a tweet closely matches any tweet from the same day.

    The candidate and the reference tweets are vectorized together with the
    module-level ``count_vectorizer`` and the candidate is compared against
    each reference tweet by cosine similarity.

    Args:
        processed_tweet: Text of the tweet being checked.
        same_day_tweets: Texts of the tweets to compare against.

    Returns:
        ``[]`` when either argument is empty (kept for backward
        compatibility; it is falsy like ``False``). Otherwise ``True`` if
        any reference tweet scores above
        ``app_config.SIMILARITY_THRESHOLD``, else ``False``.
    """
    if not processed_tweet or not same_day_tweets:
        return []
    logger.info('Processing similarity of two tweets...')
    corpus = [processed_tweet, *same_day_tweets]
    logger.info('Corpus: ' + str(corpus))
    sparse_matrix = count_vectorizer.fit_transform(corpus)
    # Compare row 0 (the candidate) with the remaining rows only. Comparing
    # the whole matrix with itself would include every text's similarity to
    # itself and the similarities among the reference tweets, which says
    # nothing about the candidate.
    scores = cosine_similarity(sparse_matrix[0:1], sparse_matrix[1:])
    logger.info('Similarity scores: ' + str(scores))
    return bool((scores > app_config.SIMILARITY_THRESHOLD).any())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from dateutil.parser import parse | ||
|
||
|
||
from app.logger.logger import logger | ||
from app.config.config import app_config | ||
from app.util.date_checker import valid, format_for_date | ||
from app.services.search import search | ||
|
||
|
||
def search_controller(entities):
    """Collect the author's tweets posted on the same day as the parsed tweet.

    Args:
        entities: Dict with at least ``'user_id'`` and ``'date'`` (a
            datetime), as produced by the entity parser.

    Returns:
        ``{}`` when ``entities`` is empty or lacks a usable ``'user_id'`` or
        ``'date'``. Otherwise a list with the text of every returned tweet
        whose calendar day equals that of ``entities['date']`` and which
        passes ``valid``.
    """
    # Without a handle there is nothing to query, and without a date there
    # is nothing to match against, so bail out before calling the API.
    if not entities or not entities.get('user_id') or not entities.get('date'):
        return {}
    logger.info('Searching for tweet using Twitter API...')
    querystring = {
        app_config.TWEET_USERNAME_KEY: entities['user_id'],
        app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT,
    }
    response = search.search_results(querystring)
    # The target day is the same for every entry; compute it once.
    target_day = format_for_date(entities['date'])
    same_day_tweets = []
    # NOTE(review): `or []` tolerates a None response -- confirm what
    # search_results returns on failure.
    for entry in response or []:
        tweet_date = parse(entry[app_config.TWEET_DATE_KEY])
        if format_for_date(tweet_date) != target_day or not valid(tweet_date):
            continue
        tweet_text = entry[app_config.TWEET_TEXT_KEY]
        logger.info('Tweet found...: ' + str(tweet_text))
        same_day_tweets.append(tweet_text)
    return same_day_tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from datetime import datetime, timezone | ||
|
||
from app.config.config import app_config | ||
from app.logger.logger import logger | ||
|
||
|
||
def valid(processed_date):
    """Check that a datetime is not older than the configured maximum age.

    Args:
        processed_date: Datetime to check. A naive value is treated as UTC.

    Returns:
        ``False`` when ``processed_date`` is empty or lies more than
        ``app_config.TWEET_MAX_OLD`` whole days before the current UTC time;
        ``True`` otherwise.
    """
    if not processed_date:
        return False
    # Aware and naive datetimes cannot be subtracted from one another, so a
    # naive input is stamped as UTC before the comparison.
    if processed_date.utcoffset() is None:
        processed_date = processed_date.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - processed_date
    return age.days <= app_config.TWEET_MAX_OLD
|
||
def format_for_date(tweet_datetime: datetime):
    """Render a datetime as its calendar day in ``YYYY-MM-DD`` form."""
    day_pattern = '%Y-%m-%d'
    return tweet_datetime.strftime(day_pattern)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters