Commit c33a085: Add files via upload (LQ6H, Apr 28, 2020; parent 9cd75c9)
Showing 48 changed files with 1,630 additions and 0 deletions.
@@ -0,0 +1,297 @@
import twitter # pip install twitter

def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation.

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Sample usage
twitter_api = oauth_login()

# Nothing to see by displaying twitter_api except that it's now a
# defined variable

print(twitter_api)


import json


def twitter_trends(twitter_api, woe_id):
    # Prefix the ID with an underscore for query string parameterization.
    # Without the underscore, the twitter package appends the ID value
    # to the URL itself as a special-case keyword argument.
    return twitter_api.trends.place(_id=woe_id)
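
# Sample usage: a minimal sketch. The Yahoo! Where On Earth ID for the
# entire world is 1; json is used only to pretty-print the response.
WORLD_WOE_ID = 1
world_trends = twitter_trends(twitter_api, WORLD_WOE_ID)
print(json.dumps(world_trends, indent=1))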





def twitter_search(twitter_api, q, max_results=1000, **kw):

    # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and
    # https://dev.twitter.com/docs/using-search for details on advanced
    # search criteria that may be useful for keyword arguments

    # The Search API returns at most 100 results per request
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)

    statuses = search_results['statuses']

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval. See
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    # for details. A reasonable number of results is ~1000, although
    # that number of results may not exist for all queries.

    # Enforce a reasonable limit
    max_results = min(1000, max_results)

    for _ in range(10): # 10*100 = 1000
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError: # No more results when next_results doesn't exist
            break

        # Create a dictionary from next_results, which has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict([ kv.split('=')
                        for kv in next_results[1:].split("&") ])

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

        if len(statuses) > max_results:
            break

    return statuses
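
# Sample usage: a minimal sketch; the query string is illustrative.
q = '#python'
results = twitter_search(twitter_api, q, max_results=10)
print(json.dumps(results[0], indent=1))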






def extract_tweet_entities(statuses):

    # See https://dev.twitter.com/docs/tweet-entities for more details
    # on tweet entities

    if len(statuses) == 0:
        return [], [], [], [], []

    screen_names = [ user_mention['screen_name']
                     for status in statuses
                     for user_mention in status['entities']['user_mentions'] ]

    hashtags = [ hashtag['text']
                 for status in statuses
                 for hashtag in status['entities']['hashtags'] ]

    urls = [ url['expanded_url']
             for status in statuses
             for url in status['entities']['urls'] ]

    symbols = [ symbol['text']
                for status in statuses
                for symbol in status['entities']['symbols'] ]

    # In some circumstances (such as search results), the media entity
    # may not appear, so fall back to an empty list for each status
    media = [ media['url']
              for status in statuses
              for media in status['entities'].get('media', []) ]

    return screen_names, hashtags, urls, media, symbols
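
# Sample usage: a minimal sketch; the query string is illustrative.
statuses = twitter_search(twitter_api, '#python', max_results=100)
screen_names, hashtags, urls, media, symbols = extract_tweet_entities(statuses)
print(json.dumps(screen_names[:5], indent=1))
print(json.dumps(hashtags[:5], indent=1))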







def find_popular_tweets(twitter_api, statuses, retweet_threshold=3):

    # You could also consider using the favorite_count parameter as part of
    # this heuristic, possibly using it to provide an additional boost to
    # popular tweets in a ranked formulation

    return [ status
             for status in statuses
             if status['retweet_count'] > retweet_threshold ]
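
# Sample usage: a minimal sketch reusing the statuses fetched above; the
# threshold of 10 retweets is illustrative.
popular_tweets = find_popular_tweets(twitter_api, statuses, retweet_threshold=10)
for tweet in popular_tweets:
    print(tweet['text'], tweet['retweet_count'])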





from collections import Counter

def get_common_tweet_entities(statuses, entity_threshold=3):

    # Create a flat list of all tweet entities
    tweet_entities = [ e
                       for status in statuses
                       for entity_type in extract_tweet_entities([status])
                       for e in entity_type ]

    c = Counter(tweet_entities).most_common()

    # Keep only the entities that appear at least entity_threshold times
    return [ (k, v)
             for (k, v) in c
             if v >= entity_threshold ]
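
# Sample usage: a minimal sketch reusing the statuses fetched above.
common_entities = get_common_tweet_entities(statuses, entity_threshold=3)
for entity, count in common_entities:
    print(entity, count)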





import re

def get_rt_attributions(tweet):

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J)

    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_attributions = []

    # Inspect the tweet to see if it was produced with /statuses/retweet/:id.
    # See https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.

    if 'retweeted_status' in tweet:
        attribution = tweet['retweeted_status']['user']['screen_name'].lower()
        rt_attributions.append(attribution)

    # Also, inspect the tweet for the presence of "legacy" retweet patterns
    # such as "RT" and "via", which are still widely used for various reasons
    # and potentially very useful. See https://dev.twitter.com/discussions/2847
    # and https://dev.twitter.com/discussions/1748 for some details on how/why.

    try:
        rt_attributions += [
            mention.strip()
            for mention in rt_patterns.findall(tweet['text'])[0][1].split()
        ]
    except IndexError:
        pass

    # Filter out any duplicates

    return list(set([rta.strip("@").lower() for rta in rt_attributions]))
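
# Sample usage: a minimal sketch reusing the statuses fetched above.
for status in statuses:
    attributions = get_rt_attributions(status)
    if attributions:
        print(attributions, status['text'])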






import pymongo # pip install pymongo

def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):

    # Connects to the MongoDB server running on
    # localhost:27017 by default

    client = pymongo.MongoClient(**mongo_conn_kw)

    # Get a reference to a particular database

    db = client[mongo_db]

    # Reference a particular collection in the database

    coll = db[mongo_db_coll]

    # Perform a bulk insert and return the IDs
    # (Collection.insert was removed from pymongo; insert_many replaces it)

    return coll.insert_many(data).inserted_ids

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):

    # Optionally, use criteria and projection to limit the data that is
    # returned, as documented at
    # http://docs.mongodb.org/manual/reference/method/db.collection.find/

    # Consider leveraging MongoDB's aggregation framework for more
    # sophisticated queries.

    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]

    if criteria is None:
        criteria = {}

    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)

    # Returning a cursor is recommended for large amounts of data

    if return_cursor:
        return cursor
    else:
        return list(cursor)





# save_to_mongo(results, 'search_results005', q)
# load_from_mongo('search_results005', q)
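
# Sample usage: a minimal sketch; the database and collection names are
# illustrative, and a MongoDB server is assumed to be running locally.
results = twitter_search(twitter_api, '#python', max_results=10)
save_to_mongo(results, 'twitter', 'search_results')
print(len(load_from_mongo('twitter', 'search_results')))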









import sys
import time
from urllib.error import URLError # urllib2 in Python 2
from http.client import BadStatusLine # httplib in Python 2



def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):

    # A nested helper function that handles common HTTPErrors. Returns an updated
    # value for wait_period if the problem is a 500-level error. Blocks until the
    # rate limit is reset if it's a rate-limiting issue (429 error). Returns None
    # for 401 and 404 errors, which require special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600: # Seconds
            print('Too many retries. Quitting.', file=sys.stderr)
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes

        if e.e.code == 401:
            print('Encountered 401 Error (Not Authorized)', file=sys.stderr)
            return None
        elif e.e.code == 404:
            print('Encountered 404 Error (Not Found)', file=sys.stderr)
            return None
(Diff truncated; the remainder of make_twitter_request is not shown.)
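
# Sample usage: a minimal sketch of how the wrapper above is typically
# invoked; the screen name is illustrative.
response = make_twitter_request(twitter_api.users.lookup,
                                screen_name="SocialWebMining")
print(json.dumps(response, indent=1))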
