Commit c33a085: Add files via upload (LQ6H, Apr 28, 2020; parent 9cd75c9)
Showing 48 changed files with 1,630 additions and 0 deletions.
@@ -0,0 +1,297 @@
import twitter # pip install twitter

def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation.

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Sample usage
twitter_api = oauth_login()

# Nothing to see by displaying twitter_api except that it's now a
# defined variable

print(twitter_api)


import json


def twitter_trends(twitter_api, woe_id):
    # Prefix the ID with an underscore for query string parameterization.
    # Without the underscore, the twitter package appends the ID value
    # to the URL itself as a special-case keyword argument.
    return twitter_api.trends.place(_id=woe_id)
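
# Sample usage: a minimal sketch. The Yahoo! Where On Earth ID for the
# entire world is 1; json is used only to pretty-print the response.
WORLD_WOE_ID = 1
world_trends = twitter_trends(twitter_api, WORLD_WOE_ID)
print(json.dumps(world_trends, indent=1))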





def twitter_search(twitter_api, q, max_results=1000, **kw):

    # See https://dev.twitter.com/docs/api/1.1/get/search/tweets and
    # https://dev.twitter.com/docs/using-search for details on advanced
    # search criteria that may be useful for keyword arguments

    # The Search API returns at most 100 results per request
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)

    statuses = search_results['statuses']

    # Iterate through batches of results by following the cursor until we
    # reach the desired number of results, keeping in mind that OAuth users
    # can "only" make 180 search queries per 15-minute interval. See
    # https://dev.twitter.com/docs/rate-limiting/1.1/limits
    # for details. A reasonable number of results is ~1000, although
    # that number of results may not exist for all queries.

    # Enforce a reasonable limit
    max_results = min(1000, max_results)

    for _ in range(10): # 10*100 = 1000
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError: # No more results when next_results doesn't exist
            break

        # Create a dictionary from next_results, which has the following form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict([ kv.split('=')
                        for kv in next_results[1:].split("&") ])

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

        if len(statuses) > max_results:
            break

    return statuses
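
# Sample usage: a minimal sketch; the query string is illustrative.
q = '#python'
results = twitter_search(twitter_api, q, max_results=10)
print(json.dumps(results[0], indent=1))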






def extract_tweet_entities(statuses):

    # See https://dev.twitter.com/docs/tweet-entities for more details
    # on tweet entities

    if len(statuses) == 0:
        return [], [], [], [], []

    screen_names = [ user_mention['screen_name']
                     for status in statuses
                     for user_mention in status['entities']['user_mentions'] ]

    hashtags = [ hashtag['text']
                 for status in statuses
                 for hashtag in status['entities']['hashtags'] ]

    urls = [ url['expanded_url']
             for status in statuses
             for url in status['entities']['urls'] ]

    symbols = [ symbol['text']
                for status in statuses
                for symbol in status['entities']['symbols'] ]

    # In some circumstances (such as search results), the media entity
    # may not appear, so fall back to an empty list for each status
    media = [ media['url']
              for status in statuses
              for media in status['entities'].get('media', []) ]

    return screen_names, hashtags, urls, media, symbols
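
# Sample usage: a minimal sketch; the query string is illustrative.
statuses = twitter_search(twitter_api, '#python', max_results=100)
screen_names, hashtags, urls, media, symbols = extract_tweet_entities(statuses)
print(json.dumps(screen_names[:5], indent=1))
print(json.dumps(hashtags[:5], indent=1))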







def find_popular_tweets(twitter_api, statuses, retweet_threshold=3):

    # You could also consider using the favorite_count parameter as part of
    # this heuristic, possibly using it to provide an additional boost to
    # popular tweets in a ranked formulation

    return [ status
             for status in statuses
             if status['retweet_count'] > retweet_threshold ]
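
# Sample usage: a minimal sketch reusing the statuses fetched above; the
# threshold of 10 retweets is illustrative.
popular_tweets = find_popular_tweets(twitter_api, statuses, retweet_threshold=10)
for tweet in popular_tweets:
    print(tweet['text'], tweet['retweet_count'])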





from collections import Counter

def get_common_tweet_entities(statuses, entity_threshold=3):

    # Create a flat list of all tweet entities
    tweet_entities = [ e
                       for status in statuses
                       for entity_type in extract_tweet_entities([status])
                       for e in entity_type ]

    c = Counter(tweet_entities).most_common()

    # Keep only the entities that appear at least entity_threshold times
    return [ (k, v)
             for (k, v) in c
             if v >= entity_threshold ]
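
# Sample usage: a minimal sketch reusing the statuses fetched above.
common_entities = get_common_tweet_entities(statuses, entity_threshold=3)
for entity, count in common_entities:
    print(entity, count)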





import re

def get_rt_attributions(tweet):

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J)

    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    rt_attributions = []

    # Inspect the tweet to see if it was produced with /statuses/retweet/:id.
    # See https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.

    if 'retweeted_status' in tweet:
        attribution = tweet['retweeted_status']['user']['screen_name'].lower()
        rt_attributions.append(attribution)

    # Also, inspect the tweet for the presence of "legacy" retweet patterns
    # such as "RT" and "via", which are still widely used for various reasons
    # and potentially very useful. See https://dev.twitter.com/discussions/2847
    # and https://dev.twitter.com/discussions/1748 for some details on how/why.

    try:
        rt_attributions += [
            mention.strip()
            for mention in rt_patterns.findall(tweet['text'])[0][1].split()
        ]
    except IndexError:
        pass

    # Filter out any duplicates

    return list(set([rta.strip("@").lower() for rta in rt_attributions]))
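
# Sample usage: a minimal sketch reusing the statuses fetched above.
for status in statuses:
    attributions = get_rt_attributions(status)
    if attributions:
        print(attributions, status['text'])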






import pymongo # pip install pymongo

def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):

    # Connects to the MongoDB server running on
    # localhost:27017 by default

    client = pymongo.MongoClient(**mongo_conn_kw)

    # Get a reference to a particular database

    db = client[mongo_db]

    # Reference a particular collection in the database

    coll = db[mongo_db_coll]

    # Perform a bulk insert and return the IDs
    # (Collection.insert was removed from pymongo; insert_many replaces it)

    return coll.insert_many(data).inserted_ids

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):

    # Optionally, use criteria and projection to limit the data that is
    # returned, as documented at
    # http://docs.mongodb.org/manual/reference/method/db.collection.find/

    # Consider leveraging MongoDB's aggregation framework for more
    # sophisticated queries.

    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]

    if criteria is None:
        criteria = {}

    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)

    # Returning a cursor is recommended for large amounts of data

    if return_cursor:
        return cursor
    else:
        return list(cursor)





# save_to_mongo(results, 'search_results005', q)
# load_from_mongo('search_results005', q)
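
# Sample usage: a minimal sketch; the database and collection names are
# illustrative, and a MongoDB server is assumed to be running locally.
results = twitter_search(twitter_api, '#python', max_results=10)
save_to_mongo(results, 'twitter', 'search_results')
print(len(load_from_mongo('twitter', 'search_results')))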









import sys
import time
from urllib.error import URLError # urllib2 in Python 2
from http.client import BadStatusLine # httplib in Python 2



def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):

    # A nested helper function that handles common HTTPErrors. Returns an updated
    # value for wait_period if the problem is a 500-level error. Blocks until the
    # rate limit is reset if it's a rate-limiting issue (429 error). Returns None
    # for 401 and 404 errors, which require special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600: # Seconds
            print('Too many retries. Quitting.', file=sys.stderr)
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes

        if e.e.code == 401:
            print('Encountered 401 Error (Not Authorized)', file=sys.stderr)
            return None
        elif e.e.code == 404:
            print('Encountered 404 Error (Not Found)', file=sys.stderr)
            return None
(Diff truncated; the remainder of make_twitter_request is not shown.)
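
# Sample usage: a minimal sketch of how the wrapper above is typically
# invoked; the screen name is illustrative.
response = make_twitter_request(twitter_api.users.lookup,
                                screen_name="SocialWebMining")
print(json.dumps(response, indent=1))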
