-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrapeAHsubs.py
94 lines (75 loc) · 3.02 KB
/
ScrapeAHsubs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
import praw
import re
import pandas as pd
import argparse
# Sort orders the script knows how to request from a subreddit listing.
FILTER_CHOICES = ('hot', 'new', 'rising', 'top', 'controversial')
# Time windows accepted for the "top" and "controversial" sort orders.
TIME_FILTER_CHOICES = ('all', 'day', 'hour', 'month', 'week', 'year')
DEFAULT_TIME_FILTER = 'all'


def build_parser():
    """Build the command-line parser for the subreddit scraper.

    ``prog`` is left at argparse's default (the script name) so the usage
    line shows how to invoke the script; the human-readable summary lives
    in ``description`` instead.

    Returns:
        argparse.ArgumentParser: parser exposing ``subreddit``,
        ``filter_var`` and ``time_filter`` on the parsed namespace.
        ``filter_var`` and ``time_filter`` are restricted to the values in
        ``FILTER_CHOICES`` / ``TIME_FILTER_CHOICES``; any other value makes
        argparse print a usage error and exit with status 2.
    """
    cli = argparse.ArgumentParser(
        description=(
            'Scrape a subreddit using PRAW. Given a subreddit name and '
            'filtering method (hot, new, rising, top, controversial), '
            'scrape last 1000 posts and comments'
        ),
    )
    cli.add_argument(
        '-s', '--subreddit',
        required=True,
        help='Name of subreddit.',
    )
    cli.add_argument(
        '-f', '--filter_var',
        required=True,
        choices=FILTER_CHOICES,
        help='Name of filtering method.',
    )
    cli.add_argument(
        '-t', '--time_filter',
        required=False,
        choices=TIME_FILTER_CHOICES,
        default=DEFAULT_TIME_FILTER,
        help=(
            'Time frame if top or controversial, specify as all, day, hour, '
            'month, week, year (default: all).'
        ),
    )
    return cli


parser = build_parser()

# Only touch sys.argv when executed as a script, so importing this module
# does not try to parse (and exit on) the importer's command line.
if __name__ == '__main__':
    args = parser.parse_args()
    print(args)

    subreddit_name = args.subreddit
    filter_var = args.filter_var
    # argparse already substituted DEFAULT_TIME_FILTER when -t was omitted.
    time_filter = args.time_filter

    print(subreddit_name)
    print(filter_var)
    print(time_filter)
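# NOTE(review): the client_id / client_secret in the commented-out block below look like
# real API credentials committed in plain text. They should be revoked and, if this code is
# re-enabled, loaded from environment variables or a praw.ini file instead of the source.
# user_agent = 'AH 1.0 by /u/cynophopic'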
# reddit = praw.Reddit(
# client_id = '13gOYOmLudJH9JvfGAWBeQ',
# client_secret = 'y0QcvAfK2HfNiYEKuc73PghzcXHmGA',
# user_agent = user_agent
# )
# # time_filter – Can be one of: "all", "day", "hour", "month", "week", or "year" (default: "all")
# # if and only if filter_var is either "top" or "controversial"
# if filter_var == 'hot':
# subreddit_contents = reddit.subreddit(subreddit_name).hot(limit = None)
# elif filter_var == 'new':
# subreddit_contents = reddit.subreddit(subreddit_name).new(limit = None)
# elif filter_var == 'rising':
# subreddit_contents = reddit.subreddit(subreddit_name).rising(limit = None)
# elif filter_var == 'top':
# subreddit_contents = reddit.subreddit(subreddit_name).top(limit = None, time_filter=time_filter)
# elif filter_var == 'controversial':
# subreddit_contents = reddit.subreddit(subreddit_name).controversial(limit = None, time_filter=time_filter)
# # hot new rising top controversial
# submission_titles = []
# submission_bodies = []
# submission_ids = []
# submission_upvotes = []
# comment_ids = []
# comment_labels = []
# comment_upvotes = []
# for i, submission in enumerate(subreddit_contents):
# if str(submission.author) == 'AITAMod':
# continue
# if i % 10 == 0:
# print(i)
# submission.comments.replace_more(limit=None)
# submission_titles.append(submission.title)
# submission_bodies.append(submission.selftext)
# submission_upvotes.append(submission.score)
# submission_id = submission.id
# submission_ids.append(submission_id)
# for comment in submission.comments:
# #if (isinstance(comment, praw.models.MoreComments)):
# comment_body = comment.body.replace('\n', '').lower()
# nta = bool(re.search('(^| )nta', comment_body))
# yta = bool(re.search('(^| )yta', comment_body))
# if nta & yta:
# continue
# if not nta and not yta:
# continue
# comment_ids.append(submission_id)
# comment_upvotes.append(comment.score)
# if nta:
# comment_labels.append('NTA')
# if yta:
# comment_labels.append('YTA')