From 1385fc6c7a4949d0ca1114d51f5634a6aada9620 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 12 Nov 2017 15:12:10 +0000 Subject: [PATCH 001/178] Boolean values in config file. --- common.py | 8 +++++++- outputs/elastic_output.py | 3 ++- pastehunter.py | 16 +++++++--------- settings.conf.sample | 4 ++-- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/common.py b/common.py index cb8a7dc..69e92b6 100644 --- a/common.py +++ b/common.py @@ -13,7 +13,13 @@ def parse_config(): for section in config.sections(): section_dict = {} for key, value in config.items(section): - section_dict[key] = value + if value.lower() == 'true': + new_val = True + elif value.lower() == 'false': + new_val = False + else: + new_val = value + section_dict[key] = new_val config_dict[section] = section_dict else: config_dict['valid'] = False diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index d98fa2f..ddf08f6 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -17,7 +17,8 @@ def __init__(self): try: self.es = Elasticsearch(es_host, port=es_port, http_auth=(es_user, es_pass), use_ssl=es_ssl) self.test = True - except Exception: + except Exception as e: + print(e) raise Exception('Unable to Connect') from None def store_paste(self, paste_data): diff --git a/pastehunter.py b/pastehunter.py index dcf2b0e..7da3bf4 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -18,34 +18,31 @@ api_scrape = conf['pastebin']['api_scrape'] api_raw = conf['pastebin']['api_raw'] rule_path = conf['yara']['rule_path'] +store_all = conf['pastebin']['store_all'] print("Configure Outputs") # configure outputs outputs = [] -if conf['elastic_output']['enabled'] == 'True': +if conf['elastic_output']['enabled']: es = elastic_output.ElasticOutput() outputs.append(es) -if conf['json_output']['enabled'] == 'True': +if conf['json_output']['enabled']: js = json_output.JsonOutput() outputs.append(js) -if conf['csv_output']['enabled'] == 'True': +if conf['csv_output']['enabled']: csv = csv_output.CSVOutput() outputs.append(csv) -if conf['syslog_output']['enabled'] == 'True': +if conf['syslog_output']['enabled']: syslog = syslog_output.SyslogOutput() outputs.append(syslog) -if conf['smtp_output']['enabled'] == 'True': +if conf['smtp_output']['enabled']: smtp = smtp_output.SMTPOutput() outputs.append(smtp) -# Do we need to store all pastes, irrespective of Yara rule matches ? -if conf['pastebin']['store_all'] == 'True': - store_all = True - def yara_index(rule_path): index_file = os.path.join(rule_path, 'index.yar') @@ -126,6 +123,7 @@ def yara_index(rule_path): rule_match = s[1].lstrip('$') if rule_match not in results: results.append(rule_match) + results.append(str(match.rule)) # But a break in here for the base64. Will use it later. 
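    # (b64-prefixed rules are only recorded by name at this point; the
    #  decoding itself is deferred until a post-processing step exists.)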
elif match.rule.startswith('b64'): diff --git a/settings.conf.sample b/settings.conf.sample index beac071..452a582 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -2,7 +2,7 @@ api_scrape = https://pastebin.com/api_scraping.php api_raw = https://pastebin.com/api_scrape_item.php?i= paste_limit = 200 -store_all = false +store_all = False [elastic_output] enabled = True @@ -24,7 +24,7 @@ enabled = True csv_path = logs/csv/ [syslog_output] -enabled = True +enabled = False host = 192.168.1.1 port = 514 From 9b0dfbd3f36d6b0ebcdbcb0302179f7759456964 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 16 Nov 2017 15:44:03 +0000 Subject: [PATCH 002/178] Implement Blacklist Tune Rules Tidy Console Output --- YaraRules/blacklist.yar | 14 ++++++++++++++ YaraRules/database.yar | 21 +++++++++++++++++++++ YaraRules/password_leak.yar | 4 ++-- pastehunter.py | 18 ++++++++++++++++-- settings.conf.sample | 3 ++- 5 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 YaraRules/blacklist.yar diff --git a/YaraRules/blacklist.yar b/YaraRules/blacklist.yar new file mode 100644 index 0000000..c1cba30 --- /dev/null +++ b/YaraRules/blacklist.yar @@ -0,0 +1,14 @@ +rule blacklist +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = "#EXTINF:" nocase // IPTV stream Lists. + $b = "--app-name=LeagueClient" nocase // League of Legends Debug Log + condition: + any of them + +} \ No newline at end of file diff --git a/YaraRules/database.yar b/YaraRules/database.yar index 8000287..4febba6 100644 --- a/YaraRules/database.yar +++ b/YaraRules/database.yar @@ -16,4 +16,25 @@ rule db_connection condition: $a and not any of ($n*) +} + +rule db_structure +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = "CREATE TABLE" nocase + $b = "INSERT INTO" nocase + $c = "VALUES" nocase + $d = "ENGINE" nocase + $e = "CHARSET" nocase + $f = "NOT NULL" nocase + $g = "varchar" nocase + $h = "PRIMARY KEY" + + condition: + 5 of them } \ No newline at end of file diff --git a/YaraRules/password_leak.yar b/YaraRules/password_leak.yar index cd049f9..f6f631d 100644 --- a/YaraRules/password_leak.yar +++ b/YaraRules/password_leak.yar @@ -10,10 +10,9 @@ rule email_list reference = "https://github.com/kevthehermit/PasteHunter" strings: - //$email_add1 = /^([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)$/ $email_add = /\b[\w\.-]+@[\w\.-]+\.\w+\b/ condition: - #email_add > 10 + #email_add > 20 } @@ -27,6 +26,7 @@ rule password_list strings: $data_format = /\b([@a-zA-Z0-9._-]{5,})(:|\|)(.*)\b/ + condition: #data_format > 10 diff --git a/pastehunter.py b/pastehunter.py index 7da3bf4..816ad3f 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -18,6 +18,7 @@ api_scrape = conf['pastebin']['api_scrape'] api_raw = conf['pastebin']['api_raw'] rule_path = conf['yara']['rule_path'] +blacklist = conf['yara']['blacklist'] store_all = conf['pastebin']['store_all'] print("Configure Outputs") @@ -80,7 +81,10 @@ def yara_index(rule_path): print("Processing Results") # Iterate the results store_count = 0 +skipped_count = 0 +blacklist_count = 0 paste_ids = '' + # Get paste ids from last round if os.path.exists('paste_history.tmp'): with open('paste_history.tmp', 'r')as old: @@ -92,7 +96,7 @@ def yara_index(rule_path): # Track paste ids to prevent dupes paste_ids += '{0},'.format(paste['key']) if paste['key'] in old_pastes: - print("Already Processed, 
Skipping") + skipped_count += 1 continue # Create a new paste dict for us to modify @@ -133,6 +137,13 @@ def yara_index(rule_path): else: results.append(match.rule) + # Blacklist Check + # If any of the blacklist rules appear then empty the result set + if blacklist and 'blacklist' in results: + results = [] + print("Blacklisted paste {0}".format(paste['key'])) + blacklist_count += 0 + # If we have a result add some meta data and send to storage # If results is empty, ie no match, and store_all is True, # then append "no_match" to results. This will then force output. @@ -142,6 +153,7 @@ def yara_index(rule_path): results.append('no_match') if len(results) > 0: + encoded_paste_data = raw_paste_data.encode('utf-8') md5 = hashlib.md5(encoded_paste_data).hexdigest() sha256 = hashlib.sha256(encoded_paste_data).hexdigest() @@ -153,7 +165,9 @@ def yara_index(rule_path): output.store_paste(paste_data) store_count += 1 -print("Saved {0} Pastes".format(store_count)) +print("\n\nSaved {0} Pastes".format(store_count)) +print("Skipped {0} Pastes".format(skipped_count)) +print("Blacklisted {0} Pastes\n\n".format(blacklist_count)) # Store paste ids for next check with open('paste_history.tmp', 'w')as old: old.write(paste_ids) diff --git a/settings.conf.sample b/settings.conf.sample index 452a582..d5eb6f5 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -37,4 +37,5 @@ username = password = [yara] -rule_path = YaraRules \ No newline at end of file +rule_path = YaraRules +blacklist = True \ No newline at end of file From 00fafa96722b0c116c205d0b5265a1b3eb30559c Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 16 Nov 2017 23:46:59 +0000 Subject: [PATCH 003/178] Major Revision Modular Inputs for multiple paste sites. Threading and Queue No more Cron Start to use logging not print() --- .gitignore | 1 + README.md | 4 +- YaraRules/index.yar | 12 +- inputs/__init__.py | 0 inputs/dumpz.py | 41 ++++++ inputs/pastebin.py | 41 ++++++ outputs/csv_output.py | 9 +- outputs/elastic_output.py | 12 +- outputs/json_output.py | 2 +- outputs/syslog_output.py | 9 +- pastehunter.py | 263 ++++++++++++++++++++----------------- postprocess/postprocess.py | 14 ++ 12 files changed, 270 insertions(+), 138 deletions(-) create mode 100644 inputs/__init__.py create mode 100644 inputs/dumpz.py create mode 100644 inputs/pastebin.py create mode 100644 postprocess/postprocess.py diff --git a/.gitignore b/.gitignore index 239f2bc..f18d0b0 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,4 @@ ENV/ .mypy_cache/ /settings.conf /YaraRules/custom_keywords.yar +/paste_history.tmp diff --git a/README.md b/README.md index 56bcdf4..503ed25 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ If you have yara errors check the installed version numbers for yara and yara-py ### This little app git clone https://github.com/kevthehermit/pastehunter -# Configure +## Configure copy settings.conf.sample to settings.conf populate the details. For the scraping API you need to whitelist your IP on pastebin. No API key is required. See the link above -# Running +## Running This needs python 3 as per the prereqs. 
You can run it on its own with ```python3 pastehunter.py``` diff --git a/YaraRules/index.yar b/YaraRules/index.yar index dde82a0..9068a02 100644 --- a/YaraRules/index.yar +++ b/YaraRules/index.yar @@ -1,8 +1,10 @@ -include "core_keywords.yar" -include "base64.yar" +include "blacklist.yar" +include "api_keys.yar" include "password_leak.yar" -include "custom_keywords.yar" +include "database.yar" +include "base64.yar" +include "core_keywords.yar" include "hak5.yar" -include "api_keys.yar" +include "custom_keywords.yar" +include "general.yar" include "powershell.yar" -include "database.yar" \ No newline at end of file diff --git a/inputs/__init__.py b/inputs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inputs/dumpz.py b/inputs/dumpz.py new file mode 100644 index 0000000..b40b971 --- /dev/null +++ b/inputs/dumpz.py @@ -0,0 +1,41 @@ +import requests +import logging + + +def recent_pastes(conf, input_history): + # populate vars from config + paste_limit = conf['dumpz']['paste_limit'] + api_scrape = conf['dumpz']['api_scrape'] + history = [] + paste_list = [] + try: + # Create the API uri + scrape_uri = '{0}?limit={1}'.format(api_scrape, paste_limit) + # Get some pastes and convert to json + # Get last 'paste_limit' pastes + paste_list_request = requests.get(scrape_uri) + paste_list_json = paste_list_request.json() + + for paste in paste_list_json['dumps']: + # Track paste ids to prevent dupes + history.append(paste['id']) + if paste['id'] in input_history: + continue + + # We don't want password protected pastes + if paste['pwd'] == 1: + continue + + # Create a new paste dict for us to normalize + paste_data = paste + paste_data['pasteid'] = paste['id'] + paste_data['pastesite'] = 'dumpz.org' + paste_data['scrape_url'] = '{0}{1}'.format(conf['dumpz']['api_raw'], paste['id']) + # Add a date field that kibana will map + paste_data['@timestamp'] = paste_data['date'] + paste_list.append(paste_data) + return paste_list, history + + except Exception as e: + logging.error("Unable to parse paste results: {0}".format(e)) + return paste_list, history \ No newline at end of file diff --git a/inputs/pastebin.py b/inputs/pastebin.py new file mode 100644 index 0000000..6703b5a --- /dev/null +++ b/inputs/pastebin.py @@ -0,0 +1,41 @@ +import requests +import logging +from datetime import datetime + + +def recent_pastes(conf, input_history): + # populate vars from config + paste_limit = conf['pastebin']['paste_limit'] + api_scrape = conf['pastebin']['api_scrape'] + history = [] + paste_list = [] + try: + # Create the API uri + scrape_uri = '{0}?limit={1}'.format(api_scrape, paste_limit) + # Get some pastes and convert to json + # Get last 'paste_limit' pastes + paste_list_request = requests.get(scrape_uri) + paste_list_json = paste_list_request.json() + + for paste in paste_list_json: + # Track paste ids to prevent dupes + history.append(paste['key']) + if paste['key'] in input_history: + continue + + # Create a new paste dict for us to normalize + paste_data = paste + paste_data['pasteid'] = paste['key'] + paste_data['pastesite'] = 'pastebin.com' + # Add a date field that kibana will map + date = datetime.utcfromtimestamp(float(paste_data['date'])).isoformat() + paste_data['@timestamp'] = date + paste_list.append(paste_data) + return paste_list, history + + except Exception as e: + logging.error("Unable to parse paste results: {0}".format(e)) + return paste_list, history + + + diff --git a/outputs/csv_output.py b/outputs/csv_output.py index 1fbc456..92d2248 100644 --- 
a/outputs/csv_output.py +++ b/outputs/csv_output.py @@ -26,11 +26,12 @@ def __init__(self): def store_paste(self, paste_data): if self.test: # date, _id, YaraRule, raw_url - csv_line = '{0},{1},{2},{3}'.format(paste_data['@timestamp'], - paste_data['key'], + csv_line = '{0},{1},{2},{3},{4}'.format(paste_data['@timestamp'], + paste_data['pasteid'], paste_data['YaraRule'], - paste_data['scrape_url']) + paste_data['scrape_url'], + paste_data['pastesite']) with open(self.csv_path, 'a') as out: out.write('{0}\n'.format(csv_line)) else: - print("CSV Output Error") \ No newline at end of file + print("CSV Output Error") diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index ddf08f6..cab1cf8 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -2,6 +2,7 @@ from common import parse_config config = parse_config() +import logging class ElasticOutput(): @@ -25,7 +26,12 @@ def store_paste(self, paste_data): if self.test: index_name = self.es_index # Consider adding date to the index - self.es.index(index=index_name, doc_type='paste', id=paste_data['key'], body=paste_data) - print("Stored Paste {0}, Matched Rule {1}".format(paste_data['key'], paste_data['YaraRule'])) + # ToDo: With multiple paste sites a pasteid collision is more likly! + self.es.index(index=index_name, doc_type='paste', id=paste_data['pasteid'], body=paste_data) + logging.info("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], + paste_data['pasteid'], + paste_data['YaraRule'] + ) + ) else: - print("Elastic Search Enabled, not configured!") \ No newline at end of file + logging.error("Elastic Search Enabled, not configured!") diff --git a/outputs/json_output.py b/outputs/json_output.py index b28d5e9..0a343af 100644 --- a/outputs/json_output.py +++ b/outputs/json_output.py @@ -24,7 +24,7 @@ def store_paste(self, paste_data): del paste_data['raw_paste'] if self.test: - json_file = os.path.join(self.json_path, paste_data['key']) + json_file = os.path.join(self.json_path, str(paste_data['pasteid'])) with open(json_file, 'w') as out: out.write(json.dumps(paste_data, indent=4)) else: diff --git a/outputs/syslog_output.py b/outputs/syslog_output.py index 688cef3..d2dc777 100644 --- a/outputs/syslog_output.py +++ b/outputs/syslog_output.py @@ -9,10 +9,11 @@ def store_paste(self, paste_data): host = config['syslog_output']['host'] port = int(config['syslog_output']['port']) - syslog_line = '{0} "{1}" "{2}" "{3}"'.format(paste_data['@timestamp'], - paste_data['key'], - paste_data['YaraRule'], - paste_data['scrape_url']) + syslog_line = '"{0}" "{1}" "{2}" "{3}" "{4}"'.format(paste_data['@timestamp'], + paste_data['pasteid'], + paste_data['YaraRule'], + paste_data['scrape_url'], + paste_data['pastesite']) syslog = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) syslog.connect((host, port)) syslog.send(syslog_line.encode('utf-8')) diff --git a/pastehunter.py b/pastehunter.py index 816ad3f..afffd98 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -3,46 +3,61 @@ import os import sys import yara +import json import hashlib import requests -import datetime +from time import sleep from common import parse_config from outputs import elastic_output, json_output, csv_output, syslog_output, smtp_output +from queue import Queue +import threading +import importlib +import logging -print("Reading Configs") +lock = threading.Lock() + +# Set some logging options +logging.basicConfig(level=logging.INFO) +logging.getLogger('requests').setLevel(logging.ERROR) + +logging.info("Reading Configs") # Parse 
the config file conf = parse_config() # populate vars from config -paste_limit = conf['pastebin']['paste_limit'] -api_scrape = conf['pastebin']['api_scrape'] api_raw = conf['pastebin']['api_raw'] rule_path = conf['yara']['rule_path'] blacklist = conf['yara']['blacklist'] store_all = conf['pastebin']['store_all'] +input_list = conf['inputs']['inputs'] -print("Configure Outputs") +logging.info("Configure Outputs") # configure outputs outputs = [] if conf['elastic_output']['enabled']: es = elastic_output.ElasticOutput() outputs.append(es) + logging.info("Elastic Output Enabled") if conf['json_output']['enabled']: js = json_output.JsonOutput() outputs.append(js) + logging.info("Json Output Enabled") if conf['csv_output']['enabled']: csv = csv_output.CSVOutput() outputs.append(csv) + logging.info("CSV Output Enabled") if conf['syslog_output']['enabled']: syslog = syslog_output.SyslogOutput() outputs.append(syslog) + logging.info("Syslog Output Enabled") if conf['smtp_output']['enabled']: smtp = smtp_output.SMTPOutput() outputs.append(smtp) + logging.info("SMTP Output Enabled") def yara_index(rule_path): @@ -54,120 +69,130 @@ def yara_index(rule_path): yar.write(include) -print("Compile Yara Rules") -try: - # Update the yara rules index - yara_index(rule_path) - # Compile the yara rules we will use to match pastes - index_file = os.path.join(rule_path, 'index.yar') - rules = yara.compile(index_file) -except Exception as e: - print("Unable to Create Yara index: ", e) - sys.exit() - -print("Connecting to Pastebin") -try: - # Create the API uri - scrape_uri = '{0}?limit={1}'.format(api_scrape, paste_limit) - # Get some pastes and convert to json - # Get last 'paste_limit' pastes - paste_list_request = requests.get(scrape_uri) - paste_list_json = paste_list_request.json() -except Exception as e: - print("Unable to parse paste results: ", e) - sys.exit() - - -print("Processing Results") -# Iterate the results -store_count = 0 -skipped_count = 0 -blacklist_count = 0 -paste_ids = '' - -# Get paste ids from last round -if os.path.exists('paste_history.tmp'): - with open('paste_history.tmp', 'r')as old: - old_pastes = old.read().split(',') -else: - old_pastes = [] - -for paste in paste_list_json: - # Track paste ids to prevent dupes - paste_ids += '{0},'.format(paste['key']) - if paste['key'] in old_pastes: - skipped_count += 1 - continue - - # Create a new paste dict for us to modify - paste_data = paste - - # Add a date field that kibana will map - date = datetime.datetime.utcfromtimestamp(float(paste_data['date'])).isoformat() - paste_data['@timestamp'] = date - - # get raw paste and hash them - raw_paste_uri = paste['scrape_url'] - raw_paste_data = requests.get(raw_paste_uri).text - - # Process the paste data here +def paste_scanner(): + # Get a paste URI from the Queue + # Fetch the raw paste + # scan the Paste + # Store the Paste + while True: + paste_data = q.get() + logging.info("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] + raw_paste_data = requests.get(raw_paste_uri).text + # Process the paste data here + + try: + # Scan with yara + matches = rules.match(data=raw_paste_data) + except Exception as e: + logging.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + continue + results = [] + for match in matches: + # For keywords get the word from the matched string + if match.rule == 'core_keywords' or match.rule == 'custom_keywords': + for s in match.strings: 
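+                    # (In yara-python 3.x each entry in match.strings is an
+                    #  (offset, identifier, bytes) tuple, so s[1] is the string
+                    #  identifier such as '$keyword1' - hence the lstrip('$') below.)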
+ rule_match = s[1].lstrip('$') + if rule_match not in results: + results.append(rule_match) + results.append(str(match.rule)) + + # But a break in here for the base64. Will use it later. + elif match.rule.startswith('b64'): + results.append(match.rule) + + # Else use the rule name + else: + results.append(match.rule) + + # Blacklist Check + # If any of the blacklist rules appear then empty the result set + if blacklist and 'blacklist' in results: + results = [] + logging.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + + # If we have a result add some meta data and send to storage + # If results is empty, ie no match, and store_all is True, + # then append "no_match" to results. This will then force output. + + if store_all is True: + if len(results) == 0: + results.append('no_match') + + if len(results) > 0: + + encoded_paste_data = raw_paste_data.encode('utf-8') + md5 = hashlib.md5(encoded_paste_data).hexdigest() + sha256 = hashlib.sha256(encoded_paste_data).hexdigest() + paste_data['MD5'] = md5 + paste_data['SHA256'] = sha256 + paste_data['raw_paste'] = raw_paste_data + paste_data['YaraRule'] = results + for output in outputs: + output.store_paste(paste_data) + + # Mark Tasks as complete + q.task_done() + + +if __name__ == "__main__": + logging.info("Compile Yara Rules") try: - # Scan with yara - matches = rules.match(data=raw_paste_data) + # Update the yara rules index + yara_index(rule_path) + # Compile the yara rules we will use to match pastes + index_file = os.path.join(rule_path, 'index.yar') + rules = yara.compile(index_file) except Exception as e: - print("Unable to scan raw paste : {0} - {1}".format(paste['key'], e)) - continue - - results = [] - for match in matches: - # For keywords get the word from the matched string - if match.rule == 'core_keywords' or match.rule == 'custom_keywords': - for s in match.strings: - rule_match = s[1].lstrip('$') - if rule_match not in results: - results.append(rule_match) - results.append(str(match.rule)) - - # But a break in here for the base64. Will use it later. - elif match.rule.startswith('b64'): - results.append(match.rule) - - # Else use the rule name - else: - results.append(match.rule) - - # Blacklist Check - # If any of the blacklist rules appear then empty the result set - if blacklist and 'blacklist' in results: - results = [] - print("Blacklisted paste {0}".format(paste['key'])) - blacklist_count += 0 - - # If we have a result add some meta data and send to storage - # If results is empty, ie no match, and store_all is True, - # then append "no_match" to results. This will then force output. 
- - if store_all is True: - if len(results) == 0: - results.append('no_match') - - if len(results) > 0: - - encoded_paste_data = raw_paste_data.encode('utf-8') - md5 = hashlib.md5(encoded_paste_data).hexdigest() - sha256 = hashlib.sha256(encoded_paste_data).hexdigest() - paste_data['MD5'] = md5 - paste_data['SHA256'] = sha256 - paste_data['raw_paste'] = raw_paste_data - paste_data['YaraRule'] = results - for output in outputs: - output.store_paste(paste_data) - store_count += 1 - -print("\n\nSaved {0} Pastes".format(store_count)) -print("Skipped {0} Pastes".format(skipped_count)) -print("Blacklisted {0} Pastes\n\n".format(blacklist_count)) -# Store paste ids for next check -with open('paste_history.tmp', 'w')as old: - old.write(paste_ids) + print("Unable to Create Yara index: ", e) + sys.exit() + + # Create Queue to hold paste URI's + q = Queue() + + # Threads + for i in range(5): + t = threading.Thread(target=paste_scanner) + t.daemon = True + t.start() + + # Now Fill the Queue + try: + while True: + # Paste History + logging.info("Populating Queue") + if os.path.exists('paste_history.tmp'): + with open('paste_history.tmp') as json_file: + paste_history = json.load(json_file) + else: + paste_history = {} + + for input_name in input_list.split(','): + if input_name in paste_history: + input_history = paste_history[input_name] + else: + input_history = [] + + import_name = 'inputs.{0}'.format(input_name) + i = importlib.import_module(import_name) + # Get list of recent pastes + paste_list, history = i.recent_pastes(conf, input_history) + for paste in paste_list: + q.put(paste) + paste_history[input_name] = history + + # Write History + with open('paste_history.tmp', 'w') as outfile: + json.dump(paste_history, outfile) + + # Flush the list + q.join() + + # Slow it down a little + logging.info("Sleeping for 30") + sleep(10) + + except KeyboardInterrupt: + logging.info("Stopping Threads") diff --git a/postprocess/postprocess.py b/postprocess/postprocess.py new file mode 100644 index 0000000..f2dea09 --- /dev/null +++ b/postprocess/postprocess.py @@ -0,0 +1,14 @@ +# This gets the raw paste, yara rules and the paste id +# Post runs after paste is stored. +# We call in post process modules +# If we are in elastic search we update the document + + +class PostProcess: + def __init__(self): + self.raw_paste = '' + self.yararules = [] + self.pasteid = '' + + def run(self): + pass From 211b84f1a8b0b94f54453c1bf023a123878e055a Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 17 Nov 2017 00:09:05 +0000 Subject: [PATCH 004/178] Prep for Gists --- inputs/gists.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ pastehunter.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 inputs/gists.py diff --git a/inputs/gists.py b/inputs/gists.py new file mode 100644 index 0000000..cfc0274 --- /dev/null +++ b/inputs/gists.py @@ -0,0 +1,48 @@ +# ToDo: +''' +Just some notes for reference while i have them + +https://developer.github.com/v3/rate_limit/ +GET /rate_limit # Doesnt count against your rate limit + +{ + "resources": { + "core": { + "limit": 5000, + "remaining": 4999, + "reset": 1372700873 + }, + "search": { + "limit": 30, + "remaining": 18, + "reset": 1372697452 + } + }, + "rate": { + "limit": 5000, + "remaining": 4999, + "reset": 1372700873 + } +} + + +https://developer.github.com/v3/gists/#list-all-public-gists + + +GET /gists/public + +Github API only returns 1Mb of data per gist. look for "truncated" +You will need to get the raw_gist to get full file. 
If its over 10Mb you need to clone the gist. + +Each gist can hold multiple files + + +Get a single gist + +GET /gists/:id + + + + + +''' \ No newline at end of file diff --git a/pastehunter.py b/pastehunter.py index afffd98..c027230 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -191,7 +191,7 @@ def paste_scanner(): q.join() # Slow it down a little - logging.info("Sleeping for 30") + logging.info("Sleeping for 10 Seconds") sleep(10) except KeyboardInterrupt: From 821fac3907edd11d2a7186618bc082008487ab1a Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 17 Nov 2017 00:10:07 +0000 Subject: [PATCH 005/178] Sample Conf update --- settings.conf.sample | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/settings.conf.sample b/settings.conf.sample index d5eb6f5..c9158b3 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -1,8 +1,16 @@ +[inputs] +inputs = pastebin,dumpz + [pastebin] api_scrape = https://pastebin.com/api_scraping.php api_raw = https://pastebin.com/api_scrape_item.php?i= paste_limit = 200 -store_all = False +store_all = false + +[dumpz] +api_scrape = https://dumpz.org/api/recent +api_raw = https://dumpz.org/api/dump/ +paste_limit = 200 [elastic_output] enabled = True From a4299ddffefdc2cb9bfdf83b5fa4f758eb526e29 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 17 Nov 2017 13:05:11 +0000 Subject: [PATCH 006/178] Sample code for gists --- inputs/gists.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inputs/gists.py b/inputs/gists.py index cfc0274..947ad9a 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -41,6 +41,24 @@ GET /gists/:id +example code + + +# Create a token - https://github.com/settings/tokens Needs only Gist + +token = '' + +from octohub.connection import Connection + +conn = Connection(token) +uri = '/repos/turnkeylinux/tracker/issues' + +uri = '/gists/public' + +response = conn.send('GET', uri, params={}) +for gist in response.parsed: + print(gist) + From 27c1d60add5b50032288ecce2ae1502f2c282cad Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 19 Nov 2017 15:20:05 +0000 Subject: [PATCH 007/178] Support for gist.github.com --- YaraRules/blacklist.yar | 1 + inputs/gists.py | 135 ++++++++++++++++++++-------------------- settings.conf.sample | 6 +- 3 files changed, 75 insertions(+), 67 deletions(-) diff --git a/YaraRules/blacklist.yar b/YaraRules/blacklist.yar index c1cba30..c783711 100644 --- a/YaraRules/blacklist.yar +++ b/YaraRules/blacklist.yar @@ -8,6 +8,7 @@ rule blacklist strings: $a = "#EXTINF:" nocase // IPTV stream Lists. $b = "--app-name=LeagueClient" nocase // League of Legends Debug Log + $c = "common.application_name: LeagueClient" // League of Legends Debug Log condition: any of them diff --git a/inputs/gists.py b/inputs/gists.py index 947ad9a..5059840 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -1,66 +1,69 @@ -# ToDo: -''' -Just some notes for reference while i have them - -https://developer.github.com/v3/rate_limit/ -GET /rate_limit # Doesnt count against your rate limit - -{ - "resources": { - "core": { - "limit": 5000, - "remaining": 4999, - "reset": 1372700873 - }, - "search": { - "limit": 30, - "remaining": 18, - "reset": 1372697452 - } - }, - "rate": { - "limit": 5000, - "remaining": 4999, - "reset": 1372700873 - } -} - - -https://developer.github.com/v3/gists/#list-all-public-gists - - -GET /gists/public - -Github API only returns 1Mb of data per gist. look for "truncated" -You will need to get the raw_gist to get full file. 
If its over 10Mb you need to clone the gist. - -Each gist can hold multiple files - - -Get a single gist - -GET /gists/:id - -example code - - -# Create a token - https://github.com/settings/tokens Needs only Gist - -token = '' - -from octohub.connection import Connection - -conn = Connection(token) -uri = '/repos/turnkeylinux/tracker/issues' - -uri = '/gists/public' - -response = conn.send('GET', uri, params={}) -for gist in response.parsed: - print(gist) - - - - - -''' \ No newline at end of file +import requests +import math +import logging + +# Set some logging options +logging.basicConfig(level=logging.INFO) +logging.getLogger('requests').setLevel(logging.ERROR) + +api_uri = 'https://api.github.com/gists/public' +api_version = 'application/vnd.github.v3+json' # Set Accept header to force api v3 + + +def recent_pastes(conf, input_history): + oauth_token = conf['gists']['api_token'] + gist_limit = int(conf['gists']['api_limit']) + headers = {'user-agent': 'PasteHunter', + 'Accept': api_version, + 'Authorization': 'token {0}'.format(oauth_token)} + + # calculate number of pages + page_count = math.ceil(gist_limit / 100) + + result_pages = [] + history = [] + paste_list = [] + + try: + # Get the required amount of entries via pagination + for page_num in range(1, page_count + 1): + url = '{0}?page={1}&per_page=100'.format(api_uri, page_num) + logging.info("Fetching page: {0}".format(page_num)) + req = requests.get(url, headers=headers) + # Check some headers + logging.info("Remainig Limit: {0}".format(req.headers['X-RateLimit-Remaining'])) + logging.info("Limit Reset: {0}".format(req.headers['X-RateLimit-Reset'])) + + if req.status_code == 200: + result_pages.append(req.json()) + + if req.status_code == 401: + logging.error("Auth Failed") + + elif req.status_code == 403: + logging.error("Login Attempts Exceeded") + + # Parse results + + for page in result_pages: + for gist_meta in page: + # Track paste ids to prevent dupes + history.append(gist_meta['id']) + if gist_meta['id'] in input_history: + continue + + for file_name, file_meta in gist_meta['files'].items(): + gist_data = file_meta + gist_data['@timestamp'] = gist_meta['created_at'] + gist_data['pasteid'] = gist_meta['id'] + gist_data['pastesite'] = 'gist.github.com' + gist_data['scrape_url'] = file_meta['raw_url'] + # remove some origional keys just to keep it a bit cleaner + del gist_data['raw_url'] + paste_list.append(gist_data) + + # Return results and history + return paste_list, history + except Exception as e: + logging.error("Unable to parse paste results: {0}".format(e)) + return paste_list, history diff --git a/settings.conf.sample b/settings.conf.sample index c9158b3..a578616 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -1,5 +1,5 @@ [inputs] -inputs = pastebin,dumpz +inputs = pastebin,dumpz,gists [pastebin] api_scrape = https://pastebin.com/api_scraping.php @@ -12,6 +12,10 @@ api_scrape = https://dumpz.org/api/recent api_raw = https://dumpz.org/api/dump/ paste_limit = 200 +[gists] +api_token = +api_limit = 200 + [elastic_output] enabled = True elastic_index = paste-test From 2227e02c4db3e00eeab7cb46ea7c84d3fabe47ce Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 19 Nov 2017 15:42:46 +0000 Subject: [PATCH 008/178] Logging format --- inputs/gists.py | 7 +++++-- pastehunter.py | 3 ++- settings.conf.sample | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/inputs/gists.py b/inputs/gists.py index 5059840..c544e38 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -1,9 +1,9 @@ 
import requests
import math
import logging
+from datetime import datetime

# Set some logging options
-logging.basicConfig(level=logging.INFO)
logging.getLogger('requests').setLevel(logging.ERROR)

api_uri = 'https://api.github.com/gists/public'
api_version = 'application/vnd.github.v3+json'  # Set Accept header to force api v3
@@ -32,7 +32,10 @@ def recent_pastes(conf, input_history):
            req = requests.get(url, headers=headers)
            # Check some headers
            logging.info("Remainig Limit: {0}".format(req.headers['X-RateLimit-Remaining']))
-            logging.info("Limit Reset: {0}".format(req.headers['X-RateLimit-Reset']))
+            reset_date = datetime.utcfromtimestamp(float(req.headers['X-RateLimit-Reset'])).isoformat()
+
+            logging.info("Limit Reset: {0}".format(reset_date))

            if req.status_code == 200:
                result_pages.append(req.json())
diff --git a/pastehunter.py b/pastehunter.py
index c027230..cf05d13 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -17,8 +17,9 @@
lock = threading.Lock()

# Set some logging options
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(format='%(levelname)s:%(filename)s:%(message)s', level=logging.INFO)
logging.getLogger('requests').setLevel(logging.ERROR)
+logging.getLogger('elasticsearch').setLevel(logging.ERROR)

logging.info("Reading Configs")
# Parse the config file
conf = parse_config()
diff --git a/settings.conf.sample b/settings.conf.sample
index a578616..5f1d071 100644
--- a/settings.conf.sample
+++ b/settings.conf.sample
@@ -14,7 +14,7 @@ paste_limit = 200

[gists]
api_token =
-api_limit = 200
+api_limit = 100

From cac4b335f73ac7b3c50be31778c158f7e77ea4ea Mon Sep 17 00:00:00 2001
From: kevthehermit
Date: Sun, 19 Nov 2017 17:05:26 +0000
Subject: [PATCH 009/178] Dumpz.org text endpoint

---
 inputs/dumpz.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/inputs/dumpz.py b/inputs/dumpz.py
index b40b971..7b62d91 100644
--- a/inputs/dumpz.py
+++ b/inputs/dumpz.py
@@ -30,7 +30,11 @@ def recent_pastes(conf, input_history):
            paste_data = paste
            paste_data['pasteid'] = paste['id']
            paste_data['pastesite'] = 'dumpz.org'
-            paste_data['scrape_url'] = '{0}{1}'.format(conf['dumpz']['api_raw'], paste['id'])
+
+            #paste_data['scrape_url'] = '{0}{1}'.format(conf['dumpz']['api_raw'], paste['id'])
+
+            paste_data['scrape_url'] = 'https://dumpz.org/{0}/text/'.format(paste['id'])
+
            # Add a date field that kibana will map
            paste_data['@timestamp'] = paste_data['date']
            paste_list.append(paste_data)

From 8e4d0bd47aeb5746d1ddc7faab7e021ea4cafc9d Mon Sep 17 00:00:00 2001
From: kevthehermit
Date: Sun, 19 Nov 2017 17:22:23 +0000
Subject: [PATCH 010/178] Update The Readme

---
 README.md | 90 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 503ed25..a193773 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,58 @@
# PasteHunter
-Scan pastebin pastes with a collection of yara rules.
+PasteHunter is a python3 application that is designed to query a collection of sites that host publicly pasted data.
+For each paste it finds, it scans the raw contents against a series of yara rules looking for information that can be used
+by an organisation or a researcher.

-# PreReqs
+## Supported Sites
+PasteHunter currently has support for the following sites:
+ - pastebin.com
+ - dumpz.org
+ - gist.github.com
+
+Support for the following sites is listed as ToDo:
+ - paste.ee
+
+
+# Installation
+
+## PreReqs
+
+### Pastebin
+You need a Pro account on pastebin that has access to the scraping API.
+https://pastebin.com/api_scraping_faq

-* Yara
-* Python3
-* Elastic Search Kibana optional
+### GitHub
+GitHub needs an OAuth token to stop it hitting the free rate limit.
+Create one at https://github.com/settings/tokens
+
+YOU DO NOT NEED TO GIVE IT ANY ACCESS PERMISSIONS
+
+# Installation
+
+## Local install
+
+### Python / Deps
+Python 3
+```pip3 install -r requirements.txt```
+
+
+### Elastic Search
+https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
+
+### Kibana
+https://www.elastic.co/guide/en/kibana/current/deb.html
+
+### Yara
+https://yara.readthedocs.io/en/v3.6.0/gettingstarted.html#compiling-and-installing-yara
+
+Don't forget the python bindings
+```pip3 install yara-python```
+
+If you have yara errors check the installed version numbers for yara and yara-python match the latest versions.

-# Install.
+### PasteHunter
+git clone https://github.com/kevthehermit/pastehunter

## Using Docker

@@ -41,42 +83,18 @@ The mount point is `/usr/share/elasticsearch/data` by default

You can re-run the pastehunter script by doing `docker-compose up -d`
Docker-compose will use already running instances of Elasticsearch and Kibana

-## Local install
-
-### Elastic Search
-https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
-
-### Kibana
-https://www.elastic.co/guide/en/kibana/current/deb.html
-
-### Yara
-https://yara.readthedocs.io/en/v3.6.0/gettingstarted.html#compiling-and-installing-yara
-
-Don't forget the python bindings
-```pip install yara-python```
-
-If you have yara errors check the installed version numbers for yara and yara-python match the lastest versions.
-
-
-### This little app
-git clone https://github.com/kevthehermit/pastehunter

-## Configure
+# Configure
copy settings.conf.sample to settings.conf
populate the details.
For the scraping API you need to whitelist your IP on pastebin. No API key is required. See the link above

+# Running

-This needs python 3 as per the prereqs.
-You can run it on its own with ```python3 pastehunter.py```
+Start the application with ```python3 pastehunter.py```

-Or you can set a cronjob to run this script every two minutes with a pastelimit of 200
+It may be useful to run in a screen to keep it running in the background.
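+
+For example, to keep it running in a detached screen session (a minimal sketch, the session name is arbitrary):
+
+```screen -dmS pastehunter python3 pastehunter.py```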
-``` -localadmin@pastebin:~/pastehunter$ cat /etc/cron.d/pastehunter -# Run every 5 minutes -*/2 * * * * localadmin cd /home/localadmin/pastehunter && python3 pastehunter.py >> /home/localadmin/pastehunter/cronlog.txt -localadmin@pastebin:~/pastehunter$ -``` +## Service +Service config is coming From 5c38b9a021541553a54f9787b118658e2b9b3d02 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Mon, 20 Nov 2017 11:00:20 +0000 Subject: [PATCH 011/178] Spelling Mistake --- inputs/gists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/gists.py b/inputs/gists.py index c544e38..497e8d8 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -31,7 +31,7 @@ def recent_pastes(conf, input_history): logging.info("Fetching page: {0}".format(page_num)) req = requests.get(url, headers=headers) # Check some headers - logging.info("Remainig Limit: {0}".format(req.headers['X-RateLimit-Remaining'])) + logging.info("Remaining Limit: {0}".format(req.headers['X-RateLimit-Remaining'])) reset_date = datetime.utcfromtimestamp(float(req.headers['X-RateLimit-Reset'])).isoformat() From 9d9c1dde6a7a62b9d7c9d7150ae29b2a6c966448 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Wed, 22 Nov 2017 10:12:18 +0000 Subject: [PATCH 012/178] Gist Blacklisting --- inputs/gists.py | 16 ++++++++++++++++ settings.conf.sample | 2 ++ 2 files changed, 18 insertions(+) diff --git a/inputs/gists.py b/inputs/gists.py index 497e8d8..91a8785 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -9,6 +9,9 @@ api_uri = 'https://api.github.com/gists/public' api_version = 'application/vnd.github.v3+json' # Set Accept header to force api v3 +# Some people use gists to store large blobs of data every 17 minutes. This just slows down the kibana UI + + def recent_pastes(conf, input_history): oauth_token = conf['gists']['api_token'] @@ -24,6 +27,9 @@ def recent_pastes(conf, input_history): history = [] paste_list = [] + gist_file_blacklist = conf['gists']['file_blacklist'].split(',') + gist_user_blacklist = conf['gists']['user_blacklist'].split(',') + try: # Get the required amount of entries via pagination for page_num in range(1, page_count + 1): @@ -55,10 +61,20 @@ def recent_pastes(conf, input_history): if gist_meta['id'] in input_history: continue + if gist_meta['user'] in gist_user_blacklist: + logging.info("Blacklisting Gist from user: {0}".format(gist_meta['owner']['login'])) + continue + for file_name, file_meta in gist_meta['files'].items(): + + if file_name in gist_file_blacklist: + logging.info("Blacklisting Paste {0}".format(gist_meta['filename'])) + continue + gist_data = file_meta gist_data['@timestamp'] = gist_meta['created_at'] gist_data['pasteid'] = gist_meta['id'] + gist_data['user'] = gist_meta['user'] gist_data['pastesite'] = 'gist.github.com' gist_data['scrape_url'] = file_meta['raw_url'] # remove some origional keys just to keep it a bit cleaner diff --git a/settings.conf.sample b/settings.conf.sample index 5f1d071..e5224b4 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -15,6 +15,8 @@ paste_limit = 200 [gists] api_token = api_limit = 100 +user_blacklist = +file_blacklist = grahamcofborg-eval-package-list [elastic_output] enabled = True From a37dc20abd85b0b608d6cf14f65f4a7984a75c93 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Wed, 22 Nov 2017 13:20:19 +0000 Subject: [PATCH 013/178] SMTP Support --- outputs/smtp_output.py | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index e69de29..7c39c2a 
100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -0,0 +1,71 @@ +import smtplib +import email.encoders +import email.header +import email.mime.base +import email.mime.multipart +import email.mime.text +from email.mime.multipart import MIMEMultipart +import json +import logging + +from common import parse_config + +config = parse_config() + +class SMTPOutput(): + def __init__(self): + self.smtp_host = config['smtp_output']['smtp_host'] + self.smtp_port = int(config['smtp_output']['smtp_port']) + self.smtp_tls = config['smtp_output']['smtp_tls'] + self.smtp_user = config['smtp_output']['smtp_user'] + self.smtp_pass = config['smtp_output']['smtp_pass'] + self.recipient = config['smtp_output']['recipient'] + self.alert_list = config['smtp_output']['rule_list'].split(',') + + def store_paste(self, paste_data): + + alert_email = False + # Alert on All + if 'all' in self.alert_list: + alert_email = True + + # Alert on specific rules e.g. custom_keywords + if 'all' not in self.alert_list: + for yara in paste_data['YaraRule']: + if yara in self.alert_list: + alert_email = True + + # To Alert or not to Alert + if not alert_email: + return + + msg = MIMEMultipart() + msg['Subject'] = 'PasteHunter Alert {0}'.format(paste_data['YaraRule']) + msg['From'] = self.smtp_user + msg['To'] = self.recipient + + body = 'This is the body of the email' + json_body = json.dumps(paste_data) + # Attach the body + msg.attach(email.mime.text.MIMEText(body, 'plain')) + + # Attach the raw paste + + json_att = email.mime.base.MIMEBase('application', 'json') + json_att.set_payload(json_body) + email.encoders.encode_base64(json_att) + json_att.add_header('Content-Disposition', 'attachment; filename="Alert.json"') + msg.attach(json_att) + + # Connect and send + + smtp_conn = smtplib.SMTP(self.smtp_host, self.smtp_port) + + smtp_conn.ehlo() + if self.smtp_tls: + smtp_conn.starttls() + + smtp_conn.login(self.smtp_user, self.smtp_pass) + logging.info("Email Sent") + smtp_conn.send_message(msg) + smtp_conn.quit() From 07bb3dacf954182431a4698f59e3b5b2ea2cd5e9 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Wed, 22 Nov 2017 13:20:38 +0000 Subject: [PATCH 014/178] Wrong Key in Gists --- inputs/gists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/gists.py b/inputs/gists.py index 91a8785..d7e9b82 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -68,7 +68,7 @@ def recent_pastes(conf, input_history): for file_name, file_meta in gist_meta['files'].items(): if file_name in gist_file_blacklist: - logging.info("Blacklisting Paste {0}".format(gist_meta['filename'])) + logging.info("Blacklisting Paste {0}".format(file_name)) continue gist_data = file_meta From 7bb652fafa658fb4bdb4394a7afa2eadbf0ba460 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Wed, 22 Nov 2017 13:20:51 +0000 Subject: [PATCH 015/178] Change Debug level --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index cf05d13..d97f41c 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -77,7 +77,7 @@ def paste_scanner(): # Store the Paste while True: paste_data = q.get() - logging.info("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + logging.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them raw_paste_uri = paste_data['scrape_url'] raw_paste_data = requests.get(raw_paste_uri).text From bef28e31ccbb831401876b93adafaccd14fb6dec Mon Sep 17 00:00:00 2001 From: 
kevthehermit Date: Wed, 22 Nov 2017 13:22:54 +0000 Subject: [PATCH 016/178] Update sample config for SMTP --- settings.conf.sample | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/settings.conf.sample b/settings.conf.sample index e5224b4..00e22cf 100644 --- a/settings.conf.sample +++ b/settings.conf.sample @@ -42,13 +42,19 @@ enabled = False host = 192.168.1.1 port = 514 +# +# Recipient is the email address that will receive the alerts +# Rule list is a comma separated list of rules that will generate an alert. +# Alternatively use all [smtp_output] enabled = False -server = -port = -tls = True -username = -password = +smtp_host = +smtp_port = +smtp_tls = +smtp_user = +smtp_pass = +recipient = pastehunter@techanarchy.net +rule_list = custom_keywords [yara] rule_path = YaraRules From 068f1555edce004923a264fa0a11bd58dcc2dd6d Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Tue, 28 Nov 2017 11:54:18 +0000 Subject: [PATCH 017/178] Change config file to JSON --- .gitignore | 1 + common.py | 33 ++++-------- inputs/dumpz.py | 4 +- inputs/gists.py | 19 ++++--- inputs/pastebin.py | 4 +- outputs/csv_output.py | 2 +- outputs/elastic_output.py | 12 ++--- outputs/json_output.py | 4 +- outputs/smtp_output.py | 14 ++--- outputs/syslog_output.py | 4 +- pastehunter.py | 71 +++++++++++--------------- postprocess/__init__.py | 0 postprocess/postprocess.py | 14 ++--- settings.json.sample | 101 +++++++++++++++++++++++++++++++++++++ 14 files changed, 177 insertions(+), 106 deletions(-) create mode 100644 postprocess/__init__.py create mode 100644 settings.json.sample diff --git a/.gitignore b/.gitignore index f18d0b0..3bec5bf 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,4 @@ ENV/ /settings.conf /YaraRules/custom_keywords.yar /paste_history.tmp +/settings.json diff --git a/common.py b/common.py index 69e92b6..81f58b7 100644 --- a/common.py +++ b/common.py @@ -1,26 +1,15 @@ -import configparser +import json +import logging + # Parse the config file in to a dict def parse_config(): - config_dict = {} - config = configparser.ConfigParser(allow_no_value=True) - - conf_file = 'settings.conf' + conf_file = 'settings.json' + conf = None + try: + with open(conf_file, 'r') as read_conf: + conf = json.load(read_conf) + except Exception as e: + logging.error("Unable to parse config file: {0}".format(e)) - valid = config.read(conf_file) - if len(valid) > 0: - config_dict['valid'] = True - for section in config.sections(): - section_dict = {} - for key, value in config.items(section): - if value.lower() == 'true': - new_val = True - elif value.lower() == 'false': - new_val = False - else: - new_val = value - section_dict[key] = new_val - config_dict[section] = section_dict - else: - config_dict['valid'] = False - return config_dict \ No newline at end of file + return conf diff --git a/inputs/dumpz.py b/inputs/dumpz.py index 7b62d91..ebcbfa2 100644 --- a/inputs/dumpz.py +++ b/inputs/dumpz.py @@ -4,8 +4,8 @@ def recent_pastes(conf, input_history): # populate vars from config - paste_limit = conf['dumpz']['paste_limit'] - api_scrape = conf['dumpz']['api_scrape'] + paste_limit = conf['inputs']['dumpz']['paste_limit'] + api_scrape = conf['inputs']['dumpz']['api_scrape'] history = [] paste_list = [] try: diff --git a/inputs/gists.py b/inputs/gists.py index d7e9b82..c56edb4 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -14,34 +14,33 @@ def recent_pastes(conf, input_history): - oauth_token = conf['gists']['api_token'] - gist_limit = int(conf['gists']['api_limit']) + oauth_token 
= conf['inputs']['gists']['api_token'] + gist_limit = conf['inputs']['gists']['api_limit'] headers = {'user-agent': 'PasteHunter', 'Accept': api_version, 'Authorization': 'token {0}'.format(oauth_token)} # calculate number of pages - page_count = math.ceil(gist_limit / 100) + page_count = int(math.ceil(gist_limit / 100)) result_pages = [] history = [] paste_list = [] - gist_file_blacklist = conf['gists']['file_blacklist'].split(',') - gist_user_blacklist = conf['gists']['user_blacklist'].split(',') + gist_file_blacklist = conf['inputs']['gists']['file_blacklist'] + gist_user_blacklist = conf['inputs']['gists']['user_blacklist'] try: # Get the required amount of entries via pagination for page_num in range(1, page_count + 1): url = '{0}?page={1}&per_page=100'.format(api_uri, page_num) - logging.info("Fetching page: {0}".format(page_num)) + logging.debug("Fetching page: {0}".format(page_num)) req = requests.get(url, headers=headers) # Check some headers - logging.info("Remaining Limit: {0}".format(req.headers['X-RateLimit-Remaining'])) - reset_date = datetime.utcfromtimestamp(float(req.headers['X-RateLimit-Reset'])).isoformat() - - logging.info("Limit Reset: {0}".format(reset_date)) + # logging.info("Limit Reset: {0}".format(reset_date)) + logging.info("Remaining Limit: {0}. Resets at {1}".format(req.headers['X-RateLimit-Remaining'], + reset_date)) if req.status_code == 200: result_pages.append(req.json()) diff --git a/inputs/pastebin.py b/inputs/pastebin.py index 6703b5a..2fe66bd 100644 --- a/inputs/pastebin.py +++ b/inputs/pastebin.py @@ -5,8 +5,8 @@ def recent_pastes(conf, input_history): # populate vars from config - paste_limit = conf['pastebin']['paste_limit'] - api_scrape = conf['pastebin']['api_scrape'] + paste_limit = conf['inputs']['pastebin']['paste_limit'] + api_scrape = conf['inputs']['pastebin']['api_scrape'] history = [] paste_list = [] try: diff --git a/outputs/csv_output.py b/outputs/csv_output.py index 92d2248..d79cf67 100644 --- a/outputs/csv_output.py +++ b/outputs/csv_output.py @@ -7,7 +7,7 @@ class CSVOutput(): def __init__(self): - base_path = config['csv_output']['csv_path'] + base_path = config['outputs']['csv_output']['output_path'] # Get todays CSV dtg = datetime.date.today().strftime("%Y-%m-%d") csv_name = 'pastes_{0}.csv'.format(dtg) diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index cab1cf8..d29e534 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -8,12 +8,12 @@ class ElasticOutput(): def __init__(self): # Set up the database connection - es_host = config['elastic_output']['elastic_host'] - es_port = config['elastic_output']['elastic_port'] - es_user = config['elastic_output']['elastic_user'] - es_pass = config['elastic_output']['elastic_pass'] - self.es_index = config['elastic_output']['elastic_index'] - es_ssl = config['elastic_output']['elastic_ssl'] + es_host = config['outputs']['elastic_output']['elastic_host'] + es_port = config['outputs']['elastic_output']['elastic_port'] + es_user = config['outputs']['elastic_output']['elastic_user'] + es_pass = config['outputs']['elastic_output']['elastic_pass'] + self.es_index = config['outputs']['elastic_output']['elastic_index'] + es_ssl = config['outputs']['elastic_output']['elastic_ssl'] self.test = False try: self.es = Elasticsearch(es_host, port=es_port, http_auth=(es_user, es_pass), use_ssl=es_ssl) diff --git a/outputs/json_output.py b/outputs/json_output.py index 0a343af..490059b 100644 --- a/outputs/json_output.py +++ b/outputs/json_output.py @@ -7,7 +7,7 @@ class 
JsonOutput(): def __init__(self): - base_path = config['json_output']['json_path'] + base_path = config['outputs']['json_output']['output_path'] self.json_path = base_path if not os.path.exists(base_path): try: @@ -20,7 +20,7 @@ def __init__(self): self.test = True def store_paste(self, paste_data): - if not config['json_output']['store_raw']: + if not config['outputs']['json_output']['store_raw']: del paste_data['raw_paste'] if self.test: diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 7c39c2a..2bae275 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -14,13 +14,13 @@ class SMTPOutput(): def __init__(self): - self.smtp_host = config['smtp_output']['smtp_host'] - self.smtp_port = int(config['smtp_output']['smtp_port']) - self.smtp_tls = config['smtp_output']['smtp_tls'] - self.smtp_user = config['smtp_output']['smtp_user'] - self.smtp_pass = config['smtp_output']['smtp_pass'] - self.recipient = config['smtp_output']['recipient'] - self.alert_list = config['smtp_output']['rule_list'].split(',') + self.smtp_host = config['outputs']['smtp_output']['smtp_host'] + self.smtp_port = config['outputs']['smtp_output']['smtp_port'] + self.smtp_tls = config['outputs']['smtp_output']['smtp_tls'] + self.smtp_user = config['outputs']['smtp_output']['smtp_user'] + self.smtp_pass = config['outputs']['smtp_output']['smtp_pass'] + self.recipient = config['outputs']['smtp_output']['recipient'] + self.alert_list = config['outputs']['smtp_output']['rule_list'] def store_paste(self, paste_data): diff --git a/outputs/syslog_output.py b/outputs/syslog_output.py index d2dc777..6618eba 100644 --- a/outputs/syslog_output.py +++ b/outputs/syslog_output.py @@ -6,8 +6,8 @@ class SyslogOutput(): def store_paste(self, paste_data): - host = config['syslog_output']['host'] - port = int(config['syslog_output']['port']) + host = config['outputs']['syslog_output']['host'] + port = config['outputs']['syslog_output']['port'] syslog_line = '"{0}" "{1}" "{2}" "{3}" "{4}"'.format(paste_data['@timestamp'], paste_data['pasteid'], diff --git a/pastehunter.py b/pastehunter.py index d97f41c..e4e9a7b 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -6,13 +6,13 @@ import json import hashlib import requests -from time import sleep -from common import parse_config -from outputs import elastic_output, json_output, csv_output, syslog_output, smtp_output -from queue import Queue import threading import importlib import logging +from time import sleep +from queue import Queue +from common import parse_config +from postprocess import postprocess lock = threading.Lock() @@ -25,40 +25,23 @@ # Parse the config file conf = parse_config() -# populate vars from config -api_raw = conf['pastebin']['api_raw'] -rule_path = conf['yara']['rule_path'] -blacklist = conf['yara']['blacklist'] -store_all = conf['pastebin']['store_all'] -input_list = conf['inputs']['inputs'] +logging.info("Configure Inputs") +input_list = [] +for input_type, input_values in conf["inputs"].items(): + if input_values["enabled"]: + input_list.append(input_values["module"]) + logging.info("Enabled Input: {0}".format(input_type)) + logging.info("Configure Outputs") -# configure outputs outputs = [] -if conf['elastic_output']['enabled']: - es = elastic_output.ElasticOutput() - outputs.append(es) - logging.info("Elastic Output Enabled") - -if conf['json_output']['enabled']: - js = json_output.JsonOutput() - outputs.append(js) - logging.info("Json Output Enabled") - -if conf['csv_output']['enabled']: - csv = csv_output.CSVOutput() - 
outputs.append(csv) - logging.info("CSV Output Enabled") - -if conf['syslog_output']['enabled']: - syslog = syslog_output.SyslogOutput() - outputs.append(syslog) - logging.info("Syslog Output Enabled") - -if conf['smtp_output']['enabled']: - smtp = smtp_output.SMTPOutput() - outputs.append(smtp) - logging.info("SMTP Output Enabled") +for output_type, output_values in conf["outputs"].items(): + if output_values["enabled"]: + logging.info("Enabled Output: {0}".format(output_type)) + _module = importlib.import_module(output_values["module"]) + _class = getattr(_module, output_values["classname"]) + instance = _class() + outputs.append(instance) def yara_index(rule_path): @@ -110,14 +93,19 @@ def paste_scanner(): # Blacklist Check # If any of the blacklist rules appear then empty the result set - if blacklist and 'blacklist' in results: + if conf['yara']['blacklist'] and 'blacklist' in results: results = [] logging.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + # Post Process + new_output = postprocess.run(results, paste_data) + # If we have a result add some meta data and send to storage # If results is empty, ie no match, and store_all is True, # then append "no_match" to results. This will then force output. + #ToDo: Need to make this check for each output not universal + store_all = conf['inputs']['pastebin']['store_all'] if store_all is True: if len(results) == 0: results.append('no_match') @@ -142,9 +130,9 @@ def paste_scanner(): logging.info("Compile Yara Rules") try: # Update the yara rules index - yara_index(rule_path) + yara_index(conf['yara']['rule_path']) # Compile the yara rules we will use to match pastes - index_file = os.path.join(rule_path, 'index.yar') + index_file = os.path.join(conf['yara']['rule_path'], 'index.yar') rules = yara.compile(index_file) except Exception as e: print("Unable to Create Yara index: ", e) @@ -170,20 +158,21 @@ def paste_scanner(): else: paste_history = {} - for input_name in input_list.split(','): + for input_name in input_list: if input_name in paste_history: input_history = paste_history[input_name] else: input_history = [] - import_name = 'inputs.{0}'.format(input_name) - i = importlib.import_module(import_name) + i = importlib.import_module(input_name) # Get list of recent pastes + logging.info("Fetching paste list from {0}".format(input_name)) paste_list, history = i.recent_pastes(conf, input_history) for paste in paste_list: q.put(paste) paste_history[input_name] = history + logging.debug("Writing History") # Write History with open('paste_history.tmp', 'w') as outfile: json.dump(paste_history, outfile) diff --git a/postprocess/__init__.py b/postprocess/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/postprocess/postprocess.py b/postprocess/postprocess.py index f2dea09..c6ac48a 100644 --- a/postprocess/postprocess.py +++ b/postprocess/postprocess.py @@ -1,14 +1,6 @@ # This gets the raw paste, yara rules and the paste id -# Post runs after paste is stored. 
# We call in post process modules -# If we are in elastic search we update the document - -class PostProcess: - def __init__(self): - self.raw_paste = '' - self.yararules = [] - self.pasteid = '' - - def run(self): - pass +def run(results, paste_object): + # Use the rule name to determine what post to do + pass diff --git a/settings.json.sample b/settings.json.sample new file mode 100644 index 0000000..2d2c341 --- /dev/null +++ b/settings.json.sample @@ -0,0 +1,101 @@ +{ + "inputs": { + "pastebin":{ + "enabled": true, + "module": "inputs.pastebin", + "api_scrape": "https://pastebin.com/api_scraping.php", + "api_raw": "https://pastebin.com/api_scrape_item.php?i=", + "paste_limit": 200, + "store_all": false + }, + "dumpz": { + "enabled": true, + "module": "inputs.dumpz", + "api_scrape": "https://dumpz.org/api/recent", + "api_raw": "https://dumpz.org/api/dump", + "paste_limit": 200, + "store_all": false + }, + "gists": { + "enabled": true, + "module": "inputs.gists", + "api_token": "", + "api_limit": 100, + "store_all": false, + "user_blacklist": [], + "file_blacklist": ["grahamcofborg-eval-package-list"] + } + }, + "outputs": { + "elastic_output": { + "enabled": true, + "module": "outputs.elastic_output", + "classname": "ElasticOutput", + "elastic_index": "paste-test", + "elastic_host": "192.168.1.22", + "elastic_port": 9200, + "elastic_user": "elastic", + "elastic_pass": "changeme", + "elastic_ssl": false + }, + "json_output": { + "enabled": true, + "module": "outputs.json_output", + "classname": "JsonOutput", + "output_path": "logs/json/", + "store_raw": true, + "encode_raw": true + }, + "csv_output": { + "enabled": false, + "module": "outputs.csv_output", + "classname": "CSVOutput", + "output_path": "/logs/csv/" + }, + "syslog_output": { + "enabled": false, + "module": "outputs.syslog_output", + "classname": "SyslogOutput", + "host": "192.168.1.1", + "port": 514 + }, + "smtp_output": { + "enabled": false, + "module": "outputs.smtp_output", + "classname": "SMTPOutput", + "smtp_host": "smtp.server.com", + "smtp_port": 25, + "smtp_tls": true, + "smtp_user": "smtpusername", + "smtp_pass": "smtppassword", + "recipient": "emailaddress that gets the alerts", + "rule_list": ["custom_keywords"] + } + }, + "yara": { + "rule_path": "YaraRules", + "blacklist": true + }, + "post_process": { + "post_email": { + "enabled": false, + "module": "postprocess.post_email", + "rule_list": ["email_list"] + }, + "post_b64": { + "enabled": false, + "module": "postprocess.post_b64", + "rule_list": ["b64_exe", "b64_rar", "b64_zip", "b64_gzip"], + "cuckoo": { + "enabled": false, + "api_host": "127.0.0.1", + "api_port": 8080 + }, + "viper": { + "enabled": false, + "api_host": "127.0.0.1", + "api_port": 8080 + } + } + } +} From 66bbfab3469eb3bc24ed8e13d5595a9c1ace637b Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Tue, 28 Nov 2017 11:55:20 +0000 Subject: [PATCH 018/178] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3bec5bf..d1dc644 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,4 @@ ENV/ /YaraRules/custom_keywords.yar /paste_history.tmp /settings.json +/.idea \ No newline at end of file From e246e5bd3d898b1f19a8d83106e4adef6ac2c7c3 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Wed, 29 Nov 2017 12:26:21 +0000 Subject: [PATCH 019/178] DB Create User rules --- YaraRules/database.yar | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/YaraRules/database.yar b/YaraRules/database.yar index 4febba6..e7d4e0b 100644 --- 
a/YaraRules/database.yar +++ b/YaraRules/database.yar @@ -37,4 +37,21 @@ rule db_structure condition: 5 of them +} + +rule db_create_user +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = "GRANT ALL PRIVILEGES" nocase + $b = "IDENTIFIED BY" nocase + $c = "GRANT SELECT" nocase + $d = "CREATE USER" nocase + + condition: + 2 of them } \ No newline at end of file From ee2e0f5f97c2806684df91395299790ed4c43606 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 30 Nov 2017 20:16:18 +0000 Subject: [PATCH 020/178] Tweak Readme --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a193773..7061981 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,6 @@ YOU DO NOT NEED TO GIVE IT ANY ACCESS PERMISSIONS ## Local install -### Python / Deps -Python 3 -```pip3 install -r requirements.txt``` - - ### Elastic Search https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html @@ -54,6 +49,10 @@ If you have yara errors check the installed version numbers for yara and yara-py ### PasteHunter git clone https://github.com/kevthehermit/pastehunter +### Python / Deps +Python 3 +```pip3 install -r requirements.txt``` + ## Using Docker Install Docker & docker-compose @@ -90,6 +89,8 @@ copy settings.conf.sample to settings.conf populate the details. For the scraping API you need to whitelist your IP on pastebin. No API key is required. See the link above + + # Running Start the application with ```python3 pastehunter.py``` From 80365c310d3a072fa1a311bdb05cc3d4f94ee08e Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 30 Nov 2017 20:17:05 +0000 Subject: [PATCH 021/178] Update Requirements.txt --- README.md | 3 --- requirements.txt | 1 + settings.conf.sample | 61 -------------------------------------------- 3 files changed, 1 insertion(+), 64 deletions(-) delete mode 100644 settings.conf.sample diff --git a/README.md b/README.md index 7061981..a3bd1e7 100644 --- a/README.md +++ b/README.md @@ -41,9 +41,6 @@ https://www.elastic.co/guide/en/kibana/current/deb.html ### Yara https://yara.readthedocs.io/en/v3.6.0/gettingstarted.html#compiling-and-installing-yara -Don't forget the python bindings -```pip3 install yara-python``` - If you have yara errors check the installed version numbers for yara and yara-python match the lastest versions. 
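A quick way to verify the yara / yara-python pairing mentioned above is to import the bindings and print the libyara version they were built against; this assumes recent yara-python releases, which expose YARA_VERSION:

```
# Sanity check for the yara-python bindings; YARA_VERSION is the
# libyara version string the bindings were compiled against.
try:
    import yara
    print("yara-python OK, libyara {0}".format(yara.YARA_VERSION))
except ImportError:
    print("yara-python missing: pip3 install yara-python")
```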
### PasteHunter diff --git a/requirements.txt b/requirements.txt index 4edf322..7e0d85f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests==2.18.4 elasticsearch>=5.0.0,<6.0.0 +yara-python \ No newline at end of file diff --git a/settings.conf.sample b/settings.conf.sample deleted file mode 100644 index 00e22cf..0000000 --- a/settings.conf.sample +++ /dev/null @@ -1,61 +0,0 @@ -[inputs] -inputs = pastebin,dumpz,gists - -[pastebin] -api_scrape = https://pastebin.com/api_scraping.php -api_raw = https://pastebin.com/api_scrape_item.php?i= -paste_limit = 200 -store_all = false - -[dumpz] -api_scrape = https://dumpz.org/api/recent -api_raw = https://dumpz.org/api/dump/ -paste_limit = 200 - -[gists] -api_token = -api_limit = 100 -user_blacklist = -file_blacklist = grahamcofborg-eval-package-list - -[elastic_output] -enabled = True -elastic_index = paste-test -elastic_host = 192.168.1.1 -elastic_port = 9200 -elastic_user = elastic -elastic_pass = changeme -elastic_ssl = False - -[json_output] -enabled = True -json_path = logs/json/ -store_raw = True -encode_raw = True - -[csv_output] -enabled = True -csv_path = logs/csv/ - -[syslog_output] -enabled = False -host = 192.168.1.1 -port = 514 - -# -# Recipient is the email address that will receive the alerts -# Rule list is a comma separated list of rules that will generate an alert. -# Alternatively use all -[smtp_output] -enabled = False -smtp_host = -smtp_port = -smtp_tls = -smtp_user = -smtp_pass = -recipient = pastehunter@techanarchy.net -rule_list = custom_keywords - -[yara] -rule_path = YaraRules -blacklist = True \ No newline at end of file From f5a6785a923e1dc5e06f1c830810ccc2a1abee31 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 30 Nov 2017 22:33:02 +0000 Subject: [PATCH 022/178] Post Process Modules --- pastehunter.py | 19 ++++++++- postprocess/post_b64.py | 83 ++++++++++++++++++++++++++++++++++++++ postprocess/post_email.py | 29 +++++++++++++ postprocess/postprocess.py | 6 --- 4 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 postprocess/post_b64.py create mode 100644 postprocess/post_email.py delete mode 100644 postprocess/postprocess.py diff --git a/pastehunter.py b/pastehunter.py index e4e9a7b..9d3473f 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -12,7 +12,7 @@ from time import sleep from queue import Queue from common import parse_config -from postprocess import postprocess +from postprocess import post_email lock = threading.Lock() @@ -98,7 +98,22 @@ def paste_scanner(): logging.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # Post Process - new_output = postprocess.run(results, paste_data) + + # If post module is enabled and the paste has a matching rule. + post_results = paste_data + for post_process, post_values in conf["post_process"].items(): + if post_values["enabled"]: + if any(i in results for i in post_values["rule_list"]): + logging.info("Running Post Module on {0}".format(paste_data["pasteid"])) + post_module = importlib.import_module(post_values["module"]) + post_results = post_module.run(results, + raw_paste_data, + paste_data + ) + + # Throw everything back to paste_data for ease. 
+ paste_data = post_results + # If we have a result add some meta data and send to storage # If results is empty, ie no match, and store_all is True, diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py new file mode 100644 index 0000000..69bcab7 --- /dev/null +++ b/postprocess/post_b64.py @@ -0,0 +1,83 @@ +import io +import re +import hashlib +import gzip +import logging +import requests +from base64 import b64decode +# This gets the raw paste and the paste_data json object +from common import parse_config +conf = parse_config() + + + +def run(results, raw_paste_data, paste_object): + # Figure out which b64 rule fire + + b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + b64_strings = re.findall(b64_re, raw_paste_data) + + # Set a counter for multiple streams. + counter = 0 + for b64_str in b64_strings: + for rule in results: + if rule == 'b64_gzip': + # Lets try to decode and get a file listing. + # Also get the MD5 of the decoded file + try: + uncompressed = gzip.decompress(b64decode(b64_str)) + paste_object["decompressed_stream"] = uncompressed + except Exception as e: + logging.error("Unable to decompress gzip stream") + if rule == 'b64_exe': + try: + raw_exe = b64decode(b64_str) + paste_object["exe_size"] = len(raw_exe) + paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() + paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() + paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) + + # Cuckoo + if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: + try: + task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) + paste_object["Cuckoo Task ID"] = task_id + logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) + except Exception as e: + logging.error("Unabled to submit sample to cuckoo") + + # Viper + if conf["post_process"]["post_b64"]["viper"]["enabled"]: + send_to_cuckoo(raw_exe, paste_object["pasteid"]) + + except Exception as e: + logging.error("Unable to decode exe file") + + # Increase the counter + counter += 1 + + # Get unique domain count + # Update the json + + # Send the updated json back + return paste_object + + +def send_to_cuckoo(raw_exe, pasteid): + cuckoo_ip = conf["post_process"]["post_b64"]["cuckoo"]["api_host"] + cuckoo_port = conf["post_process"]["post_b64"]["cuckoo"]["api_port"] + cuckoo_host = 'http://{0}:{1}'.format(cuckoo_ip, cuckoo_port) + submit_file_url = '{0}/tasks/create/file'.format(cuckoo_host) + files = {'file': ('{0}.exe'.format(pasteid), io.BytesIO(raw_exe))} + submit_file = requests.post(submit_file_url, files=files).json() + task_id = None + try: + task_id = submit_file['task_id'] + except KeyError: + try: + task_id = submit_file['task_ids'][0] + except KeyError: + logging.error(submit_file) + + return task_id + diff --git a/postprocess/post_email.py b/postprocess/post_email.py new file mode 100644 index 0000000..53bbc79 --- /dev/null +++ b/postprocess/post_email.py @@ -0,0 +1,29 @@ +import re + + +def run(results, raw_paste_data, paste_object): + # Use the rule name to determine what postprocess to do + + # Get total unique emails. 
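Stripped of the pipeline plumbing, the counting in this post processor amounts to one findall() plus two set reductions. A self-contained rendering, with made-up input; the regex is the same shape as the module's:

```
import re

# Distillation of postprocess/post_email.py; sample input is made up.
def count_emails(raw_paste_data):
    all_emails = re.findall(r'[\w.-]+@[\w.-]+\.\w+', raw_paste_data)
    domains = [addr.split('@')[-1] for addr in all_emails]
    return {
        'total_emails': len(all_emails),
        'unique_emails': len(set(all_emails)),
        'unique_domains': len(set(domains)),
    }

print(count_emails('a@example.com b@example.com c@test.org'))
# {'total_emails': 3, 'unique_emails': 3, 'unique_domains': 2}
```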
+ + all_emails = re.findall('[\w\.-]+@[\w\.-]+\.\w+', raw_paste_data) + domain_list = [] + for email_address in all_emails: + email_domain = email_address.split("@") + domain_list.append(email_domain[-1]) + + unique_emails = set(all_emails) + unique_domains = set(domain_list) + # We can filter some of the false positives from the yara match here + + if len(unique_emails) < 10: + paste_object["results"] = [] + + # Get unique domain count + # Update the json + paste_object["total_emails"] = len(all_emails) + paste_object["unique_emails"] = len(unique_emails) + paste_object["unique_domains"] = len(unique_domains) + + # Send the updated json back + return paste_object diff --git a/postprocess/postprocess.py b/postprocess/postprocess.py deleted file mode 100644 index c6ac48a..0000000 --- a/postprocess/postprocess.py +++ /dev/null @@ -1,6 +0,0 @@ -# This gets the raw paste, yara rules and the paste id -# We call in post process modules - -def run(results, paste_object): - # Use the rule name to determine what post to do - pass From 8709d8fb6a2886dff2cec120f81d494a5f5136ca Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Thu, 30 Nov 2017 23:25:00 +0000 Subject: [PATCH 023/178] Base64_exe to cuckoo --- .gitignore | 3 +- README.md | 11 +++++-- postprocess/post_b64.py | 67 ++++++++++++++++++++++------------------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index d1dc644..6d8e73b 100644 --- a/.gitignore +++ b/.gitignore @@ -103,4 +103,5 @@ ENV/ /YaraRules/custom_keywords.yar /paste_history.tmp /settings.json -/.idea \ No newline at end of file +/.idea +/postprocess/tester.py diff --git a/README.md b/README.md index a3bd1e7..c375fd4 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ PasteHunter is a python3 application that is designed to query a collection of s For all the pasts it finds it scans the raw contents against a series of yara rules looking for information that can be used by an org or a researcher. -## Supported Sites +## Supported Inputs Pastehunter currently has support for the following sites: - pastebin.com - dumpz.org @@ -12,8 +12,15 @@ Pastehunter currently has support for the following sites: Support for the following sites is listed as ToDo: - paste.ee +## Supported Outputs + +## PostProcess Modules +Pastehunter comes with a couple of post process modules that extact useful data from pastes or pass them to other services +The following are default modules: + + - Emails + - -# Installation ## PreReqs diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index 69bcab7..acad122 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -10,48 +10,54 @@ conf = parse_config() - def run(results, raw_paste_data, paste_object): # Figure out which b64 rule fire - b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + # The base64 re can hang on occasion with this one + # b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + + # This one has a few empty results i need to catch but doesn't kill pastehunter + b64_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' b64_strings = re.findall(b64_re, raw_paste_data) + # Set a counter for multiple streams. counter = 0 for b64_str in b64_strings: for rule in results: - if rule == 'b64_gzip': - # Lets try to decode and get a file listing. 
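The Cuckoo hand-off in post_b64.py above reduces to a single multipart POST. A compact sketch, assuming a Cuckoo REST API is listening on api_host:api_port (the defaults below are illustrative); the task_id / task_ids fallback covers both API generations, as in the original:

```
import io
import logging
import requests

# Compact version of send_to_cuckoo(); a Cuckoo REST API must actually
# be reachable at api_host:api_port for this to return a task id.
def submit_to_cuckoo(raw_exe, pasteid, api_host='127.0.0.1', api_port=8080):
    url = 'http://{0}:{1}/tasks/create/file'.format(api_host, api_port)
    files = {'file': ('{0}.exe'.format(pasteid), io.BytesIO(raw_exe))}
    reply = requests.post(url, files=files).json()
    try:
        return reply['task_id']
    except KeyError:
        try:
            return reply['task_ids'][0]
        except KeyError:
            logging.error(reply)
            return None
```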
- # Also get the MD5 of the decoded file - try: - uncompressed = gzip.decompress(b64decode(b64_str)) - paste_object["decompressed_stream"] = uncompressed - except Exception as e: - logging.error("Unable to decompress gzip stream") - if rule == 'b64_exe': - try: - raw_exe = b64decode(b64_str) - paste_object["exe_size"] = len(raw_exe) - paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() - paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() - paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) + if len(b64_str) > 0: + if rule == 'b64_gzip': + # Lets try to decode and get a file listing. + # Also get the MD5 of the decoded file + try: + uncompressed = gzip.decompress(b64decode(b64_str)) + paste_object["decompressed_stream"] = uncompressed + except Exception as e: + logging.error("Unable to decompress gzip stream") + if rule == 'b64_exe': + try: + raw_exe = b64decode(b64_str) + paste_object["exe_size"] = len(raw_exe) + paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() + paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() + paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) - # Cuckoo - if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: - try: - task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) - paste_object["Cuckoo Task ID"] = task_id - logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) - except Exception as e: - logging.error("Unabled to submit sample to cuckoo") + # Cuckoo + if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: + logging.info("Submitting to Cuckoo") + try: + task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) + paste_object["Cuckoo Task ID"] = task_id + logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) + except Exception as e: + logging.error("Unabled to submit sample to cuckoo") - # Viper - if conf["post_process"]["post_b64"]["viper"]["enabled"]: - send_to_cuckoo(raw_exe, paste_object["pasteid"]) + # Viper + if conf["post_process"]["post_b64"]["viper"]["enabled"]: + send_to_cuckoo(raw_exe, paste_object["pasteid"]) - except Exception as e: - logging.error("Unable to decode exe file") + except Exception as e: + logging.error("Unable to decode exe file") # Increase the counter counter += 1 @@ -80,4 +86,3 @@ def send_to_cuckoo(raw_exe, pasteid): logging.error(submit_file) return task_id - From a627293ad996b4ce24827e9b545452c73453be1f Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 1 Dec 2017 11:50:53 +0000 Subject: [PATCH 024/178] issues with storing gzip decompressed streams that are not ascii text --- postprocess/post_b64.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index 69bcab7..ddd3564 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -26,7 +26,11 @@ def run(results, raw_paste_data, paste_object): # Also get the MD5 of the decoded file try: uncompressed = gzip.decompress(b64decode(b64_str)) - paste_object["decompressed_stream"] = uncompressed + try: + encoded_paste_data = uncompressed.encode('utf-8') + paste_object["decompressed_stream"] = encoded_paste_data + except: + logging.error("Unable to store data") except Exception as e: logging.error("Unable to decompress gzip stream") if rule == 'b64_exe': From 405c30a0387c7e43a103f8b2594624b2a9800f19 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 1 Dec 2017 14:29:06 +0000 Subject: [PATCH 025/178] Improve B64 rule and post 
processor --- YaraRules/base64.yar | 3 +- postprocess/post_b64.py | 73 ++++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 2d00a68..15a03d3 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -10,7 +10,8 @@ rule b64_exe reference = "https://github.com/kevthehermit/PasteHunter" strings: - $b64_exe = /TV(oA|pB|pQ|qA|qQ|ro)/ + $b64_exe = /\bTV(oA|pB|pQ|qA|qQ|ro)/ + // Double b64 = VFZxUU condition: $b64_exe diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index ddd3564..aead507 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -10,52 +10,58 @@ conf = parse_config() - def run(results, raw_paste_data, paste_object): # Figure out which b64 rule fire - b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + # The base64 re can hang on occasion with this one + # b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + + # This one has a few empty results i need to catch but doesn't kill pastehunter + b64_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' b64_strings = re.findall(b64_re, raw_paste_data) + # Set a counter for multiple streams. counter = 0 for b64_str in b64_strings: for rule in results: - if rule == 'b64_gzip': - # Lets try to decode and get a file listing. - # Also get the MD5 of the decoded file - try: - uncompressed = gzip.decompress(b64decode(b64_str)) + if len(b64_str) > 0: + if rule == 'b64_gzip': + # Lets try to decode and get a file listing. + # Also get the MD5 of the decoded file try: - encoded_paste_data = uncompressed.encode('utf-8') - paste_object["decompressed_stream"] = encoded_paste_data - except: - logging.error("Unable to store data") - except Exception as e: - logging.error("Unable to decompress gzip stream") - if rule == 'b64_exe': - try: - raw_exe = b64decode(b64_str) - paste_object["exe_size"] = len(raw_exe) - paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() - paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() - paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) - - # Cuckoo - if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: + uncompressed = gzip.decompress(b64decode(b64_str)) try: - task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) - paste_object["Cuckoo Task ID"] = task_id - logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) - except Exception as e: - logging.error("Unabled to submit sample to cuckoo") + encoded_paste_data = uncompressed.encode('utf-8') + paste_object["decompressed_stream"] = encoded_paste_data + except: + logging.error("Unable to store data") + except Exception as e: + logging.error("Unable to decompress gzip stream") + if rule == 'b64_exe': + try: + raw_exe = b64decode(b64_str) + paste_object["exe_size"] = len(raw_exe) + paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() + paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() + paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) + + # Cuckoo + if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: + logging.info("Submitting to Cuckoo") + try: + task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) + paste_object["Cuckoo Task ID"] = task_id + logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) + except Exception as e: + logging.error("Unabled to 
submit sample to cuckoo") - # Viper - if conf["post_process"]["post_b64"]["viper"]["enabled"]: - send_to_cuckoo(raw_exe, paste_object["pasteid"]) + # Viper + if conf["post_process"]["post_b64"]["viper"]["enabled"]: + send_to_cuckoo(raw_exe, paste_object["pasteid"]) - except Exception as e: - logging.error("Unable to decode exe file") + except Exception as e: + logging.error("Unable to decode exe file") # Increase the counter counter += 1 @@ -84,4 +90,3 @@ def send_to_cuckoo(raw_exe, pasteid): logging.error(submit_file) return task_id - From 3b03d113e5a97c99c579f76a5d10e425a5c0035e Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 1 Dec 2017 14:42:21 +0000 Subject: [PATCH 026/178] revert b64 to at 0 until i can improve the post processor --- YaraRules/base64.yar | 2 +- postprocess/post_b64.py | 88 ++++++++++++++++++++++------------------- 2 files changed, 49 insertions(+), 41 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 15a03d3..af50c0c 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -13,7 +13,7 @@ rule b64_exe $b64_exe = /\bTV(oA|pB|pQ|qA|qQ|ro)/ // Double b64 = VFZxUU condition: - $b64_exe + $b64_exe at 0 } diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index aead507..3e7da2e 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -13,61 +13,69 @@ def run(results, raw_paste_data, paste_object): # Figure out which b64 rule fire - # The base64 re can hang on occasion with this one - # b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + ''' + ToDo: This needs a lot of work to correctly parse a full exe file from middle of content + ToDo: For now going back to @0 in the yara rule - # This one has a few empty results i need to catch but doesn't kill pastehunter + The base64 re can hang on occasion with this one + b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + + This one has a few empty results i need to catch but doesn't kill pastehunter b64_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' + b64_strings = re.findall(b64_re, raw_paste_data) + # Set a counter for multiple streams. counter = 0 for b64_str in b64_strings: + logging.info(b64_str[:10]) for rule in results: if len(b64_str) > 0: - if rule == 'b64_gzip': - # Lets try to decode and get a file listing. - # Also get the MD5 of the decoded file + ''' + + b64_str = raw_paste_data + + for rule in results: + if rule == 'b64_gzip': + # Lets try to decode and get a file listing. 
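The relaxed expression discussed in the comment above can match zero characters, which is where the "few empty results" come from; it also fires on any run of four base64-alphabet characters, such as ordinary words. A small demonstration with made-up input:

```
import re

# The relaxed pattern from the ToDo comment: because every part of it is
# optional, findall() yields empty strings that must be filtered out.
b64_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
hits = re.findall(b64_re, 'plain text TVqQAAMAAAAEAAAA//8AALg=')
print([h for h in hits if h])
# ['plai', 'text', 'TVqQAAMAAAAEAAAA//8AALg='] ; empty matches dropped
```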
+ # Also get the MD5 of the decoded file + try: + uncompressed = gzip.decompress(b64decode(b64_str)) + try: + encoded_paste_data = uncompressed.encode('utf-8') + paste_object["decompressed_stream"] = encoded_paste_data + except: + logging.error("Unable to store data") + except Exception as e: + logging.error("Unable to decompress gzip stream") + if rule == 'b64_exe': + try: + raw_exe = b64decode(raw_paste_data) + paste_object["exe_size"] = len(raw_exe) + paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() + paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() + paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) + + # Cuckoo + if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: + logging.info("Submitting to Cuckoo") try: - uncompressed = gzip.decompress(b64decode(b64_str)) - try: - encoded_paste_data = uncompressed.encode('utf-8') - paste_object["decompressed_stream"] = encoded_paste_data - except: - logging.error("Unable to store data") + task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) + paste_object["Cuckoo Task ID"] = task_id + logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) except Exception as e: - logging.error("Unable to decompress gzip stream") - if rule == 'b64_exe': - try: - raw_exe = b64decode(b64_str) - paste_object["exe_size"] = len(raw_exe) - paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() - paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() - paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) - - # Cuckoo - if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: - logging.info("Submitting to Cuckoo") - try: - task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) - paste_object["Cuckoo Task ID"] = task_id - logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) - except Exception as e: - logging.error("Unabled to submit sample to cuckoo") - - # Viper - if conf["post_process"]["post_b64"]["viper"]["enabled"]: - send_to_cuckoo(raw_exe, paste_object["pasteid"]) + logging.error("Unabled to submit sample to cuckoo") - except Exception as e: - logging.error("Unable to decode exe file") + # Viper + if conf["post_process"]["post_b64"]["viper"]["enabled"]: + send_to_cuckoo(raw_exe, paste_object["pasteid"]) + + except Exception as e: + logging.error("Unable to decode exe file") - # Increase the counter - counter += 1 - # Get unique domain count - # Update the json # Send the updated json back return paste_object From dc773f4229a5ca0da315d00e622db5ba5048a033 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 3 Dec 2017 13:04:30 +0000 Subject: [PATCH 027/178] Update Dox rule --- YaraRules/core_keywords.yar | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 79e8a7d..8d8119a 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -16,10 +16,28 @@ rule core_keywords $enable_pass = "enable password" wide ascii nocase $ssh_priv = "BEGIN RSA PRIVATE KEY" wide ascii nocase $pgp_priv = "BEGIN PGP PRIVATE KEY" wide ascii nocase - $DOX = " DOX" wide ascii nocase $hacked = "hacked by" wide ascii nocase $onion_url = /.*.\.onion/ condition: any of them +} + +rule dox +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $dox = "DOX" wide ascii nocase fullword + $keywords1 = "name" wide ascii nocase 
+ $keyword2 = "dob" wide ascii nocase + $keyword3 = "age" wide ascii nocase + $keyword4 = "password" wide ascii nocase + $keyword5 = "email" wide ascii nocase + condition: + $dox and 3 of ($keywords*) + } \ No newline at end of file From 2b82017685f5922c76b113874c63ce2824abe7d6 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 3 Dec 2017 16:35:32 +0000 Subject: [PATCH 028/178] Add some base64 rules and tune DOX --- YaraRules/base64.yar | 68 ++++++++++++++++++++++++++++++++++++- YaraRules/core_keywords.yar | 4 +-- postprocess/post_b64.py | 4 +++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 2d00a68..8078195 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -88,4 +88,70 @@ rule b64_url condition: any of them -} \ No newline at end of file +} + +rule b64_doc +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $b64_doc = "0M8R4" // d0cf11 + condition: + $b64_doc at 0 + +} + +rule b64_rtf +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $b64_rtf = "e1xydGY" // {\rtf + condition: + $b64_rtf at 0 + +} + +rule b64_docx +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $b64_zip = "UEs" + $docx1 = "d29yZC9fcmVsc" // word/_rel + $docx2 = "Zm9udFRhYmxl" // fontTable + $docx3 = "ZG9jUHJvcHM" // docProps + $docx4 = "Q29udGVudF9UeXBlcw" // Content_Types + $docx5 = "c2V0dGluZ3M" //settings + condition: + $b64_zip at 0 and 3 of ($docx*) + +} + +rule b64_xml_doc +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $b64_xml = "PD94bWwg" + $docx1 = "b3BlbmRvY3VtZW50" // opendocument + $docx2 = "InBhcmFncmFwaCI" // "paragraph" + $docx3 = "b2ZmaWNlL3dvcmQv" // office/word/ + $docx4 = "RG9jdW1lbnRQcm9wZXJ0aWVz" // DocumentProperties + condition: + $b64_xml at 0 and 3 of ($docx*) + +} +PD94bWwg \ No newline at end of file diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 8d8119a..609b309 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -32,12 +32,12 @@ rule dox strings: $dox = "DOX" wide ascii nocase fullword - $keywords1 = "name" wide ascii nocase + $keyword1 = "name" wide ascii nocase $keyword2 = "dob" wide ascii nocase $keyword3 = "age" wide ascii nocase $keyword4 = "password" wide ascii nocase $keyword5 = "email" wide ascii nocase condition: - $dox and 3 of ($keywords*) + $dox and 3 of ($keyword*) } \ No newline at end of file diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index acad122..46c6540 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -40,6 +40,8 @@ def run(results, raw_paste_data, paste_object): paste_object["exe_size"] = len(raw_exe) paste_object["exe_md5"] = hashlib.md5(raw_exe).hexdigest() paste_object["exe_sha256"] = hashlib.sha256(raw_exe).hexdigest() + + # We are guessing that the sample has been submitted, and crafting a URL paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) # Cuckoo @@ -56,6 +58,8 @@ def run(results, raw_paste_data, paste_object): if conf["post_process"]["post_b64"]["viper"]["enabled"]: send_to_cuckoo(raw_exe, paste_object["pasteid"]) + # VirusTotal + except Exception as e: 
logging.error("Unable to decode exe file") From 8de4e1ad5fa9b83391706ad8d18050a1a298f1b2 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 3 Dec 2017 16:36:16 +0000 Subject: [PATCH 029/178] Fix extra b64 --- YaraRules/base64.yar | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 8078195..ca15308 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -153,5 +153,4 @@ rule b64_xml_doc condition: $b64_xml at 0 and 3 of ($docx*) -} -PD94bWwg \ No newline at end of file +} \ No newline at end of file From 7e3157f7c583da42dac784777025280df632a040 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 3 Dec 2017 23:04:48 +0000 Subject: [PATCH 030/178] Update Base64 --- YaraRules/base64.yar | 2 +- pastehunter.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index f0eca55..663d6ca 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -13,7 +13,7 @@ rule b64_exe $b64_exe = /\bTV(oA|pB|pQ|qA|qQ|ro)/ // Double b64 = VFZxUU condition: - $b64_exe + $b64_exe at 0 } diff --git a/pastehunter.py b/pastehunter.py index 9d3473f..343f937 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -71,6 +71,7 @@ def paste_scanner(): matches = rules.match(data=raw_paste_data) except Exception as e: logging.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + q.task_done() continue results = [] @@ -135,7 +136,10 @@ def paste_scanner(): paste_data['raw_paste'] = raw_paste_data paste_data['YaraRule'] = results for output in outputs: - output.store_paste(paste_data) + try: + output.store_paste(paste_data) + except Exception as e: + logging.error("Unable to store {0}".format(paste_data["pasteid"])) # Mark Tasks as complete q.task_done() From d20c6d1815917a851ca4c60ee273b36d7084b1d0 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Mon, 4 Dec 2017 16:17:17 +0000 Subject: [PATCH 031/178] utf8 encode the uncompressed stream. 
Still needs work --- postprocess/post_b64.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index 6922b7f..cc7ef5f 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -40,7 +40,8 @@ def run(results, raw_paste_data, paste_object): # Also get the MD5 of the decoded file try: uncompressed = gzip.decompress(b64decode(raw_paste_data)) - paste_object["decompressed_stream"] = uncompressed + encoded = uncompressed.encode('utf-8') + paste_object["decompressed_stream"] = encoded except Exception as e: logging.error("Unable to decompress gzip stream") if rule == 'b64_exe': From a5ab8ca6b93ca2b2cf8d495862ed597bf9d7a89d Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Mon, 4 Dec 2017 16:17:33 +0000 Subject: [PATCH 032/178] Add test rules file --- YaraRules/test_rules.yar | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 YaraRules/test_rules.yar diff --git a/YaraRules/test_rules.yar b/YaraRules/test_rules.yar new file mode 100644 index 0000000..9d8ce11 --- /dev/null +++ b/YaraRules/test_rules.yar @@ -0,0 +1,39 @@ +/* + These are test rules +*/ + +rule test_hex_MZ +{ + meta: + author = "kevthehermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $mz_hex = "4d5a" nocase wide ascii + + condition: + $mz_hex at 0 + +} + +rule test_vbscript +{ + meta: + author = "kevthehermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a1 = "Function" nocase wide ascii fullword + $a2 = "CreateObject" nocase wide ascii fullword + $a3 = "Wscript" nocase wide ascii fullword + $a4 = "As Long" nocase wide ascii fullword + $a5 = "run" nocase wide ascii fullword + $b1 = "NtAllocateVirtualMemory" nocase wide ascii fullword + $b2 = "NtWriteVirtualMemory" nocase wide ascii fullword + + + condition: + 3 of them +} \ No newline at end of file From 83b4ab2704bf2c5650717e9ad4e5ad114c69c2bd Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 8 Dec 2017 15:06:17 +0000 Subject: [PATCH 033/178] Enable Test Rules --- YaraRules/test_rules.yar | 43 ++++++++++++++++++++++++++++++++-------- pastehunter.py | 17 ++++++++++++++-- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/YaraRules/test_rules.yar b/YaraRules/test_rules.yar index 9d8ce11..c6f884f 100644 --- a/YaraRules/test_rules.yar +++ b/YaraRules/test_rules.yar @@ -25,15 +25,42 @@ rule test_vbscript reference = "https://github.com/kevthehermit/PasteHunter" strings: - $a1 = "Function" nocase wide ascii fullword - $a2 = "CreateObject" nocase wide ascii fullword - $a3 = "Wscript" nocase wide ascii fullword - $a4 = "As Long" nocase wide ascii fullword - $a5 = "run" nocase wide ascii fullword - $b1 = "NtAllocateVirtualMemory" nocase wide ascii fullword - $b2 = "NtWriteVirtualMemory" nocase wide ascii fullword + $a = "Function" nocase wide ascii fullword + $b = "CreateObject" nocase wide ascii fullword + $c = "Wscript" nocase wide ascii fullword + $d = "As Long" nocase wide ascii fullword + $e = "run" nocase wide ascii fullword + $f = "for each" nocase wide ascii fullword + $g = "end function" nocase wide ascii fullword + $h = "NtAllocateVirtualMemory" nocase wide ascii fullword + $i = "NtWriteVirtualMemory" nocase wide ascii fullword condition: - 3 of them + 5 of them +} + +rule test_autoit +{ + meta: + author = "kevthehermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + 
strings: + $tray = "NoTrayIcon" nocase wide ascii fullword + $a = "iniread" nocase wide ascii fullword + $b = "fileinstall" nocase wide ascii fullword + $c = "EndFunc" nocase wide ascii fullword + $d = "FileRead" nocase wide ascii fullword + $e = "DllStructSetData" nocase wide ascii fullword + $f = "Global Const" nocase wide ascii fullword + $g = "Run(@AutoItExe" nocase wide ascii fullword + $h = "StringReplace" nocase wide ascii fullword + $i = "filewrite" nocase wide ascii fullword + + + + condition: + ($tray and 3 of them) or (5 of them) } \ No newline at end of file diff --git a/pastehunter.py b/pastehunter.py index 343f937..2bd3a95 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -44,11 +44,21 @@ outputs.append(instance) -def yara_index(rule_path): +def yara_index(rule_path, blacklist, test_rules): index_file = os.path.join(rule_path, 'index.yar') with open(index_file, 'w') as yar: for filename in os.listdir('YaraRules'): if filename.endswith('.yar') and filename != 'index.yar': + if filename == 'blacklist.yar': + if blacklist: + logging.info("Enable Blacklist Rules") + else: + continue + if filename == 'test_rules.yar': + if test_rules: + logging.info("Enable Test Rules") + else: + continue include = 'include "{0}"\n'.format(filename) yar.write(include) @@ -149,7 +159,10 @@ def paste_scanner(): logging.info("Compile Yara Rules") try: # Update the yara rules index - yara_index(conf['yara']['rule_path']) + yara_index(conf['yara']['rule_path'], + conf['yara']['blacklist'], + conf['yara']['test_rules']) + # Compile the yara rules we will use to match pastes index_file = os.path.join(conf['yara']['rule_path'], 'index.yar') rules = yara.compile(index_file) From 2b96c517dd35ad06e69ce4e9672c7fef1e4125b0 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:21:18 +0000 Subject: [PATCH 034/178] Add Weekly Index to Elastic Output --- README.md | 5 +++-- outputs/elastic_output.py | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c375fd4..185084a 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,9 @@ Pastehunter comes with a couple of post process modules that extact useful data The following are default modules: - Emails - - - + - Base64 Decoders + - Cuckoo + - Viper ## PreReqs diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index d29e534..996c0bc 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -1,8 +1,9 @@ from elasticsearch import Elasticsearch from common import parse_config +from datetime import datetime +import logging config = parse_config() -import logging class ElasticOutput(): @@ -13,6 +14,7 @@ def __init__(self): es_user = config['outputs']['elastic_output']['elastic_user'] es_pass = config['outputs']['elastic_output']['elastic_pass'] self.es_index = config['outputs']['elastic_output']['elastic_index'] + self.weekly = config['outputs']['elastic_output']['weekly-index'] es_ssl = config['outputs']['elastic_output']['elastic_ssl'] self.test = False try: @@ -25,7 +27,9 @@ def __init__(self): def store_paste(self, paste_data): if self.test: index_name = self.es_index - # Consider adding date to the index + if self.weekly: + week_number = datetime.date(datetime.now()).isocalendar()[1] + index_name = '{0}-{1}'.format(index_name, week_number) # ToDo: With multiple paste sites a pasteid collision is more likly! 
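The index generation from the diff above, pulled out as a standalone helper; the directory layout is assumed to match the repo (a folder of .yar files), and sorting is added only to make the generated index deterministic:

```
import os

# Standalone version of yara_index(): rebuild index.yar from the rule
# directory, honouring the blacklist and test_rules toggles.
def yara_index(rule_path, blacklist=False, test_rules=False):
    index_file = os.path.join(rule_path, 'index.yar')
    with open(index_file, 'w') as yar:
        for filename in sorted(os.listdir(rule_path)):
            if not filename.endswith('.yar') or filename == 'index.yar':
                continue
            if filename == 'blacklist.yar' and not blacklist:
                continue
            if filename == 'test_rules.yar' and not test_rules:
                continue
            yar.write('include "{0}"\n'.format(filename))
```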
self.es.index(index=index_name, doc_type='paste', id=paste_data['pasteid'], body=paste_data) logging.info("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], From c45df9399434245519c918d4ad933b7aa1a732c5 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:28:10 +0000 Subject: [PATCH 035/178] Update Readme for latest yara --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 185084a..fffd80d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html https://www.elastic.co/guide/en/kibana/current/deb.html ### Yara -https://yara.readthedocs.io/en/v3.6.0/gettingstarted.html#compiling-and-installing-yara +https://yara.readthedocs.io/en/latest/gettingstarted.html#compiling-and-installing-yara If you have yara errors check the installed version numbers for yara and yara-python match the lastest versions. From d2065bda05695df1efcb28f566f2224f27d44431 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:39:58 +0000 Subject: [PATCH 036/178] Store All for pastebin --- pastehunter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 2bd3a95..7442144 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -131,8 +131,10 @@ def paste_scanner(): # then append "no_match" to results. This will then force output. #ToDo: Need to make this check for each output not universal + + paste_site = paste_data['pastesite'] store_all = conf['inputs']['pastebin']['store_all'] - if store_all is True: + if store_all is True and paste_site == 'pastebin.com': if len(results) == 0: results.append('no_match') From 81268474f05d5917f9c2d1d1c7c1ffbeece63540 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:43:51 +0000 Subject: [PATCH 037/178] Weekly Index for Elastic --- outputs/elastic_output.py | 2 +- settings.json.sample | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index 996c0bc..fd03054 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -14,7 +14,7 @@ def __init__(self): es_user = config['outputs']['elastic_output']['elastic_user'] es_pass = config['outputs']['elastic_output']['elastic_pass'] self.es_index = config['outputs']['elastic_output']['elastic_index'] - self.weekly = config['outputs']['elastic_output']['weekly-index'] + self.weekly = config['outputs']['elastic_output']['weekly_index'] es_ssl = config['outputs']['elastic_output']['elastic_ssl'] self.test = False try: diff --git a/settings.json.sample b/settings.json.sample index 2d2c341..e64121d 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -36,7 +36,8 @@ "elastic_port": 9200, "elastic_user": "elastic", "elastic_pass": "changeme", - "elastic_ssl": false + "elastic_ssl": false, + "weekly_index" false }, "json_output": { "enabled": true, From 30e4af421073d2fa62704eecd118fa25a9cd5bbc Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:45:22 +0000 Subject: [PATCH 038/178] Fix Weekly Index --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index e64121d..d39f245 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -37,7 +37,7 @@ "elastic_user": "elastic", "elastic_pass": "changeme", "elastic_ssl": false, - "weekly_index" false + "weekly_index": false }, "json_output": { "enabled": true, From 
a6bc827572292266b0f24b5bb081807e6c65a821 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 16:53:47 +0000 Subject: [PATCH 039/178] Enable Test Rules in config --- settings.json.sample | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index d39f245..7df54f0 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -75,7 +75,8 @@ }, "yara": { "rule_path": "YaraRules", - "blacklist": true + "blacklist": true, + "test_rules": false }, "post_process": { "post_email": { From 86b5b252b76755dba4f0c36daabe2784675f7a15 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 10 Dec 2017 17:00:53 +0000 Subject: [PATCH 040/178] Add Year to weekly index in Elastic output --- outputs/elastic_output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index fd03054..f333d07 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -28,8 +28,9 @@ def store_paste(self, paste_data): if self.test: index_name = self.es_index if self.weekly: + year_number = datetime.date(datetime.now()).isocalendar()[0] week_number = datetime.date(datetime.now()).isocalendar()[1] - index_name = '{0}-{1}'.format(index_name, week_number) + index_name = '{0}-{1}-{2}'.format(index_name, year_number, week_number) # ToDo: With multiple paste sites a pasteid collision is more likly! self.es.index(index=index_name, doc_type='paste', id=paste_data['pasteid'], body=paste_data) logging.info("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], From f45ff8c0069a5cb92ad474bb4c2e3a09d8b8637f Mon Sep 17 00:00:00 2001 From: MrAdz350 Date: Mon, 11 Dec 2017 21:17:23 +0000 Subject: [PATCH 041/178] Added setting to allow configuration of sleep duration between runs --- pastehunter.py | 4 ++-- settings.json.sample | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 7442144..5c5ba83 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -215,8 +215,8 @@ def paste_scanner(): q.join() # Slow it down a little - logging.info("Sleeping for 10 Seconds") - sleep(10) + logging.info("Sleeping for " + str(conf['general']['run_frequency']) + " Seconds") + sleep(conf['general']['run_frequency']) except KeyboardInterrupt: logging.info("Stopping Threads") diff --git a/settings.json.sample b/settings.json.sample index 7df54f0..a340105 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -78,6 +78,9 @@ "blacklist": true, "test_rules": false }, + "general": { + "run_frequency": 300 + }, "post_process": { "post_email": { "enabled": false, From 4d42642552f24f3a5cb960ed9a8945349fb349f1 Mon Sep 17 00:00:00 2001 From: ntddk Date: Thu, 14 Dec 2017 13:28:37 +0900 Subject: [PATCH 042/178] Enrich rules --- YaraRules/api_keys.yar | 53 +++++++++++++++++++++++++++++++++++++ YaraRules/core_keywords.yar | 5 +++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/YaraRules/api_keys.yar b/YaraRules/api_keys.yar index f5a393b..f131355 100644 --- a/YaraRules/api_keys.yar +++ b/YaraRules/api_keys.yar @@ -50,3 +50,56 @@ rule google_api condition: all of them } + +rule slack_api +{ + meta: + author = "@ntddk" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = /(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})/ + condition: + all of them +} + +rule github_api +{ + meta: + author = "@ntddk" + info = "Part of PasteHunter" + reference = 
"https://github.com/kevthehermit/PasteHunter" + + strings: + $a = /[g|G][i|I][t|T][h|H][u|U][b|B].*[[\'|"]0-9a-zA-Z]{35,40}[\'|"]/ + condition: + all of them +} + +rule aws_api +{ + meta: + author = "@ntddk" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = /AKIA[0-9A-Z]{16}/ + condition: + all of them +} + +rule heroku_api +{ + meta: + author = "@ntddk" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $a = /[h|H][e|E][r|R][o|O][k|K][u|U].*[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}/ + condition: + all of them +} + diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 609b309..50c67e3 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -15,6 +15,9 @@ rule core_keywords $enabled_sec = "enable secret" wide ascii nocase $enable_pass = "enable password" wide ascii nocase $ssh_priv = "BEGIN RSA PRIVATE KEY" wide ascii nocase + $openssh_priv = "BEGIN OPENSSH PRIVATE KEY" wide ascii nocase + $dsa_priv = "BEGIN DSA PRIVATE KEY" wide ascii nocase + $ec_priv = "BEGIN EC PRIVATE KEY" wide ascii nocase $pgp_priv = "BEGIN PGP PRIVATE KEY" wide ascii nocase $hacked = "hacked by" wide ascii nocase $onion_url = /.*.\.onion/ @@ -40,4 +43,4 @@ rule dox condition: $dox and 3 of ($keyword*) -} \ No newline at end of file +} From 4f1aa41ae4469f99f5879e1592d8d54301574103 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Mon, 18 Dec 2017 11:53:28 +0000 Subject: [PATCH 043/178] Update Docker to 6.1.0 Add troubleshoot to readme for docker --- README.md | 7 +++++++ docker-compose.yml | 12 ++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fffd80d..7e9dcd6 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,13 @@ Kibana is using the static IP address : 172.16.10.12 in the `esnet` network Elasticsearch is running only on the localhost interface on default port 9200. 
The mount point is `/usr/share/elasticsearch/data` by default +if elastic search fails to start and you see "max virtual memory areas vm.max_map_count [65530] likely too low" +in the logs then try + +`sudo sysctl -w vm.max_map_count=262144` + +https://elk-docker.readthedocs.io/#troubleshooting Paragraph starting As from version 5 + #### Pastehunter You can re-run the pastehunter script by doing `docker-compose up -d` diff --git a/docker-compose.yml b/docker-compose.yml index 370e750..0f20be9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,12 +7,12 @@ services: - ./logs:/usr/src/pastehunter/logs networks: esnet: - ipv4_address: 172.16.10.11 + ipv4_address: 172.18.10.11 depends_on: - "elasticsearch" - "kibana" elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:5.6.2 + image: docker.elastic.co/elasticsearch/elasticsearch:6.1.0 container_name: elasticsearch environment: - cluster.name=docker-cluster @@ -29,15 +29,15 @@ services: - "127.0.0.1:9200:9200" networks: esnet: - ipv4_address: 172.16.10.10 + ipv4_address: 172.18.10.10 kibana: - image: docker.elastic.co/kibana/kibana:5.6.2 + image: docker.elastic.co/kibana/kibana:6.1.0 ports: - "127.0.0.1:5601:5601" networks: esnet: - ipv4_address: 172.16.10.12 + ipv4_address: 172.18.10.12 depends_on: - "elasticsearch" @@ -49,4 +49,4 @@ networks: driver: bridge ipam: config: - - subnet: 172.16.10.0/24 + - subnet: 172.18.10.0/24 From bf199457e25f36617fc17ccff3dfe599a0e3bbb3 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Mon, 18 Dec 2017 15:08:43 +0000 Subject: [PATCH 044/178] Fix Slack API Key Rule --- YaraRules/api_keys.yar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/YaraRules/api_keys.yar b/YaraRules/api_keys.yar index f131355..c0e4189 100644 --- a/YaraRules/api_keys.yar +++ b/YaraRules/api_keys.yar @@ -59,7 +59,7 @@ rule slack_api reference = "https://github.com/kevthehermit/PasteHunter" strings: - $a = /(xox[p|b|o|a]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})/ + $a = /(xox(p|b|o|a)-[0-9]{9,12}-[0-9]{9,12}-[0-9]{9,12}-[a-z0-9]{32})/ condition: all of them } From 72396aa77a35569a656ca6f3d2a7bd88333a5a83 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sat, 23 Dec 2017 11:10:02 +0000 Subject: [PATCH 045/178] Fix Config name in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e9dcd6..6ec504d 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ Docker-compose will use already running instances of Elasticsearch and Kibana # Configure -copy settings.conf.sample to settings.conf +copy settings.json.sample to settings.json populate the details. For the scraping API you need to whitelist your IP on pastebin. No API key is required. 
See the link above From e2d5f001a83c18e6f4f725717eb65fb3387adc38 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Fri, 5 Jan 2018 09:40:11 +0000 Subject: [PATCH 046/178] Update default config settings --- settings.json.sample | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/settings.json.sample b/settings.json.sample index a340105..00804db 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -37,10 +37,10 @@ "elastic_user": "elastic", "elastic_pass": "changeme", "elastic_ssl": false, - "weekly_index": false + "weekly_index": true }, "json_output": { - "enabled": true, + "enabled": false, "module": "outputs.json_output", "classname": "JsonOutput", "output_path": "logs/json/", @@ -83,12 +83,12 @@ }, "post_process": { "post_email": { - "enabled": false, + "enabled": true, "module": "postprocess.post_email", "rule_list": ["email_list"] }, "post_b64": { - "enabled": false, + "enabled": true, "module": "postprocess.post_b64", "rule_list": ["b64_exe", "b64_rar", "b64_zip", "b64_gzip"], "cuckoo": { From c26dd9bdad80ea448d84137719454837bc35e766 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 28 Jan 2018 13:05:23 +0000 Subject: [PATCH 047/178] Add email body. --- outputs/smtp_output.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 2bae275..4bf5b34 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -44,7 +44,12 @@ def store_paste(self, paste_data): msg['From'] = self.smtp_user msg['To'] = self.recipient - body = 'This is the body of the email' + body = 'PasteHunter Notification\n' \ + 'The following rules {0} were found in paste {1}.\n' \ + 'A Copy of the paste has been attached'.format(', '.join(paste_data['YaraRule']), + paste_data['pasteid']) + + json_body = json.dumps(paste_data) # Attach the body msg.attach(email.mime.text.MIMEText(body, 'plain')) From 742b4ae96642379403431ebce5e251442ff33098 Mon Sep 17 00:00:00 2001 From: kevthehermit Date: Sun, 28 Jan 2018 13:06:01 +0000 Subject: [PATCH 048/178] Update Email Subject line --- outputs/smtp_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 4bf5b34..7dfe080 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -40,7 +40,7 @@ def store_paste(self, paste_data): return msg = MIMEMultipart() - msg['Subject'] = 'PasteHunter Alert {0}'.format(paste_data['YaraRule']) + msg['Subject'] = 'PasteHunter Alert {0}'.format(', '.join(paste_data['YaraRule'])) msg['From'] = self.smtp_user msg['To'] = self.recipient From 23d477eaca7fc090cc4497599f0b69ebc3ee7b7f Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 10 Feb 2018 12:25:52 +0000 Subject: [PATCH 049/178] Fix path for csv logging --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index 00804db..c2189b5 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -51,7 +51,7 @@ "enabled": false, "module": "outputs.csv_output", "classname": "CSVOutput", - "output_path": "/logs/csv/" + "output_path": "logs/csv/" }, "syslog_output": { "enabled": false, From db6bbf9b9e8c8cfbe0736a59eacb417f79e853b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Qu=C3=A9r=C3=A9?= Date: Wed, 21 Feb 2018 11:42:11 +0100 Subject: [PATCH 050/178] [add] Output to multiple recipients over SMTP Also adds basic logic to the rules by introducing mandatory rules. 
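The alert message assembled in patches 047 and 048 above is standard email.mime plumbing: a plain-text body plus the full paste record as a base64-encoded JSON attachment. Reassembled as a runnable fragment; the addresses and paste_data below are stubs, and the actual smtplib send is omitted:

```
import json
import email.encoders
import email.mime.base
import email.mime.text
from email.mime.multipart import MIMEMultipart

# Stub record standing in for a real alert; only message building is shown.
paste_data = {'YaraRule': ['custom_keywords'], 'pasteid': 'abc123',
              'pastesite': 'pastebin.com'}

msg = MIMEMultipart()
msg['Subject'] = 'PasteHunter Alert {0}'.format(', '.join(paste_data['YaraRule']))
msg['From'] = 'pastehunter@example.com'
msg['To'] = 'analyst@example.com'

body = 'PasteHunter Notification\n' \
       'The following rules {0} were found in paste {1}.\n' \
       'A Copy of the paste has been attached'.format(
           ', '.join(paste_data['YaraRule']), paste_data['pasteid'])
msg.attach(email.mime.text.MIMEText(body, 'plain'))

# Attach the full paste record as a base64-encoded JSON file.
json_att = email.mime.base.MIMEBase('application', 'json')
json_att.set_payload(json.dumps(paste_data))
email.encoders.encode_base64(json_att)
json_att.add_header('Content-Disposition', 'attachment; filename="Alert.json"')
msg.attach(json_att)

print(msg['Subject'])  # PasteHunter Alert custom_keywords
```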
--- README.md | 11 +++++- outputs/smtp_output.py | 90 ++++++++++++++++++++++++------------------ settings.json.sample | 14 ++++++- 3 files changed, 73 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 6ec504d..7027a34 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,19 @@ Support for the following sites is listed as ToDo: - paste.ee ## Supported Outputs +Pastehunter supports several output modules: + - dump to ElasticSearch DB (default) + - email sending over SMTP + - dump to JSON file + - dump to CSV file + +### SMTP +Multiple recipients can be specified, with different rulesets each. +It's possible to combine these rules using simple OR or AND logic (respectively rule_list and mandatory_rule_list). ## PostProcess Modules Pastehunter comes with a couple of post process modules that extact useful data from pastes or pass them to other services The following are default modules: - - - Emails - Base64 Decoders - Cuckoo - Viper diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 7dfe080..d38873a 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -19,58 +19,72 @@ def __init__(self): self.smtp_tls = config['outputs']['smtp_output']['smtp_tls'] self.smtp_user = config['outputs']['smtp_output']['smtp_user'] self.smtp_pass = config['outputs']['smtp_output']['smtp_pass'] - self.recipient = config['outputs']['smtp_output']['recipient'] - self.alert_list = config['outputs']['smtp_output']['rule_list'] + self.recipients = config['outputs']['smtp_output']['recipients'] - def store_paste(self, paste_data): - - alert_email = False - # Alert on All - if 'all' in self.alert_list: - alert_email = True - - # Alert on specific rules e.g. custom_keywords - if 'all' not in self.alert_list: - for yara in paste_data['YaraRule']: - if yara in self.alert_list: - alert_email = True - # To Alert or not to Alert - if not alert_email: - return + def _send_mail(self, send_to_address, paste_data): + # Create the message msg = MIMEMultipart() msg['Subject'] = 'PasteHunter Alert {0}'.format(', '.join(paste_data['YaraRule'])) msg['From'] = self.smtp_user - msg['To'] = self.recipient - - body = 'PasteHunter Notification\n' \ - 'The following rules {0} were found in paste {1}.\n' \ - 'A Copy of the paste has been attached'.format(', '.join(paste_data['YaraRule']), - paste_data['pasteid']) + msg['To'] = send_to_address - - json_body = json.dumps(paste_data) # Attach the body + body = 'Rules : {0}\n' \ + 'Paste : {1} from {2}\n\n' \ + 'A Copy of the paste has been attached'.format(', '.join(paste_data['YaraRule']), + paste_data['pasteid'], + paste_data['pastesite']) msg.attach(email.mime.text.MIMEText(body, 'plain')) - # Attach the raw paste - - json_att = email.mime.base.MIMEBase('application', 'json') - json_att.set_payload(json_body) - email.encoders.encode_base64(json_att) - json_att.add_header('Content-Disposition', 'attachment; filename="Alert.json"') - msg.attach(json_att) - - # Connect and send - - smtp_conn = smtplib.SMTP(self.smtp_host, self.smtp_port) + # Attach the raw paste as JSON + attachment = email.mime.base.MIMEBase('application', 'json') + json_body = json.dumps(paste_data) + attachment.set_payload(json_body) + email.encoders.encode_base64(attachment) + attachment.add_header('Content-Disposition', 'attachment; filename="Alert.json"') + msg.attach(attachment) + # Connect to the SMTP server and send + smtp_conn = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port) smtp_conn.ehlo() if self.smtp_tls: smtp_conn.starttls() - smtp_conn.login(self.smtp_user, 
self.smtp_pass) - logging.info("Email Sent") smtp_conn.send_message(msg) smtp_conn.quit() + + logging.info("Sent mail to {0} with rules {1}".format(send_to_address, + ', '.join(paste_data['YaraRule']))) + + + def store_paste(self, paste_data): + + for recipient_name in self.recipients: + + # Read each recipient's config + recipient = self.recipients[recipient_name] + recipient_address = recipient['address'] + all_rules_mandatory = False + if len(recipient['mandatory_rule_list']): + recipient_rule_list = recipient['mandatory_rule_list'] + all_rules_mandatory = True + else: + recipient_rule_list = recipient['rule_list'] + + # Check if the recipient has special rule 'all' meaning it gets all alerts + if 'all' in recipient_rule_list: + self._send_mail(recipient_address, paste_data) + return + + # Check if all of the recipient's rules need to be found in the alert + if all_rules_mandatory: + if all(elem in paste_data['YaraRule'] for elem in recipient_rule_list): + self._send_mail(recipient_address, paste_data) + return + + # Nominal case, check if at least one rule is found in the alert + if any(elem in paste_data['YaraRule'] for elem in recipient_rule_list): + self._send_mail(recipient_address, paste_data) + return diff --git a/settings.json.sample b/settings.json.sample index c2189b5..9542bc8 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -69,8 +69,18 @@ "smtp_tls": true, "smtp_user": "smtpusername", "smtp_pass": "smtppassword", - "recipient": "emailaddress that gets the alerts", - "rule_list": ["custom_keywords"] + "recipients" : { + "recipient_1": { + "address": "emailaddress that gets the alerts", + "rule_list": ["custom_keywords"], + "mandatory_rule_list": [] + }, + "recipient_2": { + "address": "emailaddress that gets the alerts", + "rule_list": [], + "mandatory_rule_list": ["keyword1", "keyword2"] + } + } } }, "yara": { From efb617e65cb9f5bdb39964eecfd4a823b928425f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Qu=C3=A9r=C3=A9?= Date: Mon, 26 Feb 2018 08:18:50 +0100 Subject: [PATCH 051/178] [fix] Logic flaw when testing rules for multiple receivers Performing return after successful checks wasn't the intended behaviour since it meant that after a match the rest of the recipient list wouldn't be checked. Fix by moving the logic to a separate function working on a single recipient object, where return has the intended effect. 
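Reduced to a runnable sketch (hypothetical helper names, not the code in this
patch), the difference is:

    # Buggy shape: 'return' aborts the whole recipient loop after the first
    # match, so the second recipient below is never notified.
    def notify_buggy(recipients, matched_rules):
        notified = []
        for recipient in recipients:
            if any(rule in matched_rules for rule in recipient['rule_list']):
                notified.append(recipient['address'])
                return notified  # BUG: ends the whole loop, not just this recipient
        return notified

    # Fixed shape: the per-recipient check lives in its own function, so
    # 'return' only ends that single recipient's evaluation.
    def check_one(recipient, matched_rules, notified):
        if not any(rule in matched_rules for rule in recipient['rule_list']):
            return
        notified.append(recipient['address'])

    def notify_fixed(recipients, matched_rules):
        notified = []
        for recipient in recipients:
            check_one(recipient, matched_rules, notified)
        return notified

    recipients = [{'address': 'one@example.com', 'rule_list': ['keyword1']},
                  {'address': 'two@example.com', 'rule_list': ['keyword1']}]
    print(notify_buggy(recipients, ['keyword1']))  # ['one@example.com'] only
    print(notify_fixed(recipients, ['keyword1']))  # both addresses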
--- outputs/smtp_output.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index d38873a..839dbfc 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -59,9 +59,7 @@ def _send_mail(self, send_to_address, paste_data): ', '.join(paste_data['YaraRule']))) - def store_paste(self, paste_data): - - for recipient_name in self.recipients: + def _check_recipient_rules(self, paste_data, recipient_name): # Read each recipient's config recipient = self.recipients[recipient_name] @@ -88,3 +86,9 @@ def store_paste(self, paste_data): if any(elem in paste_data['YaraRule'] for elem in recipient_rule_list): self._send_mail(recipient_address, paste_data) return + + + def store_paste(self, paste_data): + + for recipient_name in self.recipients: + self._check_recipient_rules(paste_data, recipient_name) From 65dfc9e4a81ca53e4d9d4d9faa9f1c5e846ba2af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Qu=C3=A9r=C3=A9?= Date: Mon, 26 Feb 2018 09:05:12 +0100 Subject: [PATCH 052/178] [fix] Maintain compatibility with older configuration format --- outputs/smtp_output.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 839dbfc..c1dad7e 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -14,12 +14,19 @@ class SMTPOutput(): def __init__(self): - self.smtp_host = config['outputs']['smtp_output']['smtp_host'] - self.smtp_port = config['outputs']['smtp_output']['smtp_port'] - self.smtp_tls = config['outputs']['smtp_output']['smtp_tls'] - self.smtp_user = config['outputs']['smtp_output']['smtp_user'] - self.smtp_pass = config['outputs']['smtp_output']['smtp_pass'] - self.recipients = config['outputs']['smtp_output']['recipients'] + smtp_object = config['outputs']['smtp_output'] + self.smtp_host = smtp_object['smtp_host'] + self.smtp_port = smtp_object['smtp_port'] + self.smtp_tls = smtp_object['smtp_tls'] + self.smtp_user = smtp_object['smtp_user'] + self.smtp_pass = smtp_object['smtp_pass'] + if 'recipients' in smtp_object: + self.recipients = smtp_object['recipients'] + else: + # maintain compatibility with older single recipient config format + self.recipients = {'main': {'address': smtp_object['recipient'], + 'rule_list': smtp_object['rule_list'], + 'mandatory_rule_list': []}} def _send_mail(self, send_to_address, paste_data): @@ -89,6 +96,5 @@ def _check_recipient_rules(self, paste_data, recipient_name): def store_paste(self, paste_data): - for recipient_name in self.recipients: self._check_recipient_rules(paste_data, recipient_name) From 40a2380d11c065d24ec531490b0ed458c9c60d11 Mon Sep 17 00:00:00 2001 From: daverstephens Date: Wed, 25 Apr 2018 15:20:16 +0100 Subject: [PATCH 053/178] Updated pastebin scrape URL --- settings.json.sample | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/settings.json.sample b/settings.json.sample index c2189b5..2f2964a 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -3,8 +3,8 @@ "pastebin":{ "enabled": true, "module": "inputs.pastebin", - "api_scrape": "https://pastebin.com/api_scraping.php", - "api_raw": "https://pastebin.com/api_scrape_item.php?i=", + "api_scrape": "https://scrape.pastebin.com/api_scraping.php", + "api_raw": "https://scrape.pastebin.com/api_scrape_item.php?i=", "paste_limit": 200, "store_all": false }, From 38a4322d3c4b71ed46491af9bf120e10bcd7b25d Mon Sep 17 00:00:00 2001 From: thehermit Date: Thu, 26 Apr 2018 19:44:42 +0100 
Subject: [PATCH 054/178] Update gitignore for code editor --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6d8e73b..683bd90 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ ENV/ /settings.json /.idea /postprocess/tester.py +.vscode/settings.json From 440283e4b920d790f9ca77a8827f091d26b4100f Mon Sep 17 00:00:00 2001 From: thehermit Date: Thu, 26 Apr 2018 20:45:44 +0100 Subject: [PATCH 055/178] Match IP Addresses in Docker and compose --- .vscode/sftp.json | 6 ++++++ docker-compose.yml | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 .vscode/sftp.json diff --git a/.vscode/sftp.json b/.vscode/sftp.json new file mode 100644 index 0000000..a1b5c2f --- /dev/null +++ b/.vscode/sftp.json @@ -0,0 +1,6 @@ +{ + "protocol": "sftp", + "host": "192.168.1.166", + "username": "localadmin", + "remotePath": "/home/localadmin/pastehunter_dev/" +} diff --git a/docker-compose.yml b/docker-compose.yml index 0f20be9..0f22642 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,7 +7,7 @@ services: - ./logs:/usr/src/pastehunter/logs networks: esnet: - ipv4_address: 172.18.10.11 + ipv4_address: 172.16.10.11 depends_on: - "elasticsearch" - "kibana" @@ -29,7 +29,7 @@ services: - "127.0.0.1:9200:9200" networks: esnet: - ipv4_address: 172.18.10.10 + ipv4_address: 172.16.10.10 kibana: image: docker.elastic.co/kibana/kibana:6.1.0 @@ -37,7 +37,7 @@ services: - "127.0.0.1:5601:5601" networks: esnet: - ipv4_address: 172.18.10.12 + ipv4_address: 172.16.10.12 depends_on: - "elasticsearch" @@ -49,4 +49,4 @@ networks: driver: bridge ipam: config: - - subnet: 172.18.10.0/24 + - subnet: 172.16.10.0/24 From 28d7623963346f4d2debca1e3e8da77795fb6a2d Mon Sep 17 00:00:00 2001 From: thehermit Date: Thu, 26 Apr 2018 22:13:11 +0100 Subject: [PATCH 056/178] Update logging and SMTP --- README.md | 19 +++++++++++ common.py | 3 +- inputs/dumpz.py | 3 +- inputs/gists.py | 15 +++++---- inputs/pastebin.py | 3 +- outputs/elastic_output.py | 7 ++-- outputs/json_output.py | 7 ++-- outputs/smtp_output.py | 13 +++++--- pastehunter.py | 67 ++++++++++++++++++++++++++------------- postprocess/post_b64.py | 13 ++++---- settings.json.sample | 2 +- 11 files changed, 104 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 7027a34..662f9b7 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,12 @@ Pastehunter supports several output modules: ### SMTP Multiple recipients can be specified, with different rulesets each. It's possible to combine these rules using simple OR or AND logic (respectively rule_list and mandatory_rule_list). +You need to set SMTP_SECURITY in the config file to one of the following options: + - 'tls' + - 'starttls' + - 'none' + + Refer to your email provider to determine which you require. ## PostProcess Modules Pastehunter comes with a couple of post process modules that extact useful data from pastes or pass them to other services @@ -108,6 +114,19 @@ copy settings.json.sample to settings.json populate the details. For the scraping API you need to whitelist your IP on pastebin. No API key is required. See the link above +The logging level can be set to one of the following values. 
+ + +| Level | Numerical | +|----------|-----------| +| CRITICAL | 50 | +| ERROR | 40 | +| WARNING | 30 | +| INFO | 20 | +| DEBUG | 10 | +| NOTSET | 0 | + +The default is INFO:20 # Running diff --git a/common.py b/common.py index 81f58b7..1ea162b 100644 --- a/common.py +++ b/common.py @@ -1,6 +1,7 @@ import json import logging +logger = logging.getLogger('pastehunter') # Parse the config file in to a dict def parse_config(): @@ -10,6 +11,6 @@ def parse_config(): with open(conf_file, 'r') as read_conf: conf = json.load(read_conf) except Exception as e: - logging.error("Unable to parse config file: {0}".format(e)) + logger.error("Unable to parse config file: {0}".format(e)) return conf diff --git a/inputs/dumpz.py b/inputs/dumpz.py index ebcbfa2..0132d5f 100644 --- a/inputs/dumpz.py +++ b/inputs/dumpz.py @@ -1,6 +1,7 @@ import requests import logging +logger = logging.getLogger('pastehunter') def recent_pastes(conf, input_history): # populate vars from config @@ -41,5 +42,5 @@ def recent_pastes(conf, input_history): return paste_list, history except Exception as e: - logging.error("Unable to parse paste results: {0}".format(e)) + logger.error("Unable to parse paste results: {0}".format(e)) return paste_list, history \ No newline at end of file diff --git a/inputs/gists.py b/inputs/gists.py index c56edb4..973f603 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -4,6 +4,7 @@ from datetime import datetime # Set some logging options +logger = logging.getLogger('pastehunter') logging.getLogger('requests').setLevel(logging.ERROR) api_uri = 'https://api.github.com/gists/public' @@ -34,22 +35,22 @@ def recent_pastes(conf, input_history): # Get the required amount of entries via pagination for page_num in range(1, page_count + 1): url = '{0}?page={1}&per_page=100'.format(api_uri, page_num) - logging.debug("Fetching page: {0}".format(page_num)) + logger.debug("Fetching page: {0}".format(page_num)) req = requests.get(url, headers=headers) # Check some headers reset_date = datetime.utcfromtimestamp(float(req.headers['X-RateLimit-Reset'])).isoformat() # logging.info("Limit Reset: {0}".format(reset_date)) - logging.info("Remaining Limit: {0}. Resets at {1}".format(req.headers['X-RateLimit-Remaining'], + logger.info("Remaining Limit: {0}. 
Resets at {1}".format(req.headers['X-RateLimit-Remaining'], reset_date)) if req.status_code == 200: result_pages.append(req.json()) if req.status_code == 401: - logging.error("Auth Failed") + logger.error("Auth Failed") elif req.status_code == 403: - logging.error("Login Attempts Exceeded") + logger.error("Login Attempts Exceeded") # Parse results @@ -61,13 +62,13 @@ def recent_pastes(conf, input_history): continue if gist_meta['user'] in gist_user_blacklist: - logging.info("Blacklisting Gist from user: {0}".format(gist_meta['owner']['login'])) + logger.info("Blacklisting Gist from user: {0}".format(gist_meta['owner']['login'])) continue for file_name, file_meta in gist_meta['files'].items(): if file_name in gist_file_blacklist: - logging.info("Blacklisting Paste {0}".format(file_name)) + logger.info("Blacklisting Paste {0}".format(file_name)) continue gist_data = file_meta @@ -83,5 +84,5 @@ def recent_pastes(conf, input_history): # Return results and history return paste_list, history except Exception as e: - logging.error("Unable to parse paste results: {0}".format(e)) + logger.error("Unable to parse paste results: {0}".format(e)) return paste_list, history diff --git a/inputs/pastebin.py b/inputs/pastebin.py index 2fe66bd..fdae9e1 100644 --- a/inputs/pastebin.py +++ b/inputs/pastebin.py @@ -2,6 +2,7 @@ import logging from datetime import datetime +logger = logging.getLogger('pastehunter') def recent_pastes(conf, input_history): # populate vars from config @@ -34,7 +35,7 @@ def recent_pastes(conf, input_history): return paste_list, history except Exception as e: - logging.error("Unable to parse paste results: {0}".format(e)) + logger.error("Unable to parse paste results: {0}".format(e)) return paste_list, history diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index f333d07..3d54bcf 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -3,6 +3,7 @@ from datetime import datetime import logging +logger = logging.getLogger('pastehunter') config = parse_config() @@ -21,7 +22,7 @@ def __init__(self): self.es = Elasticsearch(es_host, port=es_port, http_auth=(es_user, es_pass), use_ssl=es_ssl) self.test = True except Exception as e: - print(e) + logger.error(e) raise Exception('Unable to Connect') from None def store_paste(self, paste_data): @@ -33,10 +34,10 @@ def store_paste(self, paste_data): index_name = '{0}-{1}-{2}'.format(index_name, year_number, week_number) # ToDo: With multiple paste sites a pasteid collision is more likly! 
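         # The paste id is used as the Elasticsearch document id, so a second
         # site reusing an id would silently overwrite the earlier document.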
self.es.index(index=index_name, doc_type='paste', id=paste_data['pasteid'], body=paste_data) - logging.info("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], + logger.debug("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], paste_data['pasteid'], paste_data['YaraRule'] ) ) else: - logging.error("Elastic Search Enabled, not configured!") + logger.error("Elastic Search Enabled, not configured!") diff --git a/outputs/json_output.py b/outputs/json_output.py index 490059b..7158bd2 100644 --- a/outputs/json_output.py +++ b/outputs/json_output.py @@ -1,8 +1,11 @@ import os import datetime import json +import logging from common import parse_config +logger = logging.getLogger('pastehunter') + config = parse_config() class JsonOutput(): @@ -14,7 +17,7 @@ def __init__(self): os.makedirs(base_path) self.test = True except OSError as e: - print("Unable to create Json Path: {0}".format(e)) + logger.error("Unable to create Json Path: {0}".format(e)) self.test = False else: self.test = True @@ -28,4 +31,4 @@ def store_paste(self, paste_data): with open(json_file, 'w') as out: out.write(json.dumps(paste_data, indent=4)) else: - print("JsonOutput Error") + logger.error("JsonOutput Error") diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index c1dad7e..4d9907f 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -9,6 +9,7 @@ import logging from common import parse_config +logger = logging.getLogger('pastehunter') config = parse_config() @@ -17,7 +18,7 @@ def __init__(self): smtp_object = config['outputs']['smtp_output'] self.smtp_host = smtp_object['smtp_host'] self.smtp_port = smtp_object['smtp_port'] - self.smtp_tls = smtp_object['smtp_tls'] + self.smtp_security = smtp_object['smtp_security'] self.smtp_user = smtp_object['smtp_user'] self.smtp_pass = smtp_object['smtp_pass'] if 'recipients' in smtp_object: @@ -30,6 +31,7 @@ def __init__(self): def _send_mail(self, send_to_address, paste_data): + logger.info("crafting email for {0}".format(send_to_address)) # Create the message msg = MIMEMultipart() @@ -54,15 +56,18 @@ def _send_mail(self, send_to_address, paste_data): msg.attach(attachment) # Connect to the SMTP server and send - smtp_conn = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port) + if self.smtp_security == 'ssl': + smtp_conn = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port) + else: + smtp_conn = smtplib.SMTP(self.smtp_host, self.smtp_port) smtp_conn.ehlo() - if self.smtp_tls: + if self.smtp_security == 'tls': smtp_conn.starttls() smtp_conn.login(self.smtp_user, self.smtp_pass) smtp_conn.send_message(msg) smtp_conn.quit() - logging.info("Sent mail to {0} with rules {1}".format(send_to_address, + logger.info("Sent mail to {0} with rules {1}".format(send_to_address, ', '.join(paste_data['YaraRule']))) diff --git a/pastehunter.py b/pastehunter.py index 5c5ba83..2331605 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -14,30 +14,53 @@ from common import parse_config from postprocess import post_email +VERSION = 0.1 + lock = threading.Lock() -# Set some logging options -logging.basicConfig(format='%(levelname)s:%(filename)s:%(message)s', level=logging.INFO) -logging.getLogger('requests').setLevel(logging.ERROR) -logging.getLogger('elasticsearch').setLevel(logging.ERROR) +# Setup Default logging +logger = logging.getLogger('pastehunter') +logger.setLevel(logging.INFO) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(levelname)s:%(filename)s:%(message)s') +ch.setFormatter(formatter) 
+logger.addHandler(ch) + +# Version info +logger.info("Starting PasteHunter Version: {0}".format(VERSION)) -logging.info("Reading Configs") # Parse the config file +logger.info("Reading Configs") conf = parse_config() -logging.info("Configure Inputs") +# Override Log level if needed +if "logging_level" in conf["general"]: + log_level = conf["general"]["logging_level"] +else: + # For old configs + logger.error("Log Level not in config file. Update your base config file!") + log_level = 20 +logger.info("Setting Log Level to {0}".format(log_level)) +logging.getLogger('requests').setLevel(log_level) +logging.getLogger('elasticsearch').setLevel(log_level) +logging.getLogger('pastehunter').setLevel(log_level) + +# Configure Inputs +logger.info("Configure Inputs") input_list = [] for input_type, input_values in conf["inputs"].items(): if input_values["enabled"]: input_list.append(input_values["module"]) - logging.info("Enabled Input: {0}".format(input_type)) + logger.info("Enabled Input: {0}".format(input_type)) -logging.info("Configure Outputs") +# Configure Outputs +logger.info("Configure Outputs") outputs = [] for output_type, output_values in conf["outputs"].items(): if output_values["enabled"]: - logging.info("Enabled Output: {0}".format(output_type)) + logger.info("Enabled Output: {0}".format(output_type)) _module = importlib.import_module(output_values["module"]) _class = getattr(_module, output_values["classname"]) instance = _class() @@ -51,12 +74,12 @@ def yara_index(rule_path, blacklist, test_rules): if filename.endswith('.yar') and filename != 'index.yar': if filename == 'blacklist.yar': if blacklist: - logging.info("Enable Blacklist Rules") + logger.info("Enable Blacklist Rules") else: continue if filename == 'test_rules.yar': if test_rules: - logging.info("Enable Test Rules") + logger.info("Enable Test Rules") else: continue include = 'include "{0}"\n'.format(filename) @@ -70,7 +93,7 @@ def paste_scanner(): # Store the Paste while True: paste_data = q.get() - logging.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them raw_paste_uri = paste_data['scrape_url'] raw_paste_data = requests.get(raw_paste_uri).text @@ -80,7 +103,7 @@ def paste_scanner(): # Scan with yara matches = rules.match(data=raw_paste_data) except Exception as e: - logging.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) q.task_done() continue @@ -106,7 +129,7 @@ def paste_scanner(): # If any of the blacklist rules appear then empty the result set if conf['yara']['blacklist'] and 'blacklist' in results: results = [] - logging.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # Post Process @@ -115,7 +138,7 @@ def paste_scanner(): for post_process, post_values in conf["post_process"].items(): if post_values["enabled"]: if any(i in results for i in post_values["rule_list"]): - logging.info("Running Post Module on {0}".format(paste_data["pasteid"])) + logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) post_module = importlib.import_module(post_values["module"]) post_results = post_module.run(results, raw_paste_data, @@ -151,14 +174,14 @@ def paste_scanner(): try: 
output.store_paste(paste_data) except Exception as e: - logging.error("Unable to store {0}".format(paste_data["pasteid"])) + logger.error("Unable to store {0} to {1}".format(paste_data["pasteid"], e)) # Mark Tasks as complete q.task_done() if __name__ == "__main__": - logging.info("Compile Yara Rules") + logger.info("Compile Yara Rules") try: # Update the yara rules index yara_index(conf['yara']['rule_path'], @@ -185,7 +208,7 @@ def paste_scanner(): try: while True: # Paste History - logging.info("Populating Queue") + logger.info("Populating Queue") if os.path.exists('paste_history.tmp'): with open('paste_history.tmp') as json_file: paste_history = json.load(json_file) @@ -200,13 +223,13 @@ def paste_scanner(): i = importlib.import_module(input_name) # Get list of recent pastes - logging.info("Fetching paste list from {0}".format(input_name)) + logger.info("Fetching paste list from {0}".format(input_name)) paste_list, history = i.recent_pastes(conf, input_history) for paste in paste_list: q.put(paste) paste_history[input_name] = history - logging.debug("Writing History") + logger.debug("Writing History") # Write History with open('paste_history.tmp', 'w') as outfile: json.dump(paste_history, outfile) @@ -215,8 +238,8 @@ def paste_scanner(): q.join() # Slow it down a little - logging.info("Sleeping for " + str(conf['general']['run_frequency']) + " Seconds") + logger.info("Sleeping for " + str(conf['general']['run_frequency']) + " Seconds") sleep(conf['general']['run_frequency']) except KeyboardInterrupt: - logging.info("Stopping Threads") + logger.info("Stopping Threads") diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index cc7ef5f..fe3b383 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -9,6 +9,7 @@ from common import parse_config conf = parse_config() +logger = logging.getLogger('pastehunter') def run(results, raw_paste_data, paste_object): @@ -43,7 +44,7 @@ def run(results, raw_paste_data, paste_object): encoded = uncompressed.encode('utf-8') paste_object["decompressed_stream"] = encoded except Exception as e: - logging.error("Unable to decompress gzip stream") + logger.error("Unable to decompress gzip stream") if rule == 'b64_exe': try: raw_exe = b64decode(raw_paste_data) @@ -56,13 +57,13 @@ def run(results, raw_paste_data, paste_object): # Cuckoo if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: - logging.info("Submitting to Cuckoo") + logger.info("Submitting to Cuckoo") try: task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) paste_object["Cuckoo Task ID"] = task_id - logging.info("exe submitted to Cuckoo with task id {0}".format(task_id)) + logger.info("exe submitted to Cuckoo with task id {0}".format(task_id)) except Exception as e: - logging.error("Unabled to submit sample to cuckoo") + logger.error("Unabled to submit sample to cuckoo") # Viper if conf["post_process"]["post_b64"]["viper"]["enabled"]: @@ -71,7 +72,7 @@ def run(results, raw_paste_data, paste_object): # VirusTotal except Exception as e: - logging.error("Unable to decode exe file") + logger.error("Unable to decode exe file") # Get unique domain count @@ -95,6 +96,6 @@ def send_to_cuckoo(raw_exe, pasteid): try: task_id = submit_file['task_ids'][0] except KeyError: - logging.error(submit_file) + logger.error(submit_file) return task_id diff --git a/settings.json.sample b/settings.json.sample index 6ca74d9..e49f71b 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -66,7 +66,7 @@ "classname": "SMTPOutput", "smtp_host": "smtp.server.com", "smtp_port": 25, - 
"smtp_tls": true, + "smtp_security": "starttls", "smtp_user": "smtpusername", "smtp_pass": "smtppassword", "recipients" : { From f4a762f0ff26e40e12c6907adeb61bc20849a139 Mon Sep 17 00:00:00 2001 From: localadmin Date: Thu, 26 Apr 2018 22:17:43 +0100 Subject: [PATCH 057/178] remove config file --- .vscode/sftp.json | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .vscode/sftp.json diff --git a/.vscode/sftp.json b/.vscode/sftp.json deleted file mode 100644 index a1b5c2f..0000000 --- a/.vscode/sftp.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "protocol": "sftp", - "host": "192.168.1.166", - "username": "localadmin", - "remotePath": "/home/localadmin/pastehunter_dev/" -} From 188572cbecc9c3e89f3cb44b0246b819fa8e8985 Mon Sep 17 00:00:00 2001 From: thehermit Date: Fri, 4 May 2018 19:46:38 +0100 Subject: [PATCH 058/178] update settings to include log level --- .gitignore | 2 +- settings.json.sample | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 683bd90..eb5870e 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,4 @@ ENV/ /settings.json /.idea /postprocess/tester.py -.vscode/settings.json +.vscode/ diff --git a/settings.json.sample b/settings.json.sample index e49f71b..3ff8856 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -90,6 +90,7 @@ }, "general": { "run_frequency": 300 + "logging_level": 20 }, "post_process": { "post_email": { From 26c0e297eff4cac82e9b291aed61b485d667e8ea Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Tue, 19 Jun 2018 12:54:34 +0200 Subject: [PATCH 059/178] Added slexy.org parser --- inputs/slexy.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 inputs/slexy.py diff --git a/inputs/slexy.py b/inputs/slexy.py new file mode 100644 index 0000000..791668e --- /dev/null +++ b/inputs/slexy.py @@ -0,0 +1,88 @@ +import logging +import re +import urllib.request as urllib +from datetime import datetime + +logger = logging.getLogger('pastehunter') + + +class SlexySite(object): + + def __init__(self): + self.site = "slexy.org" + url_slexy = "http://" + self.site + self.url_recent = url_slexy + "/recent" + self.url_view = url_slexy + "/view" + self.url_raw = url_slexy + "/raw" + + def view_link(self, pid): + return "%s/%s" % (self.url_view, pid) + + def raw_link(self, pid, args): + return "%s/%s%s" % (self.url_raw, pid, args) + + +class SlexyPaste(SlexySite): + def __init__(self, pid): + super(SlexyPaste, self).__init__() + self.pid = pid + self.site = self.site + self.url = None + self.timestamp = None + self.parse() + + def parse(self): + data = urllib.urlopen(self.view_link(self.pid)).read().decode('utf-8') + self.url = self.get_raw_link(data) + self.timestamp = self.get_timestamp(data) + + def get_raw_link(self, data): + pattern = '', getdata) + recents = [] + for pid in pids: + recents.append(SlexyPaste(pid)) + return recents + + +def recent_pastes(conf, input_history): + history = [] + paste_list = [] + try: + my_scraper = SlexyScraper() + for paste in my_scraper.get_recents(): + history.append(paste.pid) + if paste.pid in input_history: + continue + paste_data = {} + paste_data['scrape_url'] = paste.url + paste_data['pasteid'] = paste.pid + paste_data['pastesite'] = paste.site + paste_data['@timestamp'] = paste.timestamp + paste_list.append(paste_data) + return paste_list, history + except Exception as e: + logger.error("Unable to parse paste results: %s", e) + return paste_list, history From 09759ae28ac97590b7c641d43c04bf661981c568 Mon Sep 17 
00:00:00 2001 From: kovacsbalu Date: Wed, 20 Jun 2018 14:06:58 +0200 Subject: [PATCH 060/178] Added slexy to example settings --- settings.json.sample | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/settings.json.sample b/settings.json.sample index 3ff8856..371a2fa 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -24,6 +24,11 @@ "store_all": false, "user_blacklist": [], "file_blacklist": ["grahamcofborg-eval-package-list"] + }, + "slexy":{ + "enabled": true, + "module": "inputs.slexy", + "store_all": false } }, "outputs": { From 1cd3af732ab9171498946cf71b014140b6be26f2 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Wed, 20 Jun 2018 15:30:56 +0200 Subject: [PATCH 061/178] Added better emaail filter pattern --- YaraRules/email_filter.yar | 19 +++++++++++++++++++ YaraRules/index.yar | 1 + 2 files changed, 20 insertions(+) create mode 100644 YaraRules/email_filter.yar diff --git a/YaraRules/email_filter.yar b/YaraRules/email_filter.yar new file mode 100644 index 0000000..8d546bd --- /dev/null +++ b/YaraRules/email_filter.yar @@ -0,0 +1,19 @@ +/* + These rules attempt to find email leaks +*/ + +rule email_filter +{ + meta: + author = "@kovacsbalu" + info = "Better email pattern" + reference = "https://github.com/securenetworx/PasteHunter/tree/fix-email-filter" + + strings: + $email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/ + condition: + #email_add > 20 + +} + + diff --git a/YaraRules/index.yar b/YaraRules/index.yar index 9068a02..60b074f 100644 --- a/YaraRules/index.yar +++ b/YaraRules/index.yar @@ -8,3 +8,4 @@ include "hak5.yar" include "custom_keywords.yar" include "general.yar" include "powershell.yar" +include "email_filter.yar" \ No newline at end of file From fa22ad6e3acfb13ead96ec616489c5a0d45a0a0b Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Tue, 26 Jun 2018 13:10:06 +0200 Subject: [PATCH 062/178] Fix #39 missing comma --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index 3ff8856..4a4a375 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -89,7 +89,7 @@ "test_rules": false }, "general": { - "run_frequency": 300 + "run_frequency": 300, "logging_level": 20 }, "post_process": { From 89d0efeba00c2bd1002c2b0eb8bd647be3fbdeb0 Mon Sep 17 00:00:00 2001 From: sfinlon Date: Wed, 25 Jul 2018 23:56:51 -0400 Subject: [PATCH 063/178] add date header to the smtp output --- outputs/smtp_output.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 4d9907f..6ade375 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py @@ -4,6 +4,7 @@ import email.mime.base import email.mime.multipart import email.mime.text +from email.utils import formatdate from email.mime.multipart import MIMEMultipart import json import logging @@ -38,6 +39,7 @@ def _send_mail(self, send_to_address, paste_data): msg['Subject'] = 'PasteHunter Alert {0}'.format(', '.join(paste_data['YaraRule'])) msg['From'] = self.smtp_user msg['To'] = send_to_address + msg["Date"] = formatdate(localtime=True) # Attach the body body = 'Rules : {0}\n' \ From ec0460fe8891c3a982991fb29e988f721aa9561a Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Thu, 18 Oct 2018 12:52:12 +0200 Subject: [PATCH 064/178] Added slexy.org --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 662f9b7..c69ed8f 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Pastehunter currently has support for the following sites: - 
pastebin.com - dumpz.org - gist.github.com + - slexy.org Support for the following sites is listed as ToDo: - paste.ee From 4fc4f7d9a26d623d8851593a9c43d16c3d7492ff Mon Sep 17 00:00:00 2001 From: Spencer Thomason Date: Thu, 18 Oct 2018 08:56:37 -0700 Subject: [PATCH 065/178] Update to pip3 / python3 #add python3-pip here .. apt-get -y install automake libtool make gcc git python3-pip && \ #change this to pip3 here pip3 install yara-python && \ #update to the latest yara here: wget https://github.com/VirusTotal/yara/archive/v3.8.1.tar.gz -O yara.tar.gz && \ #maybe change to a version variable at the top so that we don't have to change all the way thru? Otherwise will have to change all the way thru RUN cd yara-3.8.1 && \ #change to pip3 here RUN pip3 install -r requirements.txt #change to python3 here CMD ["/usr/src/wait-for-it/wait-for-it.sh","-t", "0","172.16.10.10:9200","--", "python3", "pastehunter.py"] --- Dockerfile | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 020e44a..daed46c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,12 @@ FROM python:3 -RUN apt-get install automake libtool make gcc git && \ -pip install yara-python && \ -wget https://github.com/VirusTotal/yara/archive/v3.6.3.tar.gz -O yara.tar.gz && \ +RUN apt-get update && \ +apt-get -y install automake libtool make gcc git python3-pip && \ +pip3 install yara-python && \ +wget https://github.com/VirusTotal/yara/archive/v3.8.1.tar.gz -O yara.tar.gz && \ tar -zxf yara.tar.gz -RUN cd yara-3.6.3 && \ +RUN cd yara-3.8.1 && \ ./bootstrap.sh && \ ./configure && \ make && \ @@ -18,6 +19,7 @@ chmod +x /usr/src/wait-for-it/wait-for-it.sh WORKDIR /usr/src/pastehunter COPY . ./ -RUN pip install -r requirements.txt +RUN pip3 install -r requirements.txt + +CMD ["/usr/src/wait-for-it/wait-for-it.sh","-t", "0","172.16.10.10:9200","--", "python3", "pastehunter.py"] -CMD ["/usr/src/wait-for-it/wait-for-it.sh","-t", "0","172.16.10.10:9200","--", "python", "pastehunter.py"] From ee0c01e0d42c2bbd32b1a82c9d9fa128263350b0 Mon Sep 17 00:00:00 2001 From: Spencer Thomason Date: Thu, 18 Oct 2018 09:25:21 -0700 Subject: [PATCH 066/178] Updated to ELK 6.4.2 and cleanup Added the container_name to kibana as it would grab something else wonky --- docker-compose.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0f22642..11c3fa6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ services: - "elasticsearch" - "kibana" elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:6.1.0 + image: docker.elastic.co/elasticsearch/elasticsearch:6.4.2 container_name: elasticsearch environment: - cluster.name=docker-cluster @@ -32,7 +32,8 @@ services: ipv4_address: 172.16.10.10 kibana: - image: docker.elastic.co/kibana/kibana:6.1.0 + image: docker.elastic.co/kibana/kibana:6.4.2 + container_name: kibana ports: - "127.0.0.1:5601:5601" networks: @@ -50,3 +51,4 @@ networks: ipam: config: - subnet: 172.16.10.0/24 + From 3af02aedb5b53ee9188b209752267ecd85ef8d95 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Mon, 5 Nov 2018 14:03:13 +0100 Subject: [PATCH 067/178] Fill referer and user-agent --- inputs/slexy.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index 791668e..791f48a 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -16,10 +16,20 @@ def __init__(self): self.url_raw = url_slexy + "/raw" def view_link(self, pid): - 
return "%s/%s" % (self.url_view, pid) + return self.create_req("%s/%s" % (self.url_view, pid)) def raw_link(self, pid, args): - return "%s/%s%s" % (self.url_raw, pid, args) + return self.create_req("%s/%s%s" % (self.url_raw, pid, args)) + + def create_req(self, url): + return urllib.Request( + url, + data=None, + headers={ + 'Referer': self.url_recent, + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36' + } + ) class SlexyPaste(SlexySite): From 77e64427aafd2750f7ce368b8919e4b402140947 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Mon, 5 Nov 2018 14:09:28 +0100 Subject: [PATCH 068/178] Timeout added --- inputs/slexy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index 791f48a..8235a88 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -42,7 +42,7 @@ def __init__(self, pid): self.parse() def parse(self): - data = urllib.urlopen(self.view_link(self.pid)).read().decode('utf-8') + data = urllib.urlopen(self.view_link(self.pid), timeout=10).read().decode('utf-8') self.url = self.get_raw_link(data) self.timestamp = self.get_timestamp(data) @@ -52,7 +52,7 @@ def get_raw_link(self, data): return self.raw_link(self.pid, token) def get_raw_data(self): - return urllib.urlopen(self.url).read().decode('utf-8') + return urllib.urlopen(self.url, timeout=10).read().decode('utf-8') def get_timestamp(self, data): pattern = 'Timestamp: (.*?)' From 7315566860cab2ec6ec809a67ec81dc092214a7a Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Mon, 5 Nov 2018 14:17:03 +0100 Subject: [PATCH 069/178] Remove duplicated ids --- inputs/slexy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index 8235a88..04e530b 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -72,7 +72,7 @@ def get_recents(self): getdata = urllib.urlopen(self.url_recent).read().decode('utf-8') pids = re.findall('', getdata) recents = [] - for pid in pids: + for pid in list(set(pids)): recents.append(SlexyPaste(pid)) return recents From 6333c4961e6dcce25fa967eec5e0fc56af4cdfdb Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Mon, 5 Nov 2018 15:02:38 +0100 Subject: [PATCH 070/178] Fix scrape url --- inputs/slexy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index 04e530b..ca258b3 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -87,7 +87,7 @@ def recent_pastes(conf, input_history): if paste.pid in input_history: continue paste_data = {} - paste_data['scrape_url'] = paste.url + paste_data['scrape_url'] = paste.url.full_url paste_data['pasteid'] = paste.pid paste_data['pastesite'] = paste.site paste_data['@timestamp'] = paste.timestamp From 3b75c38e4cce73d1e174f78e772d74cdb4635633 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Mon, 5 Nov 2018 15:03:03 +0100 Subject: [PATCH 071/178] Fix for #52 Process only new pastes --- inputs/slexy.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index ca258b3..8bdcde8 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -71,21 +71,24 @@ def __init__(self): def get_recents(self): getdata = urllib.urlopen(self.url_recent).read().decode('utf-8') pids = re.findall('', getdata) - recents = [] - for pid in list(set(pids)): - recents.append(SlexyPaste(pid)) - return recents + return list(set(pids)) def recent_pastes(conf, input_history): history = [] paste_list = [] + my_scraper = SlexyScraper() + 
recent_pids = my_scraper.get_recents() + pid_to_process = set() + for pid in recent_pids: + if pid in input_history: + history.append(pid) + else: + pid_to_process.add(pid) try: - my_scraper = SlexyScraper() - for paste in my_scraper.get_recents(): + for pid in pid_to_process: + paste = SlexyPaste(pid) history.append(paste.pid) - if paste.pid in input_history: - continue paste_data = {} paste_data['scrape_url'] = paste.url.full_url paste_data['pasteid'] = paste.pid From 5d106c9fdce09a1cf05c7554bd31e8129f17db22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=AAn=20To=C3=A1n?= Date: Wed, 21 Nov 2018 23:13:38 +0700 Subject: [PATCH 072/178] Set Elasticsearch corresponding docker-compose config --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index a15e840..ac24869 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -37,7 +37,7 @@ "module": "outputs.elastic_output", "classname": "ElasticOutput", "elastic_index": "paste-test", - "elastic_host": "192.168.1.22", + "elastic_host": "172.16.10.10", "elastic_port": 9200, "elastic_user": "elastic", "elastic_pass": "changeme", From a0797f60fa6d948d8235455f5846dc5915b88d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=AAn=20To=C3=A1n?= Date: Wed, 21 Nov 2018 23:16:49 +0700 Subject: [PATCH 073/178] Disable X-Pack because of missing license --- docker-compose.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 0f22642..ece69f9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,13 @@ services: - cluster.name=docker-cluster - bootstrap.memory_lock=true - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - xpack.security.enabled=false + - xpack.monitoring.enabled=false + - xpack.ml.enabled=false + - xpack.graph.enabled=false + - xpack.watcher.enabled=false + - discovery.zen.minimum_master_nodes=1 + - discovery.type=single-node ulimits: memlock: soft: -1 From dbd030b3b9cd7b0592db7933f687294079e9717c Mon Sep 17 00:00:00 2001 From: thehermit Date: Thu, 27 Dec 2018 20:55:02 +0000 Subject: [PATCH 074/178] Add IP Check error state to pastebin collector --- inputs/pastebin.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inputs/pastebin.py b/inputs/pastebin.py index fdae9e1..93b9cbc 100644 --- a/inputs/pastebin.py +++ b/inputs/pastebin.py @@ -16,6 +16,11 @@ def recent_pastes(conf, input_history): # Get some pastes and convert to json # Get last 'paste_limit' pastes paste_list_request = requests.get(scrape_uri) + + # Check to see if our IP is whitelisted or not. 
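+    # Pastebin's scraping endpoint returns an error page rather than JSON
+    # when the requesting IP has not been whitelisted, so fail fast here.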
+    if 'DOES NOT HAVE ACCESS' in paste_list_request.text:
+        logger.error("Your IP is not whitelisted. Visit 'https://pastebin.com/doc_scraping_api'")
+        return [], []
     paste_list_json = paste_list_request.json()
 
     for paste in paste_list_json:

From d028234da97597ecd1004ab83e0038fe4162a754 Mon Sep 17 00:00:00 2001
From: thehermit
Date: Thu, 27 Dec 2018 22:52:47 +0000
Subject: [PATCH 075/178] switch to multiprocessing instead of threading

---
 .gitignore     |  1 +
 pastehunter.py | 55 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index eb5870e..be347da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -106,3 +106,4 @@ ENV/
 /.idea
 /postprocess/tester.py
 .vscode/
+logs/
\ No newline at end of file
diff --git a/pastehunter.py b/pastehunter.py
index 2331605..3577f86 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -6,17 +6,18 @@
 import json
 import hashlib
 import requests
-import threading
+import multiprocessing
 import importlib
 import logging
+import time
 from time import sleep
-from queue import Queue
+#from queue import Queue
 from common import parse_config
 from postprocess import post_email
 
-VERSION = 0.1
+from multiprocessing import Queue
 
-lock = threading.Lock()
+VERSION = 0.2
 
 # Setup Default logging
 logger = logging.getLogger('pastehunter')
@@ -93,19 +94,20 @@ def paste_scanner():
     # Store the Paste
     while True:
         paste_data = q.get()
+
+        # Start a timer
+        start_time = time.time()
         logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid']))
         # get raw paste and hash them
         raw_paste_uri = paste_data['scrape_url']
         raw_paste_data = requests.get(raw_paste_uri).text
         # Process the paste data here
-
         try:
             # Scan with yara
             matches = rules.match(data=raw_paste_data)
         except Exception as e:
             logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
-            q.task_done()
-            continue
+            return
 
         results = []
         for match in matches:
@@ -141,9 +143,9 @@
                 logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"]))
                 post_module = importlib.import_module(post_values["module"])
                 post_results = post_module.run(results,
-                                               raw_paste_data,
-                                               paste_data
-                                               )
+                                                raw_paste_data,
+                                                paste_data
+                                                )
 
         # Throw everything back to paste_data for ease.
         paste_data = post_results
@@ -175,9 +177,13 @@
             output.store_paste(paste_data)
         except Exception as e:
             logger.error("Unable to store {0} to {1}".format(paste_data["pasteid"], e))
+
+        end_time = time.time()
+        logger.debug("Processing Finished for {0} in {1} seconds".format(
+            paste_data["pasteid"],
+            (end_time - start_time)
+        ))
 
-        # Mark Tasks as complete
-        q.task_done()
 
 
 if __name__ == "__main__":
@@ -197,16 +203,19 @@
     # Create Queue to hold paste URI's
     q = Queue()
+    processes = []
 
-    # Threads
+    # Start worker processes rather than threads
     for i in range(5):
-        t = threading.Thread(target=paste_scanner)
-        t.daemon = True
-        t.start()
+        m = multiprocessing.Process(target=paste_scanner)
+        # Add the new process to the list so we can join on them later.
+        processes.append(m)
+        m.start()
 
     # Now Fill the Queue
     try:
         while True:
+            queue_count = 0
             # Paste History
             logger.info("Populating Queue")
             if os.path.exists('paste_history.tmp'):
                 with open('paste_history.tmp') as json_file:
                     paste_history = json.load(json_file)
@@ -227,19 +236,27 @@
             paste_list, history = i.recent_pastes(conf, input_history)
             for paste in paste_list:
                 q.put(paste)
+                queue_count += 1
             paste_history[input_name] = history
 
             logger.debug("Writing History")
             # Write History
             with open('paste_history.tmp', 'w') as outfile:
                 json.dump(paste_history, outfile)
+            logger.info("Added {0} Items to the queue".format(queue_count))
 
-            # Flush the list
-            q.join()
+            for proc in processes:
+                proc.join(2)
 
             # Slow it down a little
             logger.info("Sleeping for " + str(conf['general']['run_frequency']) + " Seconds")
             sleep(conf['general']['run_frequency'])
+
+
     except KeyboardInterrupt:
-        logger.info("Stopping Threads")
+        logger.info("Stopping Processes")
+        for proc in processes:
+            proc.terminate()
+            proc.join()
+

From a0545e0215c06104b2d809591844a1b49ec1ea9d Mon Sep 17 00:00:00 2001
From: thehermit
Date: Thu, 27 Dec 2018 22:54:46 +0000
Subject: [PATCH 076/178] dumpz API has changed, disable it by default

---
 settings.json.sample | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/settings.json.sample b/settings.json.sample
index a15e840..99838cb 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -9,7 +9,8 @@
         "store_all": false
     },
     "dumpz": {
-        "enabled": true,
+        "enabled": false,
+        "comment": "This api endpoint has been removed.",
         "module": "inputs.dumpz",
         "api_scrape": "https://dumpz.org/api/recent",
         "api_raw": "https://dumpz.org/api/dump",
@@ -28,7 +29,10 @@
     "slexy":{
         "enabled": true,
         "module": "inputs.slexy",
-        "store_all": false
+        "store_all": false,
+        "api_scrape": "http://slexy.org/recent",
+        "api_raw": "http://slexy.org/raw",
+        "api_view": "http://slexy.org/view"
     }
 },
 "outputs": {

From f5d49cdd33bc04c4b29be20fd7dc034d9a1e2c3b Mon Sep 17 00:00:00 2001
From: thehermit
Date: Thu, 27 Dec 2018 23:49:02 +0000
Subject: [PATCH 077/178] Add slack output for selected rules

---
 outputs/slack_output.py | 51 +++++++++++++++++++++++++++++++++++++++++
 pastehunter.py          |  2 +-
 settings.json.sample    |  7 ++++++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 outputs/slack_output.py

diff --git a/outputs/slack_output.py b/outputs/slack_output.py
new file mode 100644
index 0000000..91e5f3f
--- /dev/null
+++ b/outputs/slack_output.py
@@ -0,0 +1,51 @@
+import os
+import datetime
+import json
+import logging
+import requests
+from common import parse_config
+
+logger = logging.getLogger('pastehunter')
+
+config = parse_config()
+
+
+class SlackOutput():
+    def __init__(self):
+        self.valid = True
+        self.webhook_url = config['outputs']['slack_output']['webhook_url']
+        self.accepted_rules = config['outputs']['slack_output']['rule_list']
+
+        if self.webhook_url == '':
+            logger.error("Slack Webhook not configured")
+            self.valid = False
+        if len(self.accepted_rules) == 0:
+            logger.error("No Rules configured to alert")
+
+    def store_paste(self, paste_data):
+        if self.valid:
+            send = False
+
+            for rule in self.accepted_rules:
+                if rule in paste_data['YaraRule']:
+                    send = True
+
+            if send:
+                json_data = {
+                    "text": "Pastehunter alert!",
+                    "attachments": [
+                        {
+                            "fallback": "PasteHunter Alert",
+                            "author_name": "PasteHunter",
+                            "title": "Paste ID {0}".format(paste_data['pasteid']),
+                            "text": "Yara Rule {0} Found on {1}".format(paste_data['YaraRule'], paste_data['pastesite'])
+                        }
+                    ]
+                }
+
+                req = requests.post(self.webhook_url, json=json_data)
+                if
req.status_code == 200 and req.text == 'ok': + logger.debug("Paste sent to slack") + else: + logger.error( + "Failed to post to slack Status Code {0}".format(req.status_code)) diff --git a/pastehunter.py b/pastehunter.py index 3577f86..3efc700 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -176,7 +176,7 @@ def paste_scanner(): try: output.store_paste(paste_data) except Exception as e: - logger.error("Unable to store {0} to {1}".format(paste_data["pasteid"], e)) + logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) end_time = time.time() logger.debug("Processing Finished for {0} in {1} seconds".format( diff --git a/settings.json.sample b/settings.json.sample index 99838cb..cf9ebf1 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -90,6 +90,13 @@ "mandatory_rule_list": ["keyword1", "keyword2"] } } + }, + "slack_output": { + "enabled": true, + "module": "outputs.slack_output", + "classname": "SlackOutput", + "webhook_url": "", + "rule_list": ["custom_keywords"] } }, "yara": { From 795dabcc290db0f2efb269752fdf1662be63b38f Mon Sep 17 00:00:00 2001 From: thehermit Date: Fri, 28 Dec 2018 00:11:45 +0000 Subject: [PATCH 078/178] Add Entropy calculation to pastes --- pastehunter.py | 2 +- postprocess/post_entropy.py | 16 ++++++++++++++++ settings.json.sample | 7 ++++++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 postprocess/post_entropy.py diff --git a/pastehunter.py b/pastehunter.py index 3efc700..d9ad5ea 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -139,7 +139,7 @@ def paste_scanner(): post_results = paste_data for post_process, post_values in conf["post_process"].items(): if post_values["enabled"]: - if any(i in results for i in post_values["rule_list"]): + if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) post_module = importlib.import_module(post_values["module"]) post_results = post_module.run(results, diff --git a/postprocess/post_entropy.py b/postprocess/post_entropy.py new file mode 100644 index 0000000..ca12b67 --- /dev/null +++ b/postprocess/post_entropy.py @@ -0,0 +1,16 @@ +import re +import math +from collections import Counter + +def shannon_entropy(s): + # https://rosettacode.org/wiki/Entropy#Python + s = str(s) + p, lns = Counter(s), float(len(s)) + return -sum(count / lns * math.log(count / lns, 2) for count in p.values()) + + +def run(results, raw_paste_data, paste_object): + # Calculate the Shannon Entropy for the raw paste + paste_object["Shannon Entropy"] = shannon_entropy(raw_paste_data) + # Send the updated json back + return paste_object diff --git a/settings.json.sample b/settings.json.sample index cf9ebf1..54aec48 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -127,7 +127,12 @@ "enabled": false, "api_host": "127.0.0.1", "api_port": 8080 - } + }, + "post_entropy": { + "enabled": true, + "module": "postprocess.post_entropy", + "rule_list": ["ALL"] + } } } } From e389fb7efd1983b4ae9e69bac9677ad63c5700fd Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 29 Dec 2018 15:02:37 +0000 Subject: [PATCH 079/178] Pastebin cache check --- pastehunter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index d9ad5ea..4b794f6 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -101,13 +101,22 @@ def paste_scanner(): # get raw paste and hash them raw_paste_uri = 
paste_data['scrape_url'] raw_paste_data = requests.get(raw_paste_uri).text + + # Pastebin Cache + if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.": + logger.info("Paste is still cached sleeping to try again") + sleep(45) + # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] + raw_paste_data = requests.get(raw_paste_uri).text + # Process the paste data here try: # Scan with yara matches = rules.match(data=raw_paste_data) except Exception as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - return + continue results = [] for match in matches: From 79875a5f8ca0c69ea138fee7d252ecdbce083ea4 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 29 Dec 2018 15:03:01 +0000 Subject: [PATCH 080/178] Update powershell and add more certificates to a new rule file --- YaraRules/certificates.yar | 27 +++++++++++++++++++++++++++ YaraRules/core_keywords.yar | 7 ------- YaraRules/powershell.yar | 3 +++ 3 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 YaraRules/certificates.yar diff --git a/YaraRules/certificates.yar b/YaraRules/certificates.yar new file mode 100644 index 0000000..99a4ba9 --- /dev/null +++ b/YaraRules/certificates.yar @@ -0,0 +1,27 @@ +/* + This rule will match any of the keywords in the list +*/ + +rule core_keywords +{ + meta: + author = "@KevTheHermit" + info = "Part of PasteHunter" + reference = "https://github.com/kevthehermit/PasteHunter" + + strings: + $enabled_sec = "enable secret" wide ascii nocase + $enable_pass = "enable password" wide ascii nocase + $ssh_priv = "BEGIN RSA PRIVATE KEY" wide ascii nocase + $openssh_priv = "BEGIN OPENSSH PRIVATE KEY" wide ascii nocase + $dsa_priv = "BEGIN DSA PRIVATE KEY" wide ascii nocase + $ec_priv = "BEGIN EC PRIVATE KEY" wide ascii nocase + $pgp_priv = "BEGIN PGP PRIVATE KEY" wide ascii nocase + $pem_cert = "BEGIN CERTIFICATE" wide ascii nocase + $pkcs7 = "BEGIN PKCS7" + + condition: + any of them + +} + diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 50c67e3..10e23ff 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -12,13 +12,6 @@ rule core_keywords strings: $tango_down = "TANGO DOWN" wide ascii nocase $antisec = "antisec" wide ascii nocase - $enabled_sec = "enable secret" wide ascii nocase - $enable_pass = "enable password" wide ascii nocase - $ssh_priv = "BEGIN RSA PRIVATE KEY" wide ascii nocase - $openssh_priv = "BEGIN OPENSSH PRIVATE KEY" wide ascii nocase - $dsa_priv = "BEGIN DSA PRIVATE KEY" wide ascii nocase - $ec_priv = "BEGIN EC PRIVATE KEY" wide ascii nocase - $pgp_priv = "BEGIN PGP PRIVATE KEY" wide ascii nocase $hacked = "hacked by" wide ascii nocase $onion_url = /.*.\.onion/ condition: diff --git a/YaraRules/powershell.yar b/YaraRules/powershell.yar index 8d88209..a0d4e99 100644 --- a/YaraRules/powershell.yar +++ b/YaraRules/powershell.yar @@ -19,6 +19,9 @@ rule powershell $g = "invoke" nocase $h = "bitsadmin" nocase $i = "certutil -decode" nocase + $j = "hidden" nocase + $k = "nop" nocase + $l = "-e" nocase condition: 4 of them From 709132b732075f379657fbf7d7d7d45d394dd368 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 29 Dec 2018 15:05:02 +0000 Subject: [PATCH 081/178] add slack hooks to rule --- YaraRules/api_keys.yar | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/YaraRules/api_keys.yar b/YaraRules/api_keys.yar index c0e4189..1b1a252 100644 --- a/YaraRules/api_keys.yar +++ b/YaraRules/api_keys.yar @@ -48,7 +48,7 @@ rule 
google_api strings: $a = /\bAIza.{35}\b/ condition: - all of them + any of them } rule slack_api @@ -60,8 +60,9 @@ rule slack_api strings: $a = /(xox(p|b|o|a)-[0-9]{9,12}-[0-9]{9,12}-[0-9]{9,12}-[a-z0-9]{32})/ + $b = "hooks.slack.com" nocase condition: - all of them + any of them } rule github_api @@ -74,7 +75,7 @@ rule github_api strings: $a = /[g|G][i|I][t|T][h|H][u|U][b|B].*[[\'|"]0-9a-zA-Z]{35,40}[\'|"]/ condition: - all of them + any of them } rule aws_api @@ -87,7 +88,7 @@ rule aws_api strings: $a = /AKIA[0-9A-Z]{16}/ condition: - all of them + any of them } rule heroku_api @@ -100,6 +101,5 @@ rule heroku_api strings: $a = /[h|H][e|E][r|R][o|O][k|K][u|U].*[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}/ condition: - all of them + any of them } - From 8100547700e346b09871f5686e3c93d98e56c86b Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 29 Dec 2018 15:07:36 +0000 Subject: [PATCH 082/178] fix duplicate rule name --- YaraRules/certificates.yar | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/YaraRules/certificates.yar b/YaraRules/certificates.yar index 99a4ba9..7378524 100644 --- a/YaraRules/certificates.yar +++ b/YaraRules/certificates.yar @@ -1,8 +1,8 @@ /* - This rule will match any of the keywords in the list + This rule will look for common encoded certificates and secrets */ -rule core_keywords +rule certificates { meta: author = "@KevTheHermit" From 333f4806c0a6b7a6a4df9956365b91f633fc597e Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Fri, 4 Jan 2019 17:08:39 -0400 Subject: [PATCH 083/178] Handle SSLErrors on paste data fetch Signed-off-by: secbug <9038419+secbug@users.noreply.github.com> --- pastehunter.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 4b794f6..1d43fa2 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -100,7 +100,12 @@ def paste_scanner(): logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them raw_paste_uri = paste_data['scrape_url'] - raw_paste_data = requests.get(raw_paste_uri).text + # Cover fetch site SSLErrors + try: + raw_paste_data = requests.get(raw_paste_uri).text + except requests.exceptions.SSLError as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + continue # Pastebin Cache if raw_paste_data == "File is not ready for scraping yet. 
Try again in 1 minute.": @@ -108,8 +113,12 @@ def paste_scanner(): sleep(45) # get raw paste and hash them raw_paste_uri = paste_data['scrape_url'] - raw_paste_data = requests.get(raw_paste_uri).text - + # Cover fetch site SSLErrors + try: + raw_paste_data = requests.get(raw_paste_uri).text + except requests.exceptions.SSLError as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + continue # Process the paste data here try: # Scan with yara From 1e08534f9ba77b9a7c7b0b350d06488cfae3185d Mon Sep 17 00:00:00 2001 From: thehermit Date: Tue, 15 Jan 2019 21:16:55 +0000 Subject: [PATCH 084/178] add nmap rule to core keywords --- YaraRules/core_keywords.yar | 1 + 1 file changed, 1 insertion(+) diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 10e23ff..4da6c76 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -14,6 +14,7 @@ rule core_keywords $antisec = "antisec" wide ascii nocase $hacked = "hacked by" wide ascii nocase $onion_url = /.*.\.onion/ + $nmap_scan = "Nmap scan report for" wide ascii nocase condition: any of them From 122207b340c4599606256d38925ccddc174f3f5e Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Tue, 22 Jan 2019 15:17:52 -0400 Subject: [PATCH 085/178] Get Rid Of Large Log Dumps Getting rid of java and minecraft related log dumps that take a lot of processing time and energy. --- YaraRules/blacklist.yar | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/YaraRules/blacklist.yar b/YaraRules/blacklist.yar index c783711..117e526 100644 --- a/YaraRules/blacklist.yar +++ b/YaraRules/blacklist.yar @@ -9,6 +9,11 @@ rule blacklist $a = "#EXTINF:" nocase // IPTV stream Lists. $b = "--app-name=LeagueClient" nocase // League of Legends Debug Log $c = "common.application_name: LeagueClient" // League of Legends Debug Log + $d = /java\.(util|lang|io)/ // Minecraft and java errors + $e = "Traceback (most recent call last)" + $f = /define\(.*?\)|require_once\(.*?\)/ + $g = "Technic Launcher is starting" // Minecraft mod dumps + $h = "OTL logfile created on" // condition: any of them From 442f511b888253da670288d4b325d0f339f09ecd Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:19:56 +0000 Subject: [PATCH 086/178] remove choclety logs from powershell --- YaraRules/powershell.yar | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/YaraRules/powershell.yar b/YaraRules/powershell.yar index a0d4e99..09681a7 100644 --- a/YaraRules/powershell.yar +++ b/YaraRules/powershell.yar @@ -10,19 +10,22 @@ rule powershell reference = "https://github.com/kevthehermit/PasteHunter" strings: - $a = "powershell" nocase - $b = "IEX" nocase - $c = "new-object" nocase - $d = "webclient" nocase - $e = "downloadstring" nocase - $f = "-WindowStyle Hidden" nocase - $g = "invoke" nocase - $h = "bitsadmin" nocase - $i = "certutil -decode" nocase - $j = "hidden" nocase - $k = "nop" nocase - $l = "-e" nocase + $a1 = "powershell" nocase + $a2 = "IEX" nocase + $a3 = "new-object" nocase + $a4 = "webclient" nocase + $a5 = "downloadstring" nocase + $a6 = "-WindowStyle Hidden" nocase + $a7 = "invoke" nocase + $a8 = "bitsadmin" nocase + $a9 = "certutil -decode" nocase + $a10 = "hidden" nocase + $a11 = "nop" nocase + $a12 = "-e" nocase + + $not1 = "chocolatey" nocase + $not2 = "XmlConfiguration is now operational" nocase condition: - 4 of them + 4 of ($a*) and not any of ($not*) } \ No newline at end of file From 
2412d3592c2b2991b9f1fbd031f84d1be3f283cb Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:42:02 +0000 Subject: [PATCH 087/178] Update Certificates.yar --- YaraRules/certificates.yar | 2 -- 1 file changed, 2 deletions(-) diff --git a/YaraRules/certificates.yar b/YaraRules/certificates.yar index 7378524..6b6a667 100644 --- a/YaraRules/certificates.yar +++ b/YaraRules/certificates.yar @@ -10,8 +10,6 @@ rule certificates reference = "https://github.com/kevthehermit/PasteHunter" strings: - $enabled_sec = "enable secret" wide ascii nocase - $enable_pass = "enable password" wide ascii nocase $ssh_priv = "BEGIN RSA PRIVATE KEY" wide ascii nocase $openssh_priv = "BEGIN OPENSSH PRIVATE KEY" wide ascii nocase $dsa_priv = "BEGIN DSA PRIVATE KEY" wide ascii nocase From a8140231b69f13f846470da4da05aec1438dfaa1 Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:42:10 +0000 Subject: [PATCH 088/178] update core keywords --- YaraRules/core_keywords.yar | 2 ++ 1 file changed, 2 insertions(+) diff --git a/YaraRules/core_keywords.yar b/YaraRules/core_keywords.yar index 4da6c76..9415629 100644 --- a/YaraRules/core_keywords.yar +++ b/YaraRules/core_keywords.yar @@ -15,6 +15,8 @@ rule core_keywords $hacked = "hacked by" wide ascii nocase $onion_url = /.*.\.onion/ $nmap_scan = "Nmap scan report for" wide ascii nocase + $enabled_sec = "enable secret" wide ascii nocase + $enable_pass = "enable password" wide ascii nocase condition: any of them From f150f23fe7d22a6f6a4cdec4e0e6e526aca2b494 Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:42:22 +0000 Subject: [PATCH 089/178] Add service details to readme --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 662f9b7..8578d0b 100644 --- a/README.md +++ b/README.md @@ -137,3 +137,19 @@ It may be useful to run in a screen to keep it running in the background. ## Service Service config is coming + +$ cat /etc/systemd/system/pastehunter.service +[Unit] +Description=PasteHunter + +[Service] +WorkingDirectory=/opt/PasteHunter +ExecStart=/usr/bin/python3 /opt/PasteHunter/pastehunter.py +User=localuser +Group=localuser +Restart=always + +[Install] +WantedBy=multi-user.target + + From 3a6da4dab1d9a0ecc184c914c991632964e31907 Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:42:58 +0000 Subject: [PATCH 090/178] New test string for b64 --- postprocess/post_b64.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index fe3b383..150d671 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -24,7 +24,7 @@ def run(results, raw_paste_data, paste_object): # b64_re = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' # This one has a few empty results i need to catch but doesn't kill pastehunter - b64_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' + b64_re = '(?:[A-Za-z0-9+/]{4}){3,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' 
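The tightened quantifier is the whole fix here: with `*` the four-character group could repeat zero times, so the pattern also produced the empty matches the comment above complains about, while `{3,}` demands at least twelve base64 characters before any padding. A minimal illustration of the difference (not part of the patch itself):

```python
import re

# Old pattern: '*' permits zero four-character groups, so '' is a valid match.
old_re = '(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
# New pattern: '{3,}' requires 12+ base64 characters before optional padding.
new_re = '(?:[A-Za-z0-9+/]{4}){3,}(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'

print(re.findall(old_re, ''))   # [''] - an empty match even on empty input
print(re.findall(new_re, ''))   # []   - no more empty results
print(re.findall(new_re, 'see aGVsbG8gd29ybGQsIHRoaXMgaXMgYmFzZTY0IQ=='))
# ['aGVsbG8gd29ybGQsIHRoaXMgaXMgYmFzZTY0IQ=='] - real blobs still match
```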
b64_strings = re.findall(b64_re, raw_paste_data) From 9afc7547f337bf746de99f19ad03666908f44850 Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:48:26 +0000 Subject: [PATCH 091/178] ignore vendor certificates in b64 url rule --- YaraRules/base64.yar | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 663d6ca..6b3b6f0 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -82,12 +82,15 @@ rule b64_url reference = "https://github.com/kevthehermit/PasteHunter" strings: - $a = "aHR0cDov" // http/s - $b = "SFRUUDov" // HTTP/S - $c = "d3d3Lg" // www. - $d = "V1dXLg" // WWW. + $a1 = "aHR0cDov" // http/s + $a2 = "SFRUUDov" // HTTP/S + $a3 = "d3d3Lg" // www. + $a4 = "V1dXLg" // WWW. + + // ignore vendor certs in this rule. The certs rule will pick them up if we want them + $not1 = "GlobalSign Root CA" nocase condition: - any of them + any of ($a*) and not any of ($not*) } From 598b4ed3f9b31a08443b9294ff9b15409e7109cf Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 19:57:49 +0000 Subject: [PATCH 092/178] CVE in old version of requests. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7e0d85f..a7274c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -requests==2.18.4 +requests>=2.20.0 elasticsearch>=5.0.0,<6.0.0 yara-python \ No newline at end of file From e2174eeb0658f6cf6f0945926bbb65e540ec84c9 Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Wed, 23 Jan 2019 16:04:39 -0400 Subject: [PATCH 093/178] Logging and Settings --- pastehunter.py | 28 ++++++++++++++++++++++++++++ settings.json.sample | 11 +++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 1d43fa2..58961e0 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -9,6 +9,7 @@ import multiprocessing import importlib import logging +from logging import handlers import time from time import sleep #from queue import Queue @@ -35,6 +36,33 @@ logger.info("Reading Configs") conf = parse_config() +# Set up the log file +if "log" in conf and conf["log"]["log_to_file"]: + if conf["log"]["log_path"] != "": + logfile = "{0}/{1}.log".format(conf["log"]["log_path"], conf["log"]["log_file"]) + try: + os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2 + except TypeError: + try: + os.makedirs(conf["log"]["log_path"]) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(conf["log"]["log_path"]): + pass + else: logger.error("Can not create log file {0}: {1}".format(conf["log"]["log_path"], exc)) + else: + logfile = "{0}.log".format(conf["log"]["log_file"]) + fileHandler = handlers.RotatingFileHandler(logfile, mode='a+', maxBytes=(1048576*5), backupCount=7) + if conf["log"]["format"] != "": + fileFormatter = logging.Formatter("{0}".format(conf["log"]["format"])) + fileHandler.setFormatter(fileFormatter) + else: + fileHandler.setFormatter(logFormatter) + fileHandler.setLevel(conf["log"]["logging_level"]) + logger.addHandler(fileHandler) + logger.info("Enabled Log File: {0}".format(logfile)) +else: + logger.info("Logging to file disabled.") + # Override Log level if needed if "logging_level" in conf["general"]: log_level = conf["general"]["logging_level"] diff --git a/settings.json.sample b/settings.json.sample index 54aec48..b51b0be 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -104,6 
+104,13 @@ "blacklist": true, "test_rules": false }, + "log": { + "log_to_file": false, + "log_file": "pastehunter", + "logging_level": 20, + "log_path": "logs", + "format": "%(asctime)s [%(threadName)-12.12s] %(levelname)s:%(message)s" + }, "general": { "run_frequency": 300, "logging_level": 20 @@ -127,12 +134,12 @@ "enabled": false, "api_host": "127.0.0.1", "api_port": 8080 - }, + } + } "post_entropy": { "enabled": true, "module": "postprocess.post_entropy", "rule_list": ["ALL"] } - } } } From da4f6038070746f33da995a4057ce04091ab33cc Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 20:11:15 +0000 Subject: [PATCH 094/178] Add log to file --- pastehunter.py | 8 ++++++++ settings.json.sample | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 4b794f6..a29ac58 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -42,6 +42,14 @@ # For old configs logger.error("Log Level not in config file. Update your base config file!") log_level = 20 + +# Set logtofile if set +if "logtofile" in conf["general"]: + if conf["general"]["logtofile"]: + fh = logging.FileHandler(conf["general"]["logtofile"]) + fh.setFormatter(formatter) + logger.addHandler(fh) + logger.info("Setting Log Level to {0}".format(log_level)) logging.getLogger('requests').setLevel(log_level) logging.getLogger('elasticsearch').setLevel(log_level) diff --git a/settings.json.sample b/settings.json.sample index c2b4cff..ba4bc5e 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -106,7 +106,8 @@ }, "general": { "run_frequency": 300, - "logging_level": 20 + "logging_level": 20, + "logtofile": "pastehunter.log" }, "post_process": { "post_email": { From b01ccf262054d45e7f9518ba089d81a64fafca94 Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Wed, 23 Jan 2019 16:20:08 -0400 Subject: [PATCH 095/178] Fixing the SSLError Fix --- pastehunter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 58961e0..789edbc 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -40,8 +40,8 @@ if "log" in conf and conf["log"]["log_to_file"]: if conf["log"]["log_path"] != "": logfile = "{0}/{1}.log".format(conf["log"]["log_path"], conf["log"]["log_file"]) - try: - os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2 + # Assure directory exists + try: os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2 except TypeError: try: os.makedirs(conf["log"]["log_path"]) @@ -133,7 +133,7 @@ def paste_scanner(): raw_paste_data = requests.get(raw_paste_uri).text except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - continue + raw_paste_data = "" # Pastebin Cache if raw_paste_data == "File is not ready for scraping yet. 
Try again in 1 minute.": @@ -146,7 +146,7 @@ def paste_scanner(): raw_paste_data = requests.get(raw_paste_uri).text except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - continue + raw_paste_data = "" # Process the paste data here try: # Scan with yara From 81c1c020702971d3fc21506541de6a7d42135f95 Mon Sep 17 00:00:00 2001 From: thehermit Date: Wed, 23 Jan 2019 20:25:00 +0000 Subject: [PATCH 096/178] remove duplicate logging --- pastehunter.py | 7 ------- settings.json.sample | 2 -- 2 files changed, 9 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 907dac1..7e2a541 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -71,13 +71,6 @@ logger.error("Log Level not in config file. Update your base config file!") log_level = 20 -# Set logtofile if set -if "logtofile" in conf["general"]: - if conf["general"]["logtofile"]: - fh = logging.FileHandler(conf["general"]["logtofile"]) - fh.setFormatter(formatter) - logger.addHandler(fh) - logger.info("Setting Log Level to {0}".format(log_level)) logging.getLogger('requests').setLevel(log_level) logging.getLogger('elasticsearch').setLevel(log_level) diff --git a/settings.json.sample b/settings.json.sample index 04eb575..aa1a670 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -113,8 +113,6 @@ }, "general": { "run_frequency": 300, - "logging_level": 20, - "logtofile": "pastehunter.log" }, "post_process": { "post_email": { From cbc7857e5e73c49a5b0e39b9e7be7c35100722c1 Mon Sep 17 00:00:00 2001 From: Franco Colombino Date: Thu, 24 Jan 2019 02:41:50 -0200 Subject: [PATCH 097/178] Fix NoneType' is not iterable When running the program returned `TypeError: argument of type 'NoneType' is not iterable` and crashed. With theses changes I was able to start it --- settings.json.sample | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/settings.json.sample b/settings.json.sample index aa1a670..8c36070 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -112,7 +112,7 @@ "format": "%(asctime)s [%(threadName)-12.12s] %(levelname)s:%(message)s" }, "general": { - "run_frequency": 300, + "run_frequency": 300 }, "post_process": { "post_email": { @@ -134,7 +134,7 @@ "api_host": "127.0.0.1", "api_port": 8080 } - } + }, "post_entropy": { "enabled": true, "module": "postprocess.post_entropy", From b6aa98beefa44a07ef0b7d6997df8c9c34f7757a Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Thu, 24 Jan 2019 12:53:14 -0400 Subject: [PATCH 098/178] Correcting a Naming Error And Miscommunication It seems when I merged my changes for file logs, I failed to explain that I left the logic in for console log level setting so the file logs and console may have different levels set for each. I will remove this and unify the log_level into one within the logs in the config. --- pastehunter.py | 7 +++++-- settings.json.sample | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 7e2a541..98522ad 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -64,10 +64,13 @@ logger.info("Logging to file disabled.") # Override Log level if needed -if "logging_level" in conf["general"]: +if "logging_level" in conf["log"]: + log_level = conf["log"]["logging_level"] +elif "loggin_level" in conf["general"]: + # For old configs log_level = conf["general"]["logging_level"] else: - # For old configs + # For older configs logger.error("Log Level not in config file. 
Update your base config file!") log_level = 20 diff --git a/settings.json.sample b/settings.json.sample index aa1a670..5f4d298 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -134,7 +134,7 @@ "api_host": "127.0.0.1", "api_port": 8080 } - } + }, "post_entropy": { "enabled": true, "module": "postprocess.post_entropy", From 97b23c325141017365a2a1b27f57c76a7553c1f2 Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Thu, 24 Jan 2019 12:53:14 -0400 Subject: [PATCH 099/178] Correcting a Naming Error And Miscommunication It seems when I merged my changes for file logs, I failed to explain that I left the logic in for console log level setting so the file logs and console may have different levels set for each. I will remove this and unify the log_level into one within the logs in the config. --- pastehunter.py | 7 +++++-- settings.json.sample | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 7e2a541..98522ad 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -64,10 +64,13 @@ logger.info("Logging to file disabled.") # Override Log level if needed -if "logging_level" in conf["general"]: +if "logging_level" in conf["log"]: + log_level = conf["log"]["logging_level"] +elif "loggin_level" in conf["general"]: + # For old configs log_level = conf["general"]["logging_level"] else: - # For old configs + # For older configs logger.error("Log Level not in config file. Update your base config file!") log_level = 20 diff --git a/settings.json.sample b/settings.json.sample index aa1a670..8c36070 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -112,7 +112,7 @@ "format": "%(asctime)s [%(threadName)-12.12s] %(levelname)s:%(message)s" }, "general": { - "run_frequency": 300, + "run_frequency": 300 }, "post_process": { "post_email": { @@ -134,7 +134,7 @@ "api_host": "127.0.0.1", "api_port": 8080 } - } + }, "post_entropy": { "enabled": true, "module": "postprocess.post_entropy", From 0457fb30ced31a14a5b636d77668b41f4945b928 Mon Sep 17 00:00:00 2001 From: secbug <9038419+secbug@users.noreply.github.com> Date: Fri, 25 Jan 2019 08:24:33 -0400 Subject: [PATCH 100/178] Not Kenny Loggin(s) - LOGGING Danger Zone --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 98522ad..e8d74c5 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -66,7 +66,7 @@ # Override Log level if needed if "logging_level" in conf["log"]: log_level = conf["log"]["logging_level"] -elif "loggin_level" in conf["general"]: +elif "logging_level" in conf["general"]: # For old configs log_level = conf["general"]["logging_level"] else: From 6effb9965c6b6632fee82834d4a0962dbb76d023 Mon Sep 17 00:00:00 2001 From: thehermit Date: Mon, 28 Jan 2019 20:17:02 +0000 Subject: [PATCH 101/178] Add code block to Service in readme --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index de6e2f7..654dd63 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,9 @@ It may be useful to run in a screen to keep it running in the background. 
## Service
Service config is coming
-$ cat /etc/systemd/system/pastehunter.service
+`$ cat /etc/systemd/system/pastehunter.service`
+
+```
 [Unit]
 Description=PasteHunter
@@ -152,5 +154,5 @@ Restart=always
 [Install]
 WantedBy=multi-user.target
-
+```
From 42e81b26908000a7ab3e74e0c6990b067de7c56a Mon Sep 17 00:00:00 2001
From: thehermit
Date: Mon, 28 Jan 2019 20:19:00 +0000
Subject: [PATCH 102/178] Add permissions warning to Service section

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 654dd63..037ecb5 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,8 @@ Start the application with ```python3 pastehunter.py```
 It may be useful to run in a screen to keep it running in the background.
 
 ## Service
-Service config is coming
+If you're using /opt/ make sure the user you're running the service as has permissions to write to /opt/PasteHunter/
+Change User and Group to match your system
 
 `$ cat /etc/systemd/system/pastehunter.service`
 
From 160ed5b9cf9e7dbdad80df34b98a6455f4674098 Mon Sep 17 00:00:00 2001
From: KevTheHermit
Date: Sat, 2 Feb 2019 19:01:05 +0000
Subject: [PATCH 103/178] add c9 to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index be347da..e12df7d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -106,4 +106,6 @@ ENV/
 /.idea
 /postprocess/tester.py
 .vscode/
-logs/
\ No newline at end of file
+logs/
+
+.c9
\ No newline at end of file
From 7a630a9df5a0165b88781685c89523050b3fdb37 Mon Sep 17 00:00:00 2001
From: thehermit
Date: Sat, 9 Feb 2019 19:13:11 +0000
Subject: [PATCH 104/178] Add Stackexchange and fix store all per site

---
 README.md           | 11 +++++++---
 YaraRules/index.yar | 14 ++++++-------
 inputs/dumpz.py     |  1 +
 inputs/gists.py     |  1 +
 inputs/pastebin.py  |  1 +
 inputs/slexy.py     |  1 +
 pastehunter.py      | 49 +++++++++++++++++++++++++++++++++------------
 7 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 037ecb5..0f3ddb7 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,14 @@ Pastehunter currently has support for the following sites:
  - dumpz.org
  - gist.github.com
  - slexy.org
-
-Support for the following sites is listed as ToDo:
- - paste.ee
+ - stackexchange # There are about 176!
+ - Likly to expose privlidged information + - stackoverflow + - serverfault + - superuser + - webapps + - webmasters + - dba ## Supported Outputs Pastehunter supports several output modules: diff --git a/YaraRules/index.yar b/YaraRules/index.yar index 60b074f..258864d 100644 --- a/YaraRules/index.yar +++ b/YaraRules/index.yar @@ -1,11 +1,11 @@ -include "blacklist.yar" +include "powershell.yar" include "api_keys.yar" -include "password_leak.yar" -include "database.yar" +include "email_filter.yar" +include "blacklist.yar" include "base64.yar" -include "core_keywords.yar" +include "database.yar" include "hak5.yar" -include "custom_keywords.yar" +include "core_keywords.yar" +include "password_leak.yar" include "general.yar" -include "powershell.yar" -include "email_filter.yar" \ No newline at end of file +include "certificates.yar" diff --git a/inputs/dumpz.py b/inputs/dumpz.py index 0132d5f..b58e7a6 100644 --- a/inputs/dumpz.py +++ b/inputs/dumpz.py @@ -29,6 +29,7 @@ def recent_pastes(conf, input_history): # Create a new paste dict for us to normalize paste_data = paste + paste_data['confname'] = 'dumpz' paste_data['pasteid'] = paste['id'] paste_data['pastesite'] = 'dumpz.org' diff --git a/inputs/gists.py b/inputs/gists.py index 973f603..15fc866 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -72,6 +72,7 @@ def recent_pastes(conf, input_history): continue gist_data = file_meta + gist_data['confname'] = 'gist' gist_data['@timestamp'] = gist_meta['created_at'] gist_data['pasteid'] = gist_meta['id'] gist_data['user'] = gist_meta['user'] diff --git a/inputs/pastebin.py b/inputs/pastebin.py index 93b9cbc..e7d8794 100644 --- a/inputs/pastebin.py +++ b/inputs/pastebin.py @@ -31,6 +31,7 @@ def recent_pastes(conf, input_history): # Create a new paste dict for us to normalize paste_data = paste + paste_data['confname'] = 'pastebin' paste_data['pasteid'] = paste['key'] paste_data['pastesite'] = 'pastebin.com' # Add a date field that kibana will map diff --git a/inputs/slexy.py b/inputs/slexy.py index 8bdcde8..c95b377 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -90,6 +90,7 @@ def recent_pastes(conf, input_history): paste = SlexyPaste(pid) history.append(paste.pid) paste_data = {} + paste_data['confname'] = 'slexy' paste_data['scrape_url'] = paste.url.full_url paste_data['pasteid'] = paste.pid paste_data['pastesite'] = paste.site diff --git a/pastehunter.py b/pastehunter.py index e8d74c5..14af42b 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -12,7 +12,7 @@ from logging import handlers import time from time import sleep -#from queue import Queue +from urllib.parse import unquote_plus from common import parse_config from postprocess import post_email @@ -36,6 +36,10 @@ logger.info("Reading Configs") conf = parse_config() +# If the config failed to parse +if not conf: + sys.exit() + # Set up the log file if "log" in conf and conf["log"]["log_to_file"]: if conf["log"]["log_path"] != "": @@ -131,10 +135,22 @@ def paste_scanner(): start_time = time.time() logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] # Cover fetch site SSLErrors try: - raw_paste_data = requests.get(raw_paste_uri).text + + # Stack questions dont have a raw endpoint + if paste_data['pastesite'].startswith('stack'): + json_body = requests.get(raw_paste_uri).json() + + # Unescape the code block strings in the json body. 
+ raw_body = json_body['items'][0]['body'] + raw_paste_data = unquote_plus(raw_body) + + else: + raw_paste_data = requests.get(raw_paste_uri).text + except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" @@ -207,11 +223,14 @@ def paste_scanner(): #ToDo: Need to make this check for each output not universal - paste_site = paste_data['pastesite'] - store_all = conf['inputs']['pastebin']['store_all'] - if store_all is True and paste_site == 'pastebin.com': + paste_site = paste_data['confname'] + store_all = conf['inputs'][paste_site]['store_all'] + if store_all is True: if len(results) == 0: results.append('no_match') + + # remove the confname key as its not really needed past this point + del paste_data['confname'] if len(results) > 0: @@ -279,15 +298,19 @@ def paste_scanner(): input_history = paste_history[input_name] else: input_history = [] + + try: - i = importlib.import_module(input_name) - # Get list of recent pastes - logger.info("Fetching paste list from {0}".format(input_name)) - paste_list, history = i.recent_pastes(conf, input_history) - for paste in paste_list: - q.put(paste) - queue_count += 1 - paste_history[input_name] = history + i = importlib.import_module(input_name) + # Get list of recent pastes + logger.info("Fetching paste list from {0}".format(input_name)) + paste_list, history = i.recent_pastes(conf, input_history) + for paste in paste_list: + q.put(paste) + queue_count += 1 + paste_history[input_name] = history + except Exception as e: + logger.error("Unable to fetch list from {0}: {1}".format(input_name, e)) logger.debug("Writing History") # Write History From bca406507be68ecee4389a82e7353b7f629842bb Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 19:35:17 +0000 Subject: [PATCH 105/178] Add stackexchange and fix gists --- inputs/gists.py | 2 +- inputs/stackexchange.py | 80 +++++++++++++++++++++++++++++++++++++++++ settings.json.sample | 10 ++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 inputs/stackexchange.py diff --git a/inputs/gists.py b/inputs/gists.py index 15fc866..fd7b1e7 100644 --- a/inputs/gists.py +++ b/inputs/gists.py @@ -72,7 +72,7 @@ def recent_pastes(conf, input_history): continue gist_data = file_meta - gist_data['confname'] = 'gist' + gist_data['confname'] = 'gists' gist_data['@timestamp'] = gist_meta['created_at'] gist_data['pasteid'] = gist_meta['id'] gist_data['user'] = gist_meta['user'] diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py new file mode 100644 index 0000000..0ab3c66 --- /dev/null +++ b/inputs/stackexchange.py @@ -0,0 +1,80 @@ +import requests +import math +import logging +from datetime import datetime + +# Set some logging options +logger = logging.getLogger('pastehunter') +logging.getLogger('requests').setLevel(logging.ERROR) + +# Test API Key from the docs - U4DMV*8nvpm3EOpvf69Rxw(( +# https://api.stackexchange.com/2.2/questions?key=U4DMV*8nvpm3EOpvf69Rxw((&site=stackoverflow&page=1&pagesize=100&order=desc&sort=creation&filter=default + + + +def recent_pastes(conf, input_history): + api_key = conf['inputs']['stackexchange']['api_key'] + api_scrape = conf['inputs']['stackexchange']['api_scrape'] + site_list = conf['inputs']['stackexchange']['site_list'] + store_filter = conf['inputs']['stackexchange']['store_filter'] + question_body_filter = '!bA1dOlliDM)pi9' + pagesize = 100 # Default = 30 + headers = {'user-agent': 'PasteHunter'} + + if api_key == '': + logger.error("No API Key 
configured for StackExchange Access") + return [], [] + + result_pages = [] + history = [] + paste_list = [] + + try: + + # For each of the stack sites we want to query + for site in site_list: + + # Create the API uri + scrape_uri = '{0}?key={1}&site={2}&page=1&pagesize=100&order=desc&sort=creation&filter={3}'.format(api_scrape, api_key, site, store_filter) + # Get some pastes and convert to json + # Get last 'paste_limit' pastes + paste_list_request = requests.get(scrape_uri) + + # ToDo: Add an API rate test in here. + paste_list_json = paste_list_request.json() + + logger.info("Used {0} of {1} in api quota".format(paste_list_json['quota_remaining'], paste_list_json['quota_max'])) + + for question in paste_list_json['items']: + # Track question ids to prevent dupes + history.append(question['question_id']) + if question['question_id'] in input_history: + continue + + # Create a new question dict for us to normalize + question_data = question + question_data['confname'] = "stackexchange" + question_data['pasteid'] = question['question_id'] + question_data['pastesite'] = site + # Get the author and then trim the data we store. + question_data['username'] = question['owner']['display_name'] + del question_data['owner'] + # Add a date field that kibana will map + date = datetime.utcfromtimestamp(float(question_data['creation_date'])).isoformat() + question_data['@timestamp'] = date + question_data['scrape_url'] = "{0}/{1}?key={2}&order=desc&sort=activity&site={3}&filter={4}".format( + api_scrape, + question['question_id'], + api_key, + site, + question_body_filter + ) + paste_list.append(question_data) + + + # Return the pastes and update history + return paste_list, history + + except Exception as e: + logger.error("Unable to parse question results: {0}".format(e)) + return paste_list, history \ No newline at end of file diff --git a/settings.json.sample b/settings.json.sample index 8c36070..2b64db3 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -33,6 +33,16 @@ "api_scrape": "http://slexy.org/recent", "api_raw": "http://slexy.org/raw", "api_view": "http://slexy.org/view" + }, + "stackexchange":{ + "enabled": false, + "module": "inputs.stackexchange", + "site_list": ["stackoverflow","serverfault", "superuser", "webapps", "webmasters", "dba"], + "api_key": "", + "store_filter": "!LZg2mkNj0UY)iKNdTbVP4i", + "pagesize": 100, + "store_all": true, + "api_scrape": "https://api.stackexchange.com/2.2/questions" } }, "outputs": { From 9f694a37a00d73f3f81672003e64a19d27450d8c Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 19:43:03 +0000 Subject: [PATCH 106/178] Correctly identify a stackexchange site for body extract --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 14af42b..d8df180 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -141,7 +141,7 @@ def paste_scanner(): try: # Stack questions dont have a raw endpoint - if paste_data['pastesite'].startswith('stack'): + if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): json_body = requests.get(raw_paste_uri).json() # Unescape the code block strings in the json body. 
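The StackExchange input wired up in the last three patches takes a different fetch path from the other sources: questions have no raw endpoint, so the body comes back inside a JSON envelope and the escaped code blocks are run through unquote_plus before scanning. A minimal sketch of that flow, reusing the question_body_filter value from inputs/stackexchange.py; the helper name itself is illustrative, not part of the project:

```python
import requests
from urllib.parse import unquote_plus

# Illustrative helper mirroring the branch added to paste_scanner(); the
# filter string is the question_body_filter defined in inputs/stackexchange.py.
def fetch_stack_body(question_id, api_key, site='stackoverflow'):
    url = ('https://api.stackexchange.com/2.2/questions/{0}'
           '?key={1}&order=desc&sort=activity&site={2}&filter=!bA1dOlliDM)pi9'
           ).format(question_id, api_key, site)
    json_body = requests.get(url).json()
    # The body arrives escaped inside the JSON response, so unescape it
    # before it is handed to the yara scanner.
    return unquote_plus(json_body['items'][0]['body'])
```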
From c96d43569a74f1859fbc9a948bb89aa3c3185400 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:18:56 +0000 Subject: [PATCH 107/178] Do not Post Process Blacklisted content --- pastehunter.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index d8df180..e44841a 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -193,12 +193,22 @@ def paste_scanner(): else: results.append(match.rule) + # Store all OverRides other options. + paste_site = paste_data['confname'] + store_all = conf['inputs'][paste_site]['store_all'] + # remove the confname key as its not really needed past this point + del paste_data['confname'] + + # Blacklist Check # If any of the blacklist rules appear then empty the result set + blacklisted = False if conf['yara']['blacklist'] and 'blacklist' in results: results = [] + blacklisted = True logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + # Post Process # If post module is enabled and the paste has a matching rule. @@ -206,12 +216,13 @@ def paste_scanner(): for post_process, post_values in conf["post_process"].items(): if post_values["enabled"]: if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) - post_module = importlib.import_module(post_values["module"]) - post_results = post_module.run(results, - raw_paste_data, - paste_data - ) + if not Blacklisted and store_all: + logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) + post_module = importlib.import_module(post_values["module"]) + post_results = post_module.run(results, + raw_paste_data, + paste_data + ) # Throw everything back to paste_data for ease. paste_data = post_results @@ -221,17 +232,10 @@ def paste_scanner(): # If results is empty, ie no match, and store_all is True, # then append "no_match" to results. This will then force output. 
- #ToDo: Need to make this check for each output not universal - - paste_site = paste_data['confname'] - store_all = conf['inputs'][paste_site]['store_all'] if store_all is True: if len(results) == 0: results.append('no_match') - # remove the confname key as its not really needed past this point - del paste_data['confname'] - if len(results) > 0: encoded_paste_data = raw_paste_data.encode('utf-8') From 4009410e18d88598d7a1b5969b778bf863f31903 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:19:41 +0000 Subject: [PATCH 108/178] Disable entropy by default --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index 2b64db3..828a04e 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -146,7 +146,7 @@ } }, "post_entropy": { - "enabled": true, + "enabled": false, "module": "postprocess.post_entropy", "rule_list": ["ALL"] } From da508ab302bc49a75c728821f1e23342c4d658e4 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:22:59 +0000 Subject: [PATCH 109/178] Add paste size as an int --- pastehunter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pastehunter.py b/pastehunter.py index e44841a..e19c295 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -245,6 +245,8 @@ def paste_scanner(): paste_data['SHA256'] = sha256 paste_data['raw_paste'] = raw_paste_data paste_data['YaraRule'] = results + # Set the size for all pastes - This will override any size set by the source + paste_data['size'] = len(raw_paste_data) for output in outputs: try: output.store_paste(paste_data) From d28a906fd8cb44dc09af32c8590e29a5ddd8a4cc Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:44:44 +0000 Subject: [PATCH 110/178] Update logging output for stack exchange --- inputs/stackexchange.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index 0ab3c66..ad967b7 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -33,6 +33,7 @@ def recent_pastes(conf, input_history): # For each of the stack sites we want to query for site in site_list: + logger.info("Query Stack Exchange site: {0}".format(site)) # Create the API uri scrape_uri = '{0}?key={1}&site={2}&page=1&pagesize=100&order=desc&sort=creation&filter={3}'.format(api_scrape, api_key, site, store_filter) @@ -43,7 +44,7 @@ def recent_pastes(conf, input_history): # ToDo: Add an API rate test in here. paste_list_json = paste_list_request.json() - logger.info("Used {0} of {1} in api quota".format(paste_list_json['quota_remaining'], paste_list_json['quota_max'])) + for question in paste_list_json['items']: # Track question ids to prevent dupes @@ -71,7 +72,12 @@ def recent_pastes(conf, input_history): ) paste_list.append(question_data) + + # Record API Quota on last call to save some logging. 
+ quota_max = paste_list_json['quota_max'] + quota_remaining = paste_list_json['quota_remaining'] + logger.info("Used {0} of {1} in api quota".format(quota_remaining, quota_max)) # Return the pastes and update history return paste_list, history From 4f856899d227126f8b4c2dbf64b0e9960db10807 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:57:02 +0000 Subject: [PATCH 111/178] fix blacklist typo --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index e19c295..d6e6eb6 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -216,7 +216,7 @@ def paste_scanner(): for post_process, post_values in conf["post_process"].items(): if post_values["enabled"]: if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - if not Blacklisted and store_all: + if not blacklisted and store_all: logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) post_module = importlib.import_module(post_values["module"]) post_results = post_module.run(results, From 8e05096cf1bc08a3dfb45d4d6e849c552fdcf288 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 20:57:32 +0000 Subject: [PATCH 112/178] More logging changes to stack exchange --- inputs/stackexchange.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index ad967b7..a56f17b 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -77,7 +77,7 @@ def recent_pastes(conf, input_history): quota_max = paste_list_json['quota_max'] quota_remaining = paste_list_json['quota_remaining'] - logger.info("Used {0} of {1} in api quota".format(quota_remaining, quota_max)) + logger.info("Used {0} of {1} of StackExchange api quota".format(quota_remaining, quota_max)) # Return the pastes and update history return paste_list, history From 29ffa5a603c70d0b253df01f9bc496b70fee0934 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 21:26:17 +0000 Subject: [PATCH 113/178] Add Crypto API YaraRules --- YaraRules/CryptoExchangeApi.yar | 87 +++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 YaraRules/CryptoExchangeApi.yar diff --git a/YaraRules/CryptoExchangeApi.yar b/YaraRules/CryptoExchangeApi.yar new file mode 100644 index 0000000..7dac34e --- /dev/null +++ b/YaraRules/CryptoExchangeApi.yar @@ -0,0 +1,87 @@ +rule CryptoExchangeApi +{ + meta: + description = "Contains Crypro Exchange API URL" + author = "Jason Schorr (0xBanana)" + strings: + $a = "api.binance.com" nocase wide ascii + $a0 = "1btcxe.com/api" nocase wide ascii + $a1 = "acx.io/api" nocase wide ascii + $a2 = "anxpro.com/api" nocase wide ascii + $a3 = "anybits.com/api" nocase wide ascii + $a4 = "www.bcex.top" nocase wide ascii + $a5 = "api.bibox.com" nocase wide ascii + $a6 = "bit2c.co.il" nocase wide ascii + $a7 = "api.bitfinex.com" nocase wide ascii + $a8 = "api.bitfinex.com" nocase wide ascii + $a9 = "api.bitflyer.jp" nocase wide ascii + $aa = "api.bitforex.com" nocase wide ascii + $ab = "bitibu.com" nocase wide ascii + $ac = "bitlish.com/api" nocase wide ascii + $ad = "www.bitmex.com" nocase wide ascii + $ae = "bitsane.com/api" nocase wide ascii + $af = "api.bitso.com" nocase wide ascii + $ag = "www.bitstamp.net/api" nocase wide ascii + $ah = "www.bitstamp.net/api" nocase wide ascii + $ai = "api.bl3p.eu" nocase wide ascii + $aj = "braziliex.com/api/v1" nocase wide ascii + $ak = "btc-alpha.com/api" nocase wide ascii + $al = 
"www.btcbox.co.jp/api" nocase wide ascii + $am = "www.btcexchange.ph/api" nocase wide ascii + $an = "btc-trade.com.ua/api" nocase wide ascii + $ao = "www.btcturk.com/api" nocase wide ascii + $ap = "www.buda.com/api" nocase wide ascii + $aq = "bx.in.th/api" nocase wide ascii + $ar = "cex.io/api" nocase wide ascii + $as = "api.cobinhood.com" nocase wide ascii + $at = "api.coinbase.com" nocase wide ascii + $au = "api.prime.coinbase.com" nocase wide ascii + $av = "api.pro.coinbase.com" nocase wide ascii + $aw = "coincheck.com/api" nocase wide ascii + $ax = "www.coinexchange.io/api/v1" nocase wide ascii + $ay = "coinfalcon.com" nocase wide ascii + $az = "webapi.coinfloor.co.uk:8090/bist" nocase wide ascii + $aa1 = "coinmate.io/api" nocase wide ascii + $aa2 = "api.coinone.co.kr" nocase wide ascii + $aa3 = "api.crex24.com" nocase wide ascii + $aa4 = "api.cryptonbtc.com" nocase wide ascii + $aa5 = "www.deribit.com" nocase wide ascii + $aa6 = "api.ethfinex.com" nocase wide ascii + $aa7 = "api.fcoin.com" nocase wide ascii + $aa8 = "api.flowbtc.com:8405/ajax" nocase wide ascii + $aa9 = "www.fybse.se/api/SEK" nocase wide ascii + $aa0 = "www.fybsg.com/api/SGD" nocase wide ascii + $aab = "api.gatecoin.com" nocase wide ascii + $aac = "api.gdax.com" nocase wide ascii + $aad = "api.gemini.com" nocase wide ascii + $aae = "getbtc.org/api" nocase wide ascii + $aaf = "api.hitbtc.com" nocase wide ascii + $aag = "api.hitbtc.com" nocase wide ascii + $aah = "api.huobi.com" nocase wide ascii + $aai = "ice3x.com/api" nocase wide ascii + $aaj = "api.itbit.com" nocase wide ascii + $aak = "www.jubi.com/api" nocase wide ascii + $aal = "kuna.io" nocase wide ascii + $aam = "api.lakebtc.com" nocase wide ascii + $aan = "api.lbank.info" nocase wide ascii + $aao = "api.liquid.com" nocase wide ascii + $aap = "api.livecoin.net" nocase wide ascii + $aaq = "api.mybitx.com/api" nocase wide ascii + $aar = "mixcoins.com/api" nocase wide ascii + $aas = "novaexchange.com/remote" nocase wide ascii + $aat = "paymium.com/api" nocase wide ascii + $aau = "api.quadrigacx.com" nocase wide ascii + $aav = "www.rightbtc.com/api" nocase wide ascii + $aaw = "www.southxchange.com/api" nocase wide ascii + $aax = "api.theocean.trade/api" nocase wide ascii + $aay = "api.therocktrading.com" nocase wide ascii + $aaz = "www.tidebit.com" nocase wide ascii + $ba = "open-api.uex.com/open/api" nocase wide ascii + $bb = "api.vaultoro.com" nocase wide ascii + $bc = "cryptottlivewebapi.xbtce.net:8443/api" nocase wide ascii + $bd = "yunbi.com" nocase wide ascii + $be = "api.zaif.jp" nocase wide ascii + + condition: + any of them +} \ No newline at end of file From 67bf5fad62e8b340649ddbb125497c2c42df091b Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 21:33:49 +0000 Subject: [PATCH 114/178] Update CryptoRule --- YaraRules/CryptoExchangeApi.yar | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/YaraRules/CryptoExchangeApi.yar b/YaraRules/CryptoExchangeApi.yar index 7dac34e..c130889 100644 --- a/YaraRules/CryptoExchangeApi.yar +++ b/YaraRules/CryptoExchangeApi.yar @@ -3,8 +3,9 @@ rule CryptoExchangeApi meta: description = "Contains Crypro Exchange API URL" author = "Jason Schorr (0xBanana)" + source = "https://github.com/cryptodefense/PasteHunter-Yara/blob/master/CryptoExchangeApi.yar" strings: - $a = "api.binance.com" nocase wide ascii + $a = "api.binance.com" nocase wide ascii $a0 = "1btcxe.com/api" nocase wide ascii $a1 = "acx.io/api" nocase wide ascii $a2 = "anxpro.com/api" nocase wide ascii From 
5103b735af1c3861e3af2962a65a1f2614647966 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sat, 9 Feb 2019 21:52:19 +0000 Subject: [PATCH 115/178] Blacklist post process check --- pastehunter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index d6e6eb6..ec676f9 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -148,6 +148,9 @@ def paste_scanner(): raw_body = json_body['items'][0]['body'] raw_paste_data = unquote_plus(raw_body) + # URL has the API Key in it so make sure it is removed before store. + del paste_data['scrape_url'] + else: raw_paste_data = requests.get(raw_paste_uri).text @@ -216,7 +219,7 @@ def paste_scanner(): for post_process, post_values in conf["post_process"].items(): if post_values["enabled"]: if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - if not blacklisted and store_all: + if not blacklisted: logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) post_module = importlib.import_module(post_values["module"]) post_results = post_module.run(results, From 1bdba3b32a249af121157b6f94f94d025f9e207f Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 00:51:37 +0000 Subject: [PATCH 116/178] Add first draft of docs --- docs/Makefile | 19 +++++ docs/conf.py | 173 ++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 26 +++++++ docs/inputs.rst | 71 +++++++++++++++++ docs/installation.rst | 99 ++++++++++++++++++++++++ docs/outputs.rst | 24 ++++++ docs/postprocess.rst | 16 ++++ 7 files changed, 428 insertions(+) create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/inputs.rst create mode 100644 docs/installation.rst create mode 100644 docs/outputs.rst create mode 100644 docs/postprocess.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..298ea9e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..d048dbf --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'PasteHunter' +copyright = '2019, Kevin Breen' +author = 'Kevin Breen' + +# The short X.Y version +version = '1.0' +# The full version, including alpha/beta/rc tags +release = '1.0' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PasteHunterdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [
+    (master_doc, 'PasteHunter.tex', 'PasteHunter Documentation',
+     'Kevin Breen', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pastehunter', 'PasteHunter Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'PasteHunter', 'PasteHunter Documentation',
+     author, 'PasteHunter', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..4cfc327
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,26 @@
+.. PasteHunter documentation master file, created by
+   sphinx-quickstart on Sat Feb 9 22:50:02 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PasteHunter's documentation!
+=======================================
+
+
+PasteHunter is a python3 application that is designed to query a collection of sites
+that host publicly pasted data. For each paste it finds, it scans the raw contents
+against a series of yara rules looking for information that can be used by an organisation or a researcher.
+
+
+
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   installation
+   inputs
+   outputs
+   postprocess
+
+
diff --git a/docs/inputs.rst b/docs/inputs.rst
new file mode 100644
index 0000000..5af237b
--- /dev/null
+++ b/docs/inputs.rst
@@ -0,0 +1,71 @@
+Inputs
+======
+
+This page details all the configuration options per input.
+
+There are a few generic options for each input.
+- **enabled**: This turns the input on and off.
+- **store_all**: store all pastes, not only those that match a rule.
+- **module**: This is used internally by pastehunter.
+
+Pastebin
+------------
+To use the pastebin API you need an API key. These need to be purchased and are almost always on some sort of offer!
+https://pastebin.com/pro. The API uses your IP to authenticate instead of a key. You will need to whitelist your IP at https://pastebin.com/api_scraping_faq
+
+- **api_scrape**: The URL endpoint for the list of recent paste ids.
+- **api_raw**: The URL endpoint for the raw paste.
+- **paste_limit**: How many pasteids to fetch from the recent list.
+- **store_all**: Store all pastes regardless of a rule match.
+
+Github Gists
+---------------
+GitHub has an API that can be used at no cost to query recent gists. There are two options here.
+
+- Without an access key - You will have a low rate limit.
+- With an access key - You will have a higher rate limit.
+
+The unauthenticated option is not suitable for pastehunter running full time; a quick way to check which rate limit applies to you is sketched below.
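GitHub reports the quota that applies to a request in its rate-limit response headers, so the difference between the two options is easy to see. A minimal, illustrative check that is not part of PasteHunter itself; the Authorization line and token value are placeholders:

```python
import requests

# Illustrative quota check: unauthenticated requests get a much smaller
# quota than requests sent with a personal access token.
headers = {'user-agent': 'PasteHunter'}
# headers['Authorization'] = 'token <your api_token>'  # uncomment to compare

r = requests.get('https://api.github.com/gists/public', headers=headers)
print(r.headers.get('X-RateLimit-Limit'), r.headers.get('X-RateLimit-Remaining'))
```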
+To create your key visit https://github.com/settings/tokens
+
+*YOU DO NOT NEED TO GIVE IT ANY ACCESS PERMISSIONS*
+
+- **api_token**: The token you generated.
+- **api_limit**: Rate limit to prevent being blocked.
+- **store_all**: Store all pastes regardless of a rule match.
+- **user_blacklist**: Do not process gists created by these usernames.
+- **file_blacklist**: Do not process gists that match these filenames.
+
+
+Slexy
+---------
+
+Slexy has some heavy rate limits on it.
+
+- **store_all**: Store all pastes regardless of a rule match.
+- **api_scrape**: The URL endpoint for the list of recent pastes.
+- **api_raw**: The URL endpoint for the raw paste.
+- **api_view**: The URL endpoint to view the paste.
+
+
+StackExchange
+-------------
+
+The same API is used to query all of them. Similar to GitHub, there is a public API which has a reduced rate limit
+or an App API which has a higher cap. There is a cap of 10,000 requests per day per IP, so pulling all of them would be impractical.
+Generate a key at https://stackapps.com/.
+
+There are over 170 exchanges that form StackExchange. The following list covers the sites most likely to expose privileged information.
+
+* stackoverflow
+* serverfault
+* superuser
+* webapps
+* webmasters
+* dba
+
+- **site_list**: List of site short titles that will be scraped.
+- **api_key**: API App key as generated above.
+- **store_filter**: This is the stackexchange filter that determines what fields are returned.
+- **pagesize**: How many questions to pull from the latest list.
+- **store_all**: Store all pastes regardless of a rule match.
\ No newline at end of file
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..45601bc
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,99 @@
+Installation
+============
+
+There are a few ways to install.
+
+
+
+Local Installation
+------------------
+
+Pastehunter
+^^^^^^^^^^^
+If you want to run the latest stable version grab the latest release from https://github.com/kevthehermit/PasteHunter/releases.
+If you want to run the development version clone the repository or download the latest archive.
+
+Pastehunter has very few dependencies; you can install all the python libraries using the requirements.txt file and ``sudo pip3 install -r requirements.txt``
+
+
+
+
+Yara
+^^^^
+Yara is the scanning engine that scans each paste. Use the official documentation to install yara and the python3 library.
+https://yara.readthedocs.io/en/latest/gettingstarted.html#compiling-and-installing-yara
+
+All yara rules are stored in the YaraRules directory. An index.yar file is created at run time that includes all additional yar files in this directory.
+To add or remove yara rules, simply add or remove the rule file from this directory.
+
+
+
+Elastic Search
+^^^^^^^^^^^^^^
+If you want to use the elastic search output module you will need to install elastic search. Pastehunter has been tested with version 6.x of Elasticsearch.
+To install follow the official directions on https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html.
+
+You will also need the elasticsearch python library which can be installed using ``sudo pip3 install elasticsearch``.
+
+Kibana
+^^^^^^
+Kibana is the frontend search to Elasticsearch. If you have enabled the Elasticsearch module you probably want this.
+To install follow the official directions on https://www.elastic.co/guide/en/kibana/current/deb.html.
+A quick smoke test for the finished local install is sketched below.
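This snippet is illustrative rather than part of the documented install steps; it assumes the yara and elasticsearch python bindings are installed and that a local cluster is answering on the default port:

```python
# Confirms the yara and elasticsearch bindings import cleanly and that a
# local Elasticsearch cluster is reachable on 9200 (both are assumptions).
import yara
from elasticsearch import Elasticsearch

rules = yara.compile(source='rule smoke { strings: $a = "test" condition: $a }')
print(rules.match(data='a quick test string'))  # expect one match: [smoke]

es = Elasticsearch('127.0.0.1', port=9200)
print(es.info()['version']['number'])  # raises if the cluster is unreachable
```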
+
+
+
+Docker Installation
+-------------------
+You will find a Dockerfile that will build the latest stable version of PasteHunter.
+
+
+This can be used with the included docker-compose.yml file.
+A sample podspec for kubernetes is coming soon.
+
+
+Configuration
+-------------
+Before you can get up and running you will need to set up the basic config.
+Copy the settings.json.sample to settings.json and edit with your editor of choice.
+
+Yara
+^^^^
+
+- **rule_path**: defaults to the YaraRules directory in the PasteHunter root.
+- **blacklist**: If set to true, any pastes that match the blacklist rule will be ignored.
+- **test_rules**: Occasionally I release some early test rules. Set this to ``true`` to use them.
+
+log
+^^^
+
+Logging for the application is configured here.
+
+- **log_to_file**: true or false; the default is to log to stdout.
+- **log_file**: filename to log out to.
+- **logging_level**: numerical value for the logging level, see the table below.
+- **log_path**: path on disk to write log_file to.
+- **format**: python logging format string - https://docs.python.org/3/library/logging.html#formatter-objects
+
+======= =========
+Level Numerical
+======== =========
+CRITICAL 50
+ERROR 40
+WARNING 30
+INFO 20
+DEBUG 10
+NOTSET 0
+======== =========
+
+general
+^^^^^^^
+
+General config options here.
+
+- **run_frequency**: Sleep delay between fetching list of inputs to download. This helps with rate limits.
+
+
+For Input, Output and Postprocess settings please refer to the relevant sections of the docs.
+
+
+
\ No newline at end of file
diff --git a/docs/outputs.rst b/docs/outputs.rst
new file mode 100644
index 0000000..6edbb4b
--- /dev/null
+++ b/docs/outputs.rst
@@ -0,0 +1,24 @@
+Outputs
+=======
+
+This page details all the configuration options for the output modules.
+
+
+Elasticsearch
+-------------
+
+JSON
+----
+
+CSV
+---
+
+SMTP
+----
+
+Slack
+-----
+
+
+Syslog
+------
\ No newline at end of file
diff --git a/docs/postprocess.rst b/docs/postprocess.rst
new file mode 100644
index 0000000..7f88bc2
--- /dev/null
+++ b/docs/postprocess.rst
@@ -0,0 +1,16 @@
+PostProcess
+===========
+
+There are a handful of post process modules that can run additional checks on the raw paste data.
+
+
+Email
+-----
+
+
+Base64
+------
+
+
+Entropy
+-------
\ No newline at end of file
From ee8740face2b6084cd7062ea9679cb1184e8df31 Mon Sep 17 00:00:00 2001
From: thehermit
Date: Sun, 10 Feb 2019 01:00:11 +0000
Subject: [PATCH 117/178] Add read the docs theme

---
 docs/conf.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index d048dbf..c0281c2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -74,13 +74,26 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+html_theme_options = {
+    'canonical_url': '',
+    #'analytics_id': 'UA-XXXXXXX-1',  # Provided by Google in your dashboard
+    'logo_only': False,
+    'display_version': True,
+    'prev_next_buttons_location': 'bottom',
+    'style_external_links': False,
+    # Toc options
+    'collapse_navigation': True,
+    'sticky_navigation': True,
+    'navigation_depth': 4,
+    'includehidden': True,
+    'titles_only': False
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory.
They are copied after the builtin static files,
From 48da1074850960ee9d8d4d04db05421261fd7bdf Mon Sep 17 00:00:00 2001
From: thehermit
Date: Sun, 10 Feb 2019 13:09:56 +0000
Subject: [PATCH 118/178] Updating docs

---
 docs/installation.rst | 33 ++++++++++++++++++++++-
 docs/outputs.rst      | 62 +++++++++++++++++++++++++++++++++++++++++--
 docs/postprocess.rst  | 39 ++++++++++++++++++++++++++-
 3 files changed, 130 insertions(+), 4 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 45601bc..1265cd0 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -95,5 +95,36 @@ General config options here.
 
 For Input, Output and Postprocess settings please refer to the relevant sections of the docs.
 
+
+Starting
+--------
+
+You can run pastehunter by calling the script by name.
+
+``python3 pastehunter.py``
+
+Service
+^^^^^^^
+
+You can install pastehunter as a service if your planning on running for long periods of time. An example systemd service scrfile is show below
+
+Create a new service file ``/etc/systemd/system/pastehunter.service``
+
+add the following text updating as appropriate for your setup paying attention to file paths and usernames.::
+
+
+    [Unit]
+    Description=PasteHunter
-
\ No newline at end of file
+
+    [Service]
+    WorkingDirectory=/opt/PasteHunter
+    ExecStart=/usr/bin/python3 /opt/PasteHunter/pastehunter.py
+    User=localuser
+    Group=localuser
+    Restart=always
+
+    [Install]
+    WantedBy=multi-user.target
+
+
+Before starting the service ensure you have tested the pastehunter app on the command line and identified any errors. Once you're ready, reload systemd with ``systemctl daemon-reload``, enable the new service with ``systemctl enable pastehunter.service`` and start the service with ``systemctl start pastehunter``
diff --git a/docs/outputs.rst b/docs/outputs.rst
index 6edbb4b..f3ca3a4 100644
--- a/docs/outputs.rst
+++ b/docs/outputs.rst
@@ -2,23 +2,81 @@ Outputs
 =======
 
 This page details all the configuration options for the output modules.
+There are a few generic options for each output.
+- **enabled**: This turns the output on and off.
+- **module**: This is used internally by pastehunter.
+- **classname**: This is used internally by pastehunter.
 
 Elasticsearch
 -------------
+Elasticsearch is the default output, storing all pastes and using Kibana as a graphical frontend to view the results.
+
+- **elastic_index**: The name of the index.
+- **weekly_index**: Use a numbered index for each week of the year instead of a single index.
+- **elastic_host**: Hostname or IP of the Elasticsearch server.
+- **elastic_port**: Port number for Elasticsearch; the default is 9200.
+- **elastic_user**: Username if using xpack / shield or basic auth.
+- **elastic_pass**: Password if using xpack / shield or basic auth.
+- **elastic_ssl**: True or false if Elasticsearch is served over SSL.
 
 JSON
 ----
+This output module will store each paste in a json file on disk. The name of the file is the pasteid.
+
+- **output_path**: Path on disk to store output files.
+- **store_raw**: Include the raw paste in the json file. False just stores metadata.
+- **encode_raw**: Ignored; reserved for future usage.
+
 CSV
 ---
+The CSV output will append lines to a CSV that contains basic metadata from all paste sources. The raw paste is not included.
+
+- **output_path**: Path on disk to store output files.
+
+Stored elements are:
+
+- Timestamp
+- Pasteid
+- Yara Rules
+- Scrape URL
+- Pastesite
+
+Syslog
+------
+Using the same format as the CSV output, this writes paste metadata to a syslog server.
The raw paste is not included.
+
+- **host**: IP or hostname of the syslog server.
+- **port**: Port number of the syslog server.
+
 SMTP
 ----
+This output will send an email to specific email addresses depending on the Yara rules that are matched. You need to set up an SMTP server.
+
+- **smtp_host**: hostname for the SMTP server.
+- **smtp_port**: Port number for the SMTP server.
+- **smtp_security**: One of ``tls``, ``starttls``, ``none``.
+- **smtp_user**: Username for SMTP Authentication.
+- **smtp_pass**: Password for SMTP Authentication.
+- **recipients**: JSON array of recipients and rules.
+    - **address**: Email address to send alerts to.
+    - **rule_list**: A list of rules to alert on. Any of the rules in this list will trigger an email.
+    - **mandatory_rule_list**: List of rules that *MUST* be present to trigger an email alert.
+
+
 Slack
 -----
+This output will send a notification to a Slack webhook. You need to configure the URL and the channel in Slack.
+Head over to https://api.slack.com/apps?new_app=1
 
-Syslog
------
\ No newline at end of file
+Create a new Slack app with a name and the workspace that you want to send alerts to.
+Once created, under Add Features and Functionality, select Incoming Webhooks and toggle the Active button to on.
+At the bottom of the page select *Add New Webhook to Workspace*. This will show another page where you select the Channel that will receive the notifications.
+Once it has authorized the app you will see a new Webhook URL. This is the URL that needs to be added to the pastehunter config.
+
+- **webhook_url**: Generated when creating a Slack app as described above.
+- **rule_list**: List of rules that will generate an alert.
diff --git a/docs/postprocess.rst b/docs/postprocess.rst
index 7f88bc2..6a90285 100644
--- a/docs/postprocess.rst
+++ b/docs/postprocess.rst
@@ -3,14 +3,51 @@ PostProcess
 
 There are a handful of post process modules that can run additional checks on the raw paste data.
 
+There are a few generic options for each postprocess module.
+
+- **enabled**: This turns the module on and off.
+- **module**: This is used internally by pastehunter.
+
 Email
 -----
+This postprocess module extracts additional information from data that includes email addresses. It will extract counts for:
+
+- Total Emails
+- Unique Email addresses
+- Unique Email domains
+
+These three values are then added to the metadata for storage.
+- **rule_list**: List of rules that will trigger the postprocess module.
 
 Base64
 ------
+This postprocess will attempt to decode base64 data and then apply further processing on the new file data. At the moment this module only operates
+when the full paste is a base64 blob, i.e. it will not extract base64 code that is embedded in other data.
+
+- **rule_list**: List of rules that will trigger the postprocess module.
+
+
+Cuckoo
+^^^^^^
+If the samples match a binary file format you can optionally send the file for analysis by a Cuckoo Sandbox.
+
+- **api_host**: IP or hostname for a Cuckoo API endpoint.
+- **api_port**: Port number for a Cuckoo API endpoint.
+
+Viper
+^^^^^
+If the samples match a binary file format you can optionally send the file to a Viper instance for further analysis.
+
+- **api_host**: IP or hostname for a Viper API endpoint.
+- **api_port**: Port number for a Viper API endpoint.
+
 Entropy
--------
\ No newline at end of file
+-------
+
+This postprocess module calculates Shannon entropy on the raw paste data. This can be used to help identify binary and encoded or encrypted data.
+ +- **rule_list**: List of rules that will trigger the postprocess module. From 6dcda88e87db515c0d558f5acc5fabf2cb356cc9 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 13:26:44 +0000 Subject: [PATCH 119/178] Update docs and readme --- docs/installation.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index 1265cd0..f3054f6 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -74,7 +74,7 @@ Logging for the application is configured here. - **log_path**: path on disk to write log_file to. - **format**: python logging format string - https://docs.python.org/3/library/logging.html#formatter-objects -======= ========= +======== ========= Level Numerical ======== ========= CRITICAL 50 @@ -106,11 +106,11 @@ You can run pastehunter by calling the script by name. Service ^^^^^^^ -You can install pastehunter as a service if your planning on running for long periods of time. An example systemd service scrfile is show below +You can install pastehunter as a service if your planning on running for long periods of time. An example systemd service file is show below Create a new service file ``/etc/systemd/system/pastehunter.service`` -add the following text updating as appropriate for your setup paying attention to file paths and usernames.:: +Add the following text updating as appropriate for your setup paying attention to file paths and usernames.:: [Unit] From d25ebd3c3036def06f8db6b20c79cfe346120940 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 13:28:09 +0000 Subject: [PATCH 120/178] Update readme to point to docs --- README.md | 159 ++++-------------------------------------------------- 1 file changed, 11 insertions(+), 148 deletions(-) diff --git a/README.md b/README.md index 0f3ddb7..22bd29e 100644 --- a/README.md +++ b/README.md @@ -1,164 +1,27 @@ # PasteHunter PasteHunter is a python3 application that is designed to query a collection of sites that host publicliy pasted data. For all the pasts it finds it scans the raw contents against a series of yara rules looking for information that can be used -by an org or a researcher. +by an organisation or a researcher. + +For setup instructions please see the `official documentation `_ ## Supported Inputs Pastehunter currently has support for the following sites: - pastebin.com - - dumpz.org - gist.github.com - slexy.org - stackexchange # There are about 176! - - Likly to expose privlidged information - - stackoverflow - - serverfault - - superuser - - webapps - - webmasters - - dba ## Supported Outputs Pastehunter supports several output modules: - - dump to ElasticSearch DB (default) - - email sending over SMTP - - dump to JSON file - - dump to CSV file - -### SMTP -Multiple recipients can be specified, with different rulesets each. -It's possible to combine these rules using simple OR or AND logic (respectively rule_list and mandatory_rule_list). -You need to set SMTP_SECURITY in the config file to one of the following options: - - 'tls' - - 'starttls' - - 'none' - - Refer to your email provider to determine which you require. - -## PostProcess Modules -Pastehunter comes with a couple of post process modules that extact useful data from pastes or pass them to other services -The following are default modules: - - Base64 Decoders - - Cuckoo - - Viper - -## PreReqs - -### Pastebin - -You need a Pro account on pastebin that has access to the scraping API. 
-https://pastebin.com/api_scraping_faq - -### GitHub -Github needs an oauth token to stop it hitting the free ratelimit. -Create one at https://github.com/settings/tokens - -YOU DO NOT NEED TO GIVE IT ANY ACCESS PERMISSIONS - -# Installation - -## Local install - -### Elastic Search -https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html - -### Kibana -https://www.elastic.co/guide/en/kibana/current/deb.html - -### Yara -https://yara.readthedocs.io/en/latest/gettingstarted.html#compiling-and-installing-yara - -If you have yara errors check the installed version numbers for yara and yara-python match the lastest versions. - -### PasteHunter -git clone https://github.com/kevthehermit/pastehunter - -### Python / Deps -Python 3 -```pip3 install -r requirements.txt``` - -## Using Docker - -Install Docker & docker-compose - -`docker build . -t pastehunter` - -## Using Docker-compose - -### Running all the applications -Run `docker-compose up -d` - -#### Kibana - -Kibana is running only on the localhost interface on default port (5601). - -Kibana use the default login and password : `elastic` and `changme` - -Kibana is using the static IP address : 172.16.10.12 in the `esnet` network - -#### Elasticsearch - -Elasticsearch is running only on the localhost interface on default port 9200. -The mount point is `/usr/share/elasticsearch/data` by default - -if elastic search fails to start and you see "max virtual memory areas vm.max_map_count [65530] likely too low" -in the logs then try - -`sudo sysctl -w vm.max_map_count=262144` - -https://elk-docker.readthedocs.io/#troubleshooting Paragraph starting As from version 5 - -#### Pastehunter - -You can re-run the pastehunter script by doing `docker-compose up -d` -Docker-compose will use already running instances of Elasticsearch and Kibana - - -# Configure - -copy settings.json.sample to settings.json -populate the details. -For the scraping API you need to whitelist your IP on pastebin. No API key is required. See the link above - -The logging level can be set to one of the following values. - - -| Level | Numerical | -|----------|-----------| -| CRITICAL | 50 | -| ERROR | 40 | -| WARNING | 30 | -| INFO | 20 | -| DEBUG | 10 | -| NOTSET | 0 | - -The default is INFO:20 - - -# Running - -Start the application with ```python3 pastehunter.py``` - -It may be useful to run in a screen to keep it running in the background. - -## Service -If your using /opt/ make sure the user your running the service as has permissions to write to /opt/pastehunter/ -Change User and Group to match your system - -`$ cat /etc/systemd/system/pastehunter.service` - -``` -[Unit] -Description=PasteHunter + - dump to ElasticSearch DB (default). + - Email alerts (SMTP). + - Slack Channel notifications. + - Dump to JSON file. + - Dump to CSV file. + - Send to syslog. 
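+
+Each output is a small Python class that pastehunter loads from the `module` /
+`classname` pair in the config and calls with the final paste dict. A minimal
+sketch of a custom output (illustrative only - the file name, class name and
+config entry here are made up) looks like:
+
+```python
+# outputs/print_output.py - hypothetical example, not shipped with PasteHunter
+class PrintOutput():
+    def store_paste(self, paste_data):
+        # paste_data carries fields such as pasteid, pastesite and YaraRule
+        print('{0} matched {1} on {2}'.format(
+            paste_data['pasteid'], paste_data['YaraRule'], paste_data['pastesite']))
+```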
-[Service] -WorkingDirectory=/opt/PasteHunter -ExecStart=/usr/bin/python3 /opt/PasteHunter/pastehunter.py -User=localuser -Group=localuser -Restart=always +For examples of data discovered using pastehunter check out my posts `Using pastehunter https://techanarchy.net/blog/hunting-pastebin-with-pastehunter`_ and +`Pastehunter results `_ -[Install] -WantedBy=multi-user.target -``` From 16fdaf1bd6ac231eb9bf83d3686c5c4aa2246483 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 14:20:16 +0000 Subject: [PATCH 121/178] add logging for api errors in stackexchange --- inputs/stackexchange.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index a56f17b..2edb7c5 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -44,6 +44,10 @@ def recent_pastes(conf, input_history): # ToDo: Add an API rate test in here. paste_list_json = paste_list_request.json() + if "error_id" in paste_list_json: + logging.error("StackExchange API Error: {0}".format(paste_list_json['error_message'])) + return [], [] + for question in paste_list_json['items']: From b9337a4c4e550b344e7845df3346671384c5c4e8 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 14:20:36 +0000 Subject: [PATCH 122/178] Update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 22bd29e..4045c87 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Pastehunter supports several output modules: - Dump to CSV file. - Send to syslog. -For examples of data discovered using pastehunter check out my posts `Using pastehunter https://techanarchy.net/blog/hunting-pastebin-with-pastehunter`_ and +For examples of data discovered using pastehunter check out my posts `Using pastehunter `_ and `Pastehunter results `_ From 114a4ba8b1ecf9fad2b6f63a2cda2964a0ee120d Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 14:47:35 +0000 Subject: [PATCH 123/178] Reduce the number of API calls for stackexchange input --- YaraRules/index.yar | 1 + docs/inputs.rst | 2 +- inputs/stackexchange.py | 7 ------- pastehunter.py | 7 ++----- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/YaraRules/index.yar b/YaraRules/index.yar index 258864d..2319b39 100644 --- a/YaraRules/index.yar +++ b/YaraRules/index.yar @@ -4,6 +4,7 @@ include "email_filter.yar" include "blacklist.yar" include "base64.yar" include "database.yar" +include "CryptoExchangeApi.yar" include "hak5.yar" include "core_keywords.yar" include "password_leak.yar" diff --git a/docs/inputs.rst b/docs/inputs.rst index 5af237b..a344d96 100644 --- a/docs/inputs.rst +++ b/docs/inputs.rst @@ -66,6 +66,6 @@ There are over 170 exchanges that form stackexchange. The following list is the - **site_list**: List of site shorttitles that will be scraped. - **api_key**: API App key as generated above. -- **store_filter**: This is the stackexchange filter that determines what fields are returned. +- **store_filter**: This is the stackexchange filter that determines what fields are returned. It must contain the body element. - **pagesize**: How many questions to pull from the latest list. - **store_all**: Store all pastes regardless of a rule match. 
\ No newline at end of file diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index 2edb7c5..152c3c0 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -67,13 +67,6 @@ def recent_pastes(conf, input_history): # Add a date field that kibana will map date = datetime.utcfromtimestamp(float(question_data['creation_date'])).isoformat() question_data['@timestamp'] = date - question_data['scrape_url'] = "{0}/{1}?key={2}&order=desc&sort=activity&site={3}&filter={4}".format( - api_scrape, - question['question_id'], - api_key, - site, - question_body_filter - ) paste_list.append(question_data) diff --git a/pastehunter.py b/pastehunter.py index ec676f9..d2e486c 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -142,15 +142,12 @@ def paste_scanner(): # Stack questions dont have a raw endpoint if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): - json_body = requests.get(raw_paste_uri).json() + # The body is already included in the first request so we do not need a second call to the API. # Unescape the code block strings in the json body. - raw_body = json_body['items'][0]['body'] + raw_body = paste_data['body'] raw_paste_data = unquote_plus(raw_body) - # URL has the API Key in it so make sure it is removed before store. - del paste_data['scrape_url'] - else: raw_paste_data = requests.get(raw_paste_uri).text From 40768f0510fdce714f26106885885ed9f4ae12f4 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 14:48:20 +0000 Subject: [PATCH 124/178] New StackExcahnge filter to include body in API request --- settings.json.sample | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.json.sample b/settings.json.sample index 828a04e..e24816f 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -39,7 +39,7 @@ "module": "inputs.stackexchange", "site_list": ["stackoverflow","serverfault", "superuser", "webapps", "webmasters", "dba"], "api_key": "", - "store_filter": "!LZg2mkNj0UY)iKNdTbVP4i", + "store_filter": "!)r_ttsG0v3bE1vo3*8Ki", "pagesize": 100, "store_all": true, "api_scrape": "https://api.stackexchange.com/2.2/questions" From 3563340f78770e0ad6568ec7b5ada47993533c15 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 15:10:24 +0000 Subject: [PATCH 125/178] Remove duplicate body key --- pastehunter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pastehunter.py b/pastehunter.py index d2e486c..8bbc7a9 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -148,6 +148,9 @@ def paste_scanner(): raw_body = paste_data['body'] raw_paste_data = unquote_plus(raw_body) + # now remove the old body key as we dont need it any more + del paste_data['body'] + else: raw_paste_data = requests.get(raw_paste_uri).text From ae9b1bd5392882467102cd8093eb3eae6c578ad1 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 15:14:16 +0000 Subject: [PATCH 126/178] Fix key error in stackexchange --- pastehunter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 8bbc7a9..424dbca 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -136,8 +136,7 @@ def paste_scanner(): logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them - raw_paste_uri = paste_data['scrape_url'] - # Cover fetch site SSLErrors + try: # Stack questions dont have a raw endpoint @@ -152,8 +151,10 @@ def paste_scanner(): del paste_data['body'] else: + raw_paste_uri = 
paste_data['scrape_url'] raw_paste_data = requests.get(raw_paste_uri).text + # Cover fetch site SSLErrors except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" From e1a752c8f90a6f27ddc88dd7aa7b7f0422d54476 Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 18:07:28 +0000 Subject: [PATCH 127/178] 1.0 Release --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 424dbca..cd6a0f0 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -18,7 +18,7 @@ from multiprocessing import Queue -VERSION = 0.2 +VERSION = 1.0 # Setup Default logging logger = logging.getLogger('pastehunter') From 3f04e789d6a32866ce897cce47ea7e1619dc204b Mon Sep 17 00:00:00 2001 From: thehermit Date: Sun, 10 Feb 2019 18:11:35 +0000 Subject: [PATCH 128/178] Fix links in readme --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4045c87..effe0ba 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # PasteHunter -PasteHunter is a python3 application that is designed to query a collection of sites that host publicliy pasted data. -For all the pasts it finds it scans the raw contents against a series of yara rules looking for information that can be used +PasteHunter is a python3 application that is designed to query a collection of sites that host publicly pasted data. +For all the pasts it finds it scans the raw contents against a series of Yara rules looking for information that can be used by an organisation or a researcher. -For setup instructions please see the `official documentation `_ +For setup instructions please see the official documentation https://pastehunter.readthedocs.io/en/latest/installation.html ## Supported Inputs Pastehunter currently has support for the following sites: @@ -21,7 +21,4 @@ Pastehunter supports several output modules: - Dump to CSV file. - Send to syslog. -For examples of data discovered using pastehunter check out my posts `Using pastehunter `_ and -`Pastehunter results `_ - - +For examples of data discovered using pastehunter check out my posts https://techanarchy.net/blog/hunting-pastebin-with-pastehunter and https://techanarchy.net/blog/pastehunter-the-results From ffe8f68be23d463d63e2812bbc3873f7e2792992 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 10 Mar 2019 23:13:39 +0000 Subject: [PATCH 129/178] Force type for pasteid to avoid elastic search index errors --- inputs/stackexchange.py | 3 ++- outputs/elastic_output.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index 152c3c0..aa5ef07 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -59,7 +59,8 @@ def recent_pastes(conf, input_history): # Create a new question dict for us to normalize question_data = question question_data['confname'] = "stackexchange" - question_data['pasteid'] = question['question_id'] + # Force type to string else it breaks ES Index mappings + question_data['pasteid'] = str(question['question_id']) question_data['pastesite'] = site # Get the author and then trim the data we store. 
question_data['username'] = question['owner']['display_name'] diff --git a/outputs/elastic_output.py b/outputs/elastic_output.py index 3d54bcf..12eccbe 100644 --- a/outputs/elastic_output.py +++ b/outputs/elastic_output.py @@ -33,11 +33,15 @@ def store_paste(self, paste_data): week_number = datetime.date(datetime.now()).isocalendar()[1] index_name = '{0}-{1}-{2}'.format(index_name, year_number, week_number) # ToDo: With multiple paste sites a pasteid collision is more likly! - self.es.index(index=index_name, doc_type='paste', id=paste_data['pasteid'], body=paste_data) - logger.debug("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], - paste_data['pasteid'], - paste_data['YaraRule'] - ) - ) + try: + pasteid = str(paste_data['pasteid']) + self.es.index(index=index_name, doc_type='paste', id=pasteid, body=paste_data) + logger.debug("Stored {0} Paste {1}, Matched Rule {2}".format(paste_data['pastesite'], + paste_data['pasteid'], + paste_data['YaraRule'] + ) + ) + except Exception as e: + logger.error(e) else: logger.error("Elastic Search Enabled, not configured!") From 6f8a086b26a5729047be7930b1229bf22ff42806 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 7 Apr 2019 18:35:00 +0000 Subject: [PATCH 130/178] Add Timout to processes --- pastehunter.py | 274 ++++++++++++++++++++++++++----------------------- 1 file changed, 147 insertions(+), 127 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index cd6a0f0..0ffdfc2 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -11,11 +11,14 @@ import logging from logging import handlers import time +import errno +import signal from time import sleep from urllib.parse import unquote_plus from common import parse_config from postprocess import post_email + from multiprocessing import Queue VERSION = 1.0 @@ -40,6 +43,23 @@ if not conf: sys.exit() +class TimeoutError(Exception): + pass + +class timeout: + def __init__(self, seconds=1, error_message='Timeout'): + self.seconds = seconds + self.error_message = error_message + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + def __exit__(self, type, value, traceback): + signal.alarm(0) + + + # Set up the log file if "log" in conf and conf["log"]["log_to_file"]: if conf["log"]["log_path"] != "": @@ -129,139 +149,137 @@ def paste_scanner(): # scan the Paste # Store the Paste while True: - paste_data = q.get() - - # Start a timer - start_time = time.time() - logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - # get raw paste and hash them - - - try: - - # Stack questions dont have a raw endpoint - if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): - # The body is already included in the first request so we do not need a second call to the API. - - # Unescape the code block strings in the json body. - raw_body = paste_data['body'] - raw_paste_data = unquote_plus(raw_body) - - # now remove the old body key as we dont need it any more - del paste_data['body'] - - else: - raw_paste_uri = paste_data['scrape_url'] - raw_paste_data = requests.get(raw_paste_uri).text - - # Cover fetch site SSLErrors - except requests.exceptions.SSLError as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - raw_paste_data = "" - - # Pastebin Cache - if raw_paste_data == "File is not ready for scraping yet. 
Try again in 1 minute.": - logger.info("Paste is still cached sleeping to try again") - sleep(45) + with timeout(seconds=3): + paste_data = q.get() + # Start a timer + start_time = time.time() + logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them - raw_paste_uri = paste_data['scrape_url'] - # Cover fetch site SSLErrors try: - raw_paste_data = requests.get(raw_paste_uri).text + + # Stack questions dont have a raw endpoint + if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): + # The body is already included in the first request so we do not need a second call to the API. + + # Unescape the code block strings in the json body. + raw_body = paste_data['body'] + raw_paste_data = unquote_plus(raw_body) + + # now remove the old body key as we dont need it any more + del paste_data['body'] + + else: + raw_paste_uri = paste_data['scrape_url'] + raw_paste_data = requests.get(raw_paste_uri).text + + # Cover fetch site SSLErrors except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" - # Process the paste data here - try: - # Scan with yara - matches = rules.match(data=raw_paste_data) - except Exception as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - continue - - results = [] - for match in matches: - # For keywords get the word from the matched string - if match.rule == 'core_keywords' or match.rule == 'custom_keywords': - for s in match.strings: - rule_match = s[1].lstrip('$') - if rule_match not in results: - results.append(rule_match) - results.append(str(match.rule)) - - # But a break in here for the base64. Will use it later. - elif match.rule.startswith('b64'): - results.append(match.rule) - - # Else use the rule name - else: - results.append(match.rule) - - # Store all OverRides other options. - paste_site = paste_data['confname'] - store_all = conf['inputs'][paste_site]['store_all'] - # remove the confname key as its not really needed past this point - del paste_data['confname'] - - - # Blacklist Check - # If any of the blacklist rules appear then empty the result set - blacklisted = False - if conf['yara']['blacklist'] and 'blacklist' in results: - results = [] - blacklisted = True - logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - - - # Post Process - - # If post module is enabled and the paste has a matching rule. - post_results = paste_data - for post_process, post_values in conf["post_process"].items(): - if post_values["enabled"]: - if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - if not blacklisted: - logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) - post_module = importlib.import_module(post_values["module"]) - post_results = post_module.run(results, - raw_paste_data, - paste_data - ) - - # Throw everything back to paste_data for ease. - paste_data = post_results - - - # If we have a result add some meta data and send to storage - # If results is empty, ie no match, and store_all is True, - # then append "no_match" to results. This will then force output. 
- - if store_all is True: - if len(results) == 0: - results.append('no_match') - - if len(results) > 0: - - encoded_paste_data = raw_paste_data.encode('utf-8') - md5 = hashlib.md5(encoded_paste_data).hexdigest() - sha256 = hashlib.sha256(encoded_paste_data).hexdigest() - paste_data['MD5'] = md5 - paste_data['SHA256'] = sha256 - paste_data['raw_paste'] = raw_paste_data - paste_data['YaraRule'] = results - # Set the size for all pastes - This will override any size set by the source - paste_data['size'] = len(raw_paste_data) - for output in outputs: + + # Pastebin Cache + if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.": + logger.info("Paste is still cached sleeping to try again") + sleep(45) + # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] + # Cover fetch site SSLErrors try: - output.store_paste(paste_data) - except Exception as e: - logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) - - end_time = time.time() - logger.debug("Processing Finished for {0} in {1} seconds".format( - paste_data["pasteid"], - (end_time - start_time) - )) + raw_paste_data = requests.get(raw_paste_uri).text + except requests.exceptions.SSLError as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + raw_paste_data = "" + # Process the paste data here + try: + # Scan with yara + matches = rules.match(data=raw_paste_data) + except Exception as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + continue + + results = [] + for match in matches: + # For keywords get the word from the matched string + if match.rule == 'core_keywords' or match.rule == 'custom_keywords': + for s in match.strings: + rule_match = s[1].lstrip('$') + if rule_match not in results: + results.append(rule_match) + results.append(str(match.rule)) + + # But a break in here for the base64. Will use it later. + elif match.rule.startswith('b64'): + results.append(match.rule) + + # Else use the rule name + else: + results.append(match.rule) + + # Store all OverRides other options. + paste_site = paste_data['confname'] + store_all = conf['inputs'][paste_site]['store_all'] + # remove the confname key as its not really needed past this point + del paste_data['confname'] + + + # Blacklist Check + # If any of the blacklist rules appear then empty the result set + blacklisted = False + if conf['yara']['blacklist'] and 'blacklist' in results: + results = [] + blacklisted = True + logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + + + # Post Process + + # If post module is enabled and the paste has a matching rule. + post_results = paste_data + for post_process, post_values in conf["post_process"].items(): + if post_values["enabled"]: + if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: + if not blacklisted: + logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) + post_module = importlib.import_module(post_values["module"]) + post_results = post_module.run(results, + raw_paste_data, + paste_data + ) + + # Throw everything back to paste_data for ease. + paste_data = post_results + + + # If we have a result add some meta data and send to storage + # If results is empty, ie no match, and store_all is True, + # then append "no_match" to results. This will then force output. 
+ + if store_all is True: + if len(results) == 0: + results.append('no_match') + + if len(results) > 0: + + encoded_paste_data = raw_paste_data.encode('utf-8') + md5 = hashlib.md5(encoded_paste_data).hexdigest() + sha256 = hashlib.sha256(encoded_paste_data).hexdigest() + paste_data['MD5'] = md5 + paste_data['SHA256'] = sha256 + paste_data['raw_paste'] = raw_paste_data + paste_data['YaraRule'] = results + # Set the size for all pastes - This will override any size set by the source + paste_data['size'] = len(raw_paste_data) + for output in outputs: + try: + output.store_paste(paste_data) + except Exception as e: + logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) + + end_time = time.time() + logger.debug("Processing Finished for {0} in {1} seconds".format( + paste_data["pasteid"], + (end_time - start_time) + )) @@ -295,6 +313,8 @@ def paste_scanner(): try: while True: queue_count = 0 + + # Check if the processors are active # Paste History logger.info("Populating Queue") if os.path.exists('paste_history.tmp'): From 6188bf0164d05ddc39292feccb0d74ab2241c914 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Mon, 8 Apr 2019 17:41:55 +0000 Subject: [PATCH 131/178] increase timeout --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 0ffdfc2..1cd7b01 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -149,7 +149,7 @@ def paste_scanner(): # scan the Paste # Store the Paste while True: - with timeout(seconds=3): + with timeout(seconds=10): paste_data = q.get() # Start a timer start_time = time.time() From 62f262b571c7e4df8caef4c27537e25aeeadc318 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Mon, 8 Apr 2019 17:48:47 +0000 Subject: [PATCH 132/178] Stop timout on empty queue --- pastehunter.py | 248 +++++++++++++++++++++++++------------------------ 1 file changed, 125 insertions(+), 123 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 1cd7b01..d89e9a5 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -149,137 +149,139 @@ def paste_scanner(): # scan the Paste # Store the Paste while True: - with timeout(seconds=10): + while not q.empty(): paste_data = q.get() - # Start a timer - start_time = time.time() - logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - # get raw paste and hash them - try: + with timeout(seconds=10): - # Stack questions dont have a raw endpoint - if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): - # The body is already included in the first request so we do not need a second call to the API. - - # Unescape the code block strings in the json body. - raw_body = paste_data['body'] - raw_paste_data = unquote_plus(raw_body) - - # now remove the old body key as we dont need it any more - del paste_data['body'] - - else: - raw_paste_uri = paste_data['scrape_url'] - raw_paste_data = requests.get(raw_paste_uri).text - - # Cover fetch site SSLErrors - except requests.exceptions.SSLError as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - raw_paste_data = "" - - # Pastebin Cache - if raw_paste_data == "File is not ready for scraping yet. 
Try again in 1 minute.": - logger.info("Paste is still cached sleeping to try again") - sleep(45) + # Start a timer + start_time = time.time() + logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) # get raw paste and hash them - raw_paste_uri = paste_data['scrape_url'] - # Cover fetch site SSLErrors try: - raw_paste_data = requests.get(raw_paste_uri).text + + # Stack questions dont have a raw endpoint + if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): + # The body is already included in the first request so we do not need a second call to the API. + + # Unescape the code block strings in the json body. + raw_body = paste_data['body'] + raw_paste_data = unquote_plus(raw_body) + + # now remove the old body key as we dont need it any more + del paste_data['body'] + + else: + raw_paste_uri = paste_data['scrape_url'] + raw_paste_data = requests.get(raw_paste_uri).text + + # Cover fetch site SSLErrors except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" - # Process the paste data here - try: - # Scan with yara - matches = rules.match(data=raw_paste_data) - except Exception as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - continue - - results = [] - for match in matches: - # For keywords get the word from the matched string - if match.rule == 'core_keywords' or match.rule == 'custom_keywords': - for s in match.strings: - rule_match = s[1].lstrip('$') - if rule_match not in results: - results.append(rule_match) - results.append(str(match.rule)) - - # But a break in here for the base64. Will use it later. - elif match.rule.startswith('b64'): - results.append(match.rule) - - # Else use the rule name - else: - results.append(match.rule) - - # Store all OverRides other options. - paste_site = paste_data['confname'] - store_all = conf['inputs'][paste_site]['store_all'] - # remove the confname key as its not really needed past this point - del paste_data['confname'] - - - # Blacklist Check - # If any of the blacklist rules appear then empty the result set - blacklisted = False - if conf['yara']['blacklist'] and 'blacklist' in results: - results = [] - blacklisted = True - logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - - - # Post Process - - # If post module is enabled and the paste has a matching rule. - post_results = paste_data - for post_process, post_values in conf["post_process"].items(): - if post_values["enabled"]: - if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - if not blacklisted: - logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) - post_module = importlib.import_module(post_values["module"]) - post_results = post_module.run(results, - raw_paste_data, - paste_data - ) - - # Throw everything back to paste_data for ease. - paste_data = post_results - - - # If we have a result add some meta data and send to storage - # If results is empty, ie no match, and store_all is True, - # then append "no_match" to results. This will then force output. 
- - if store_all is True: - if len(results) == 0: - results.append('no_match') - - if len(results) > 0: - - encoded_paste_data = raw_paste_data.encode('utf-8') - md5 = hashlib.md5(encoded_paste_data).hexdigest() - sha256 = hashlib.sha256(encoded_paste_data).hexdigest() - paste_data['MD5'] = md5 - paste_data['SHA256'] = sha256 - paste_data['raw_paste'] = raw_paste_data - paste_data['YaraRule'] = results - # Set the size for all pastes - This will override any size set by the source - paste_data['size'] = len(raw_paste_data) - for output in outputs: + + # Pastebin Cache + if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.": + logger.info("Paste is still cached sleeping to try again") + sleep(45) + # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] + # Cover fetch site SSLErrors try: - output.store_paste(paste_data) - except Exception as e: - logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) - - end_time = time.time() - logger.debug("Processing Finished for {0} in {1} seconds".format( - paste_data["pasteid"], - (end_time - start_time) - )) + raw_paste_data = requests.get(raw_paste_uri).text + except requests.exceptions.SSLError as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + raw_paste_data = "" + # Process the paste data here + try: + # Scan with yara + matches = rules.match(data=raw_paste_data) + except Exception as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + continue + + results = [] + for match in matches: + # For keywords get the word from the matched string + if match.rule == 'core_keywords' or match.rule == 'custom_keywords': + for s in match.strings: + rule_match = s[1].lstrip('$') + if rule_match not in results: + results.append(rule_match) + results.append(str(match.rule)) + + # But a break in here for the base64. Will use it later. + elif match.rule.startswith('b64'): + results.append(match.rule) + + # Else use the rule name + else: + results.append(match.rule) + + # Store all OverRides other options. + paste_site = paste_data['confname'] + store_all = conf['inputs'][paste_site]['store_all'] + # remove the confname key as its not really needed past this point + del paste_data['confname'] + + + # Blacklist Check + # If any of the blacklist rules appear then empty the result set + blacklisted = False + if conf['yara']['blacklist'] and 'blacklist' in results: + results = [] + blacklisted = True + logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + + + # Post Process + + # If post module is enabled and the paste has a matching rule. + post_results = paste_data + for post_process, post_values in conf["post_process"].items(): + if post_values["enabled"]: + if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: + if not blacklisted: + logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) + post_module = importlib.import_module(post_values["module"]) + post_results = post_module.run(results, + raw_paste_data, + paste_data + ) + + # Throw everything back to paste_data for ease. + paste_data = post_results + + + # If we have a result add some meta data and send to storage + # If results is empty, ie no match, and store_all is True, + # then append "no_match" to results. This will then force output. 
+ + if store_all is True: + if len(results) == 0: + results.append('no_match') + + if len(results) > 0: + + encoded_paste_data = raw_paste_data.encode('utf-8') + md5 = hashlib.md5(encoded_paste_data).hexdigest() + sha256 = hashlib.sha256(encoded_paste_data).hexdigest() + paste_data['MD5'] = md5 + paste_data['SHA256'] = sha256 + paste_data['raw_paste'] = raw_paste_data + paste_data['YaraRule'] = results + # Set the size for all pastes - This will override any size set by the source + paste_data['size'] = len(raw_paste_data) + for output in outputs: + try: + output.store_paste(paste_data) + except Exception as e: + logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) + + end_time = time.time() + logger.debug("Processing Finished for {0} in {1} seconds".format( + paste_data["pasteid"], + (end_time - start_time) + )) From b5091463a24c357d491e909966f2d95069ad7487 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Tue, 9 Apr 2019 20:16:43 +0000 Subject: [PATCH 133/178] Restart dead processes --- pastehunter.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index d89e9a5..866d520 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -51,7 +51,8 @@ def __init__(self, seconds=1, error_message='Timeout'): self.seconds = seconds self.error_message = error_message def handle_timeout(self, signum, frame): - raise TimeoutError(self.error_message) + print("Process timeout: {0}".format(self.error_message)) + sys.exit(0) def __enter__(self): signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) @@ -151,8 +152,7 @@ def paste_scanner(): while True: while not q.empty(): paste_data = q.get() - with timeout(seconds=10): - + with timeout(seconds=5): # Start a timer start_time = time.time() logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) @@ -304,17 +304,27 @@ def paste_scanner(): q = Queue() processes = [] - # Threads - for i in range(5): - m = multiprocessing.Process(target=paste_scanner) - # Add new process to list so we can run join on them later. - processes.append(m) - m.start() - # Now Fill the Queue try: while True: queue_count = 0 + counter = 0 + if len(processes) < 5: + for i in range(5-len(processes)): + logger.warning("Creating New Process") + m = multiprocessing.Process(target=paste_scanner) + # Add new process to list so we can run join on them later. + processes.append(m) + m.start() + for process in processes: + if not process.is_alive(): + logger.warning("Restarting Dead Process") + del processes[counter] + m = multiprocessing.Process(target=paste_scanner) + # Add new process to list so we can run join on them later. 
+ processes.append(m) + m.start() + counter += 1 # Check if the processors are active # Paste History From 3f33924a1889b4fdb9f0a98de9474cdb4ad215d7 Mon Sep 17 00:00:00 2001 From: Matt Kalinowski Date: Thu, 11 Apr 2019 17:13:38 -0500 Subject: [PATCH 134/178] enable 'all' special-rule and include scrape_url in message for outputs/slack --- outputs/slack_output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outputs/slack_output.py b/outputs/slack_output.py index 91e5f3f..1b003a2 100644 --- a/outputs/slack_output.py +++ b/outputs/slack_output.py @@ -24,7 +24,7 @@ def __init__(self): def store_paste(self, paste_data): if self.valid: - send = False + send = ('all' in self.accepted_rules) for rule in self.accepted_rules: if rule in paste_data['YaraRule']: @@ -38,7 +38,7 @@ def store_paste(self, paste_data): "fallback": "Plan a vacation", "author_name": "PasteHunter", "title": "Paste ID {0}".format(paste_data['pasteid']), - "text": "Yara Rule {0} Found on {1}".format(paste_data['YaraRule'], paste_data['pastesite']) + "text": "Yara Rule {0} Found on {1}\n\r{2}".format(paste_data['YaraRule'], paste_data['pastesite'], paste_data['scrape_url']) } ] } From ea6c883d7f90e88b1f55e64d16d29fed17e4fdbc Mon Sep 17 00:00:00 2001 From: Gabriel Ricci Date: Mon, 6 May 2019 12:39:17 -0300 Subject: [PATCH 135/178] Instead of ./YaraRules get path from settings.json --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 866d520..cdf847d 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -128,7 +128,7 @@ def __exit__(self, type, value, traceback): def yara_index(rule_path, blacklist, test_rules): index_file = os.path.join(rule_path, 'index.yar') with open(index_file, 'w') as yar: - for filename in os.listdir('YaraRules'): + for filename in os.listdir(rule_path): if filename.endswith('.yar') and filename != 'index.yar': if filename == 'blacklist.yar': if blacklist: From a3c503b14252a6605590101b0aa08eee8daac9db Mon Sep 17 00:00:00 2001 From: Joel Snape Date: Mon, 13 May 2019 12:40:52 +0100 Subject: [PATCH 136/178] changed queue checking logic to prevent busy-looping --- pastehunter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 866d520..84ba758 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -150,7 +150,10 @@ def paste_scanner(): # scan the Paste # Store the Paste while True: - while not q.empty(): + if q.empty(): + # Queue was empty, sleep to prevent busy loop + sleep(0.5) + else: paste_data = q.get() with timeout(seconds=5): # Start a timer From 42e94d89a6a0e4545c98f37e337b96bfb8b3a11b Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sat, 18 May 2019 22:49:26 +0100 Subject: [PATCH 137/178] Clean some false positive rules --- YaraRules/api_keys.yar | 1 + settings.json.sample | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/YaraRules/api_keys.yar b/YaraRules/api_keys.yar index 1b1a252..1f4efd4 100644 --- a/YaraRules/api_keys.yar +++ b/YaraRules/api_keys.yar @@ -16,6 +16,7 @@ rule generic_api $hash64 = /\b[a-fA-F\d]{64}\b/ $n1 = "#EXTINF" $n2 = "m3u8" + $n3 = "Chocolatey is running" condition: (any of ($a*)) and (any of ($hash*)) and (not any of ($n*)) diff --git a/settings.json.sample b/settings.json.sample index e24816f..19d827c 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -24,7 +24,7 @@ "api_limit": 100, "store_all": false, "user_blacklist": [], - "file_blacklist": ["grahamcofborg-eval-package-list"] + "file_blacklist": 
["grahamcofborg-eval-package-list", "Changed Paths"] }, "slexy":{ "enabled": true, From 1682515a26c7ae1ac86afa15c1455e3ec23f4feb Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sat, 25 May 2019 17:24:57 +0100 Subject: [PATCH 138/178] Add twilio Output --- outputs/twilio_output.py | 59 ++++++++++++++++++++++++++++++++++++++++ settings.json.sample | 12 +++++++- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 outputs/twilio_output.py diff --git a/outputs/twilio_output.py b/outputs/twilio_output.py new file mode 100644 index 0000000..cc69a31 --- /dev/null +++ b/outputs/twilio_output.py @@ -0,0 +1,59 @@ +import logging +from twilio.rest import Client +from common import parse_config + +logger = logging.getLogger('pastehunter') +config = parse_config() + +class TwilioOutput(): + def __init__(self): + self.account_sid = config['outputs']['twilio_output']['account_sid'] + self.auth_token = config['outputs']['twilio_output']['auth_token'] + self.twilio_sender = config['outputs']['twilio_output']['twilio_sender'] + self.recipient_list = config['outputs']['twilio_output']['recipient_list'] + self.message_type = 'sms' # Whatsapp is still in beta on twilio. + try: + self.client = Client(self.account_sid, self.auth_token) + self.test = True + except Exception as e: + logging.error("Unable to create twilio Client: {0}".format(e)) + self.test = False + + + def store_paste(self, paste_data): + if self.test: + + message_body = "Yara Rule {0} Found on {1}\n\r{2}".format( + paste_data['YaraRule'], + paste_data['pastesite'], + paste_data['scrape_url'] + ) + + if self.message_type == 'sms': + for recipient in self.recipient_list: + try: + message = self.client.messages.create( + from_=self.twilio_sender, + body=message_body, + to=recipient + ) + logging.debug("Sent twilio message with ID: {0}".format(message.sid)) + except Exception as e: + logging.error(e) + + elif self.message_type == 'whatsapp': + for recipient in self.recipient_list: + try: + message = self.client.messages.create( + from_='whatsapp:{0}'.format(self.twilio_sender), + body=message_body, + to='whatsapp:{0}'.format(recipient) + ) + logging.debug("Sent twilio message with ID: {0}".format(message.sid)) + except Exception as e: + logging.error(e) + else: + logging.error("No Valid twilio message type found") + + else: + logging.error("Unable to send twilio message") diff --git a/settings.json.sample b/settings.json.sample index 19d827c..d1fbe35 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -102,11 +102,21 @@ } }, "slack_output": { - "enabled": true, + "enabled": false, "module": "outputs.slack_output", "classname": "SlackOutput", "webhook_url": "", "rule_list": ["custom_keywords"] + }, + "twilio_output": { + "enabled": false, + "module": "outputs.twilio_output", + "classname": "TwilioOutput", + "account_sid": "", + "auth_token": "", + "twilio_sender": "", + "recipient_list": [] + "rule_list": ["custom_keywords"] } }, "yara": { From a6cbb8489efcae9b996804588ac95d8ef425f517 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sat, 25 May 2019 17:55:24 +0100 Subject: [PATCH 139/178] Fix twilio only send on user set rule list. 
--- outputs/twilio_output.py | 75 ++++++++++++++++++++++------------------ settings.json.sample | 2 +- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/outputs/twilio_output.py b/outputs/twilio_output.py index cc69a31..3068bae 100644 --- a/outputs/twilio_output.py +++ b/outputs/twilio_output.py @@ -11,6 +11,7 @@ def __init__(self): self.auth_token = config['outputs']['twilio_output']['auth_token'] self.twilio_sender = config['outputs']['twilio_output']['twilio_sender'] self.recipient_list = config['outputs']['twilio_output']['recipient_list'] + self.accepted_rules = config['outputs']['twilio_output']['rule_list'] self.message_type = 'sms' # Whatsapp is still in beta on twilio. try: self.client = Client(self.account_sid, self.auth_token) @@ -23,37 +24,43 @@ def __init__(self): def store_paste(self, paste_data): if self.test: - message_body = "Yara Rule {0} Found on {1}\n\r{2}".format( - paste_data['YaraRule'], - paste_data['pastesite'], - paste_data['scrape_url'] - ) - - if self.message_type == 'sms': - for recipient in self.recipient_list: - try: - message = self.client.messages.create( - from_=self.twilio_sender, - body=message_body, - to=recipient - ) - logging.debug("Sent twilio message with ID: {0}".format(message.sid)) - except Exception as e: - logging.error(e) - - elif self.message_type == 'whatsapp': - for recipient in self.recipient_list: - try: - message = self.client.messages.create( - from_='whatsapp:{0}'.format(self.twilio_sender), - body=message_body, - to='whatsapp:{0}'.format(recipient) - ) - logging.debug("Sent twilio message with ID: {0}".format(message.sid)) - except Exception as e: - logging.error(e) - else: - logging.error("No Valid twilio message type found") - - else: - logging.error("Unable to send twilio message") + + send = ('all' in self.accepted_rules) + + for rule in self.accepted_rules: + if rule in paste_data['YaraRule']: + send = True + + if send: + message_body = "Yara Rule {0} Found on {1}\n\r{2}".format( + paste_data['YaraRule'], + paste_data['pastesite'], + paste_data['scrape_url'] + ) + + print("Sending Twilio Message") + if self.message_type == 'sms': + for recipient in self.recipient_list: + try: + message = self.client.messages.create( + from_=self.twilio_sender, + body=message_body, + to=recipient + ) + logging.debug("Sent twilio message with ID: {0}".format(message.sid)) + except Exception as e: + logging.error(e) + + elif self.message_type == 'whatsapp': + for recipient in self.recipient_list: + try: + message = self.client.messages.create( + from_='whatsapp:{0}'.format(self.twilio_sender), + body=message_body, + to='whatsapp:{0}'.format(recipient) + ) + logging.debug("Sent twilio message with ID: {0}".format(message.sid)) + except Exception as e: + logging.error(e) + else: + logging.error("No Valid twilio message type found") diff --git a/settings.json.sample b/settings.json.sample index d1fbe35..ac19b9a 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -115,7 +115,7 @@ "account_sid": "", "auth_token": "", "twilio_sender": "", - "recipient_list": [] + "recipient_list": [], "rule_list": ["custom_keywords"] } }, From 4c755bf2dc6091387c75e0dc1713d46f842809dc Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Thu, 30 May 2019 21:10:34 +0100 Subject: [PATCH 140/178] Add pasteid to email attachment --- outputs/smtp_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outputs/smtp_output.py b/outputs/smtp_output.py index 6ade375..6090511 100644 --- a/outputs/smtp_output.py +++ b/outputs/smtp_output.py 
@@ -54,7 +54,7 @@ def _send_mail(self, send_to_address, paste_data):
         json_body = json.dumps(paste_data)
         attachment.set_payload(json_body)
         email.encoders.encode_base64(attachment)
-        attachment.add_header('Content-Disposition', 'attachment; filename="Alert.json"')
+        attachment.add_header('Content-Disposition', 'attachment; filename="Alert-{0}.json"'.format(paste_data['pasteid']))
         msg.attach(attachment)
 
         # Connect to the SMTP server and send

From 188badf8df0df323ac99efa3cd127695ced1166a Mon Sep 17 00:00:00 2001
From: Andrew Cabey Date: Thu, 6 Jun 2019 09:13:09 -0400
Subject: [PATCH 141/178] README typo

Corrects "pasts" to "pastes"
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index effe0ba..88f9b0b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PasteHunter
 PasteHunter is a python3 application that is designed to query a collection of sites that host publicly pasted data.
-For all the pasts it finds it scans the raw contents against a series of Yara rules looking for information that can be used
+For all the pastes it finds it scans the raw contents against a series of Yara rules looking for information that can be used
 by an organisation or a researcher.
 
 For setup instructions please see the official documentation https://pastehunter.readthedocs.io/en/latest/installation.html

From 0031955eb5953be2bac8a0d94d2ea2250b7f360e Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Fri, 21 Jun 2019 12:09:14 -0700
Subject: [PATCH 142/178] Added support for compressing paste contents

---
 docs/outputs.rst       |  1 +
 outputs/json_output.py | 18 ++++++++++++++++++
 settings.json.sample   |  1 +
 3 files changed, 20 insertions(+)

diff --git a/docs/outputs.rst b/docs/outputs.rst
index f3ca3a4..59e25d0 100644
--- a/docs/outputs.rst
+++ b/docs/outputs.rst
@@ -27,6 +27,7 @@ This output module will store each paste in a json file on disk. The name of the
 
 - **output_path**: Path on disk to store output files.
 - **store_raw**: Include the raw paste in the json file. False just stores metadata.
+- **compress_raw**: Compresses the data using LZMA if it will reduce the size. Can be decompressed by base64-decoding, then using the `xz command `_.
 - **encode_raw**: Ignored, Reserved for future usage.
 
 CSV

diff --git a/outputs/json_output.py b/outputs/json_output.py
index 7158bd2..1254b06 100644
--- a/outputs/json_output.py
+++ b/outputs/json_output.py
@@ -1,3 +1,5 @@
+import base64
+import lzma
 import os
 import datetime
 import json
@@ -25,6 +27,22 @@ def __init__(self):
     def store_paste(self, paste_data):
         if not config['outputs']['json_output']['store_raw']:
             del paste_data['raw_paste']
+        elif config['outputs']['json_output']['compress_raw']:
+            original = paste_data['raw_paste']
+            orig_size = len(original.encode())
+            logger.debug("Compressing paste... Pre-compression size: {0}".format(orig_size))
+            compressed = base64.b64encode(lzma.compress(paste_data['raw_paste'].encode()))
+            compressed_size = len(compressed)
+            logger.debug("Compressing paste... Post-compression size: {0}".format(compressed_size))
+
+            # In some cases compressed blobs may be larger
+            # if not much data is compressed
+            if orig_size > compressed_size:
+                paste_data['raw_paste'] = compressed.decode('utf-8')
+                logger.debug("Compressed data smaller than original blob. Keeping compressed.")
+            else:
+                logger.debug("Original smaller than compressed blob. Keeping original.")
+
         if self.test:
             json_file = os.path.join(self.json_path, str(paste_data['pasteid']))

diff --git a/settings.json.sample b/settings.json.sample
index ac19b9a..cc3c0b5 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -64,6 +64,7 @@
         "classname": "JsonOutput",
         "output_path": "logs/json/",
         "store_raw": true,
+        "compress_raw": true,
         "encode_raw": true
     },
     "csv_output": {
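For reference, a blob stored by the branch above can be recovered in Python as well
as with xz; a minimal sketch, assuming store_raw and compress_raw are both enabled
and the blob was kept compressed:

    import base64
    import lzma

    def decompress_raw(stored_blob):
        # inverse of base64.b64encode(lzma.compress(raw.encode()))
        return lzma.decompress(base64.b64decode(stored_blob)).decode('utf-8')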
From 1c78166794c7006c036d2c7d559390dcefdb76f4 Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Sun, 14 Jul 2019 11:41:44 -0700
Subject: [PATCH 143/178] Switched to using postprocessor module

---
 outputs/json_output.py       | 16 ----------------
 pastehunter.py               | 11 ++++++++---
 postprocess/post_compress.py | 26 ++++++++++++++++++++++++++
 settings.json.sample         |  6 +++++-
 4 files changed, 39 insertions(+), 20 deletions(-)
 create mode 100644 postprocess/post_compress.py

diff --git a/outputs/json_output.py b/outputs/json_output.py
index 1254b06..d834e6c 100644
--- a/outputs/json_output.py
+++ b/outputs/json_output.py
@@ -27,22 +27,6 @@ def store_paste(self, paste_data):
     def store_paste(self, paste_data):
         if not config['outputs']['json_output']['store_raw']:
             del paste_data['raw_paste']
-        elif config['outputs']['json_output']['compress_raw']:
-            original = paste_data['raw_paste']
-            orig_size = len(original.encode())
-            logger.debug("Compressing paste... Pre-compression size: {0}".format(orig_size))
-            compressed = base64.b64encode(lzma.compress(paste_data['raw_paste'].encode()))
-            compressed_size = len(compressed)
-            logger.debug("Compressing paste... Post-compression size: {0}".format(compressed_size))
-
-            # In some cases compressed blobs may be larger
-            # if not much data is compressed
-            if orig_size > compressed_size:
-                paste_data['raw_paste'] = compressed.decode('utf-8')
-                logger.debug("Compressed data smaller than original blob. Keeping compressed.")
-            else:
-                logger.debug("Original smaller than compressed blob. Keeping original.")
-
         if self.test:
             json_file = os.path.join(self.json_path, str(paste_data['pasteid']))

diff --git a/pastehunter.py b/pastehunter.py
index bf08c85..d360874 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -270,10 +270,15 @@ def paste_scanner():
             sha256 = hashlib.sha256(encoded_paste_data).hexdigest()
             paste_data['MD5'] = md5
             paste_data['SHA256'] = sha256
-            paste_data['raw_paste'] = raw_paste_data
+            # It is possible a post module modified or set this field.
+            if not paste_data.get('raw_paste'):
+                paste_data['raw_paste'] = raw_paste_data
+                paste_data['size'] = len(raw_paste_data)
+            else:
+                # Set size based on modified value
+                paste_data['size'] = len(paste_data['raw_paste'])
+
             paste_data['YaraRule'] = results
-            # Set the size for all pastes - This will override any size set by the source
-            paste_data['size'] = len(raw_paste_data)
             for output in outputs:
                 try:
                     output.store_paste(paste_data)

diff --git a/postprocess/post_compress.py b/postprocess/post_compress.py
new file mode 100644
index 0000000..645ac5e
--- /dev/null
+++ b/postprocess/post_compress.py
@@ -0,0 +1,26 @@
+import lzma
+import base64
+import logging
+from common import parse_config
+logger = logging.getLogger('pastehunter')
+config = parse_config()
+
+def run(results, raw_paste_data, paste_object):
+    if config['outputs']['json_output']['store_raw']:
+        original = raw_paste_data
+        orig_size = len(original.encode())
+        logger.debug("Compressing paste... Pre-compression size: {0}".format(orig_size))
+        compressed = base64.b64encode(lzma.compress(raw_paste_data.encode()))
+        compressed_size = len(compressed)
+        logger.debug("Compressing paste... Post-compression size: {0}".format(compressed_size))
+
+        # In some cases compressed blobs may be larger
+        # if not much data is compressed
+        if orig_size > compressed_size:
+            paste_object['raw_paste'] = compressed.decode('utf-8')
+            logger.debug("Compressed data smaller than original blob. Keeping compressed.")
+        else:
+            logger.debug("Original smaller than compressed blob. Keeping original.")
+
+    # Regardless of modification, return the paste object
+    return paste_object

diff --git a/settings.json.sample b/settings.json.sample
index cc3c0b5..8006740 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -64,7 +64,6 @@
         "classname": "JsonOutput",
         "output_path": "logs/json/",
         "store_raw": true,
-        "compress_raw": true,
         "encode_raw": true
     },
     "csv_output": {
@@ -160,6 +159,11 @@
             "enabled": false,
             "module": "postprocess.post_entropy",
             "rule_list": ["ALL"]
+        },
+        "post_compress": {
+            "enabled": true,
+            "module": "postprocess.post_compress",
+            "rule_list": ["ALL"]
         }
     }
 }
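The postprocess contract this patch moves the compression behind is small; a sketch of
a module that would plug into the same hook (the field it sets is purely illustrative):

    import logging

    logger = logging.getLogger('pastehunter')

    def run(results, raw_paste_data, paste_object):
        # results: list of matched Yara rule names for this paste
        # raw_paste_data: the unmodified paste body
        # paste_object: metadata dict; return it, modified or not
        paste_object['line_count'] = raw_paste_data.count('\n') + 1  # illustrative field
        return paste_object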
From 66bbc3cc0957c89295c445be1946c2310ad964b8 Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Sun, 14 Jul 2019 11:45:09 -0700
Subject: [PATCH 144/178] Updated docs

---
 docs/outputs.rst     | 1 -
 docs/postprocess.rst | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/outputs.rst b/docs/outputs.rst
index 59e25d0..f3ca3a4 100644
--- a/docs/outputs.rst
+++ b/docs/outputs.rst
@@ -27,7 +27,6 @@ This output module will store each paste in a json file on disk. The name of the
 
 - **output_path**: Path on disk to store output files.
 - **store_raw**: Include the raw paste in the json file. False just stores metadata.
-- **compress_raw**: Compresses the data using LZMA if it will reduce the size. Can be decompressed by base64-decoding, then using the `xz command `_.
 - **encode_raw**: Ignored, Reserved for future usage.
 
 CSV

diff --git a/docs/postprocess.rst b/docs/postprocess.rst
index 6a90285..1a50062 100644
--- a/docs/postprocess.rst
+++ b/docs/postprocess.rst
@@ -51,3 +51,10 @@ Entropy
 This postprocess module calculates shannon entropy on the raw paste data. This can be used to help identify binary and encoded or encrypted data.
 
 - **rule_list**: List of rules that will trigger the postprocess module.
+
+Compress
+--------
+Compresses the data using LZMA (lossless compression) if it will reduce the size. Small pastes or pastes that don't benefit from compression will not be affected by this module.
+Its outputs can be decompressed by base64-decoding, then using the `xz command `_.
+
+- **rule_list**: List of rules that will trigger the postprocess module.
\ No newline at end of file
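The Entropy section above refers to shannon entropy over the raw paste body; a minimal
sketch of that calculation (not the module's exact code):

    import math
    from collections import Counter

    def shannon_entropy(data):
        # bits per character; low for repetitive text, approaching
        # log2(alphabet size) for uniformly random data
        if not data:
            return 0.0
        total = len(data)
        return -sum((count / total) * math.log2(count / total)
                    for count in Counter(data).values())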
From 31a536eb351237904434844c509de1f27c0f2221 Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Sat, 3 Aug 2019 21:52:16 +0100
Subject: [PATCH 145/178] reduce false positives in powershell rules

---
 YaraRules/powershell.yar | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/YaraRules/powershell.yar b/YaraRules/powershell.yar
index 09681a7..a9f6cce 100644
--- a/YaraRules/powershell.yar
+++ b/YaraRules/powershell.yar
@@ -10,22 +10,25 @@ rule powershell
         reference = "https://github.com/kevthehermit/PasteHunter"
 
     strings:
-        $a1 = "powershell" nocase
-        $a2 = "IEX" nocase
-        $a3 = "new-object" nocase
-        $a4 = "webclient" nocase
-        $a5 = "downloadstring" nocase
-        $a6 = "-WindowStyle Hidden" nocase
-        $a7 = "invoke" nocase
-        $a8 = "bitsadmin" nocase
-        $a9 = "certutil -decode" nocase
-        $a10 = "hidden" nocase
-        $a11 = "nop" nocase
-        $a12 = "-e" nocase
+        $a1 = "powershell" fullword wide ascii nocase
+        $a2 = "IEX" fullword wide ascii nocase
+        $a3 = "new-object" fullword wide ascii nocase
+        $a4 = "webclient" fullword wide ascii nocase
+        $a5 = "downloadstring" fullword wide ascii nocase
+        $a6 = "-WindowStyle Hidden" fullword wide ascii nocase
+        $a7 = "invoke" fullword wide ascii nocase
+        $a8 = "bitsadmin" fullword wide ascii nocase
+        $a9 = "certutil -decode" fullword wide ascii nocase
+        $a10 = "hidden" fullword wide ascii nocase
+        $a11 = "nop" fullword wide ascii nocase
+        $a12 = "Invoke-" fullword wide ascii nocase
+        $a13 = "FromBase64String(" fullword wide ascii nocase
+
+
         $not1 = "chocolatey" nocase
         $not2 = "XmlConfiguration is now operational" nocase
 
     condition:
         4 of ($a*) and not any of ($not*)
 
-}
\ No newline at end of file
+}

From 64dc9089db2e518df4e5a39041dbb22bb3c8ce02 Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Sat, 3 Aug 2019 21:52:26 +0100
Subject: [PATCH 146/178] add discord webhooks to api rules

---
 YaraRules/api_keys.yar | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/YaraRules/api_keys.yar b/YaraRules/api_keys.yar
index 1f4efd4..6b02f64 100644
--- a/YaraRules/api_keys.yar
+++ b/YaraRules/api_keys.yar
@@ -104,3 +104,17 @@ rule heroku_api
     condition:
         any of them
 }
+
+rule discord_api
+{
+    meta:
+        author = "@ntddk"
+        info = "Part of PasteHunter"
+        reference = "https://github.com/kevthehermit/PasteHunter"
+
+    strings:
+        $a = "DiscordRelay.BotToken" nocase
+        $b = "discordapp.com/api/webhooks" nocase
+    condition:
+        any of them
+}

From 035658140cbec2bbf0321d712fe66716f8ecf3e8 Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Sat, 3 Aug 2019 21:52:36 +0100
Subject: [PATCH 147/178] some basic aws cli rules

---
 YaraRules/aws.yar | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 YaraRules/aws.yar

diff --git a/YaraRules/aws.yar b/YaraRules/aws.yar
new file mode 100644
index 0000000..bc7324e
--- /dev/null
+++ b/YaraRules/aws.yar
@@ -0,0 +1,36 @@
+rule aws_cli
+{
+    meta:
+        author = "@KevTheHermit"
+        info = "Part of PasteHunter"
+        reference = "https://github.com/kevthehermit/PasteHunter"
+
+    strings:
+        $a1 = "aws s3 " ascii
+        $a2 = "aws ec2 " ascii
+        $a3 = "aws ecr " ascii
+        $a4 = "aws cognito-identity" ascii
+        $a5 = "aws iam " ascii
+        $a6 - "aws waf " ascii
+
+    condition:
+        any of them
+
+}
+
+rule sw_bucket
+{
+    meta:
+        author = "@KevTheHermit"
+        info = "Part of PasteHunter"
+        reference = "https://github.com/kevthehermit/PasteHunter"
+
+    strings:
+        $a1 = "s3.amazonaws.com" ascii
+
+    condition:
+        any of them
+
+
+
+}
From d79b852c07bd7893199590cd619061e18b6fc391 Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Sat, 3 Aug 2019 21:55:09 +0100
Subject: [PATCH 148/178] Fix typo in aws rule

---
 YaraRules/aws.yar | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/YaraRules/aws.yar b/YaraRules/aws.yar
index bc7324e..37ef230 100644
--- a/YaraRules/aws.yar
+++ b/YaraRules/aws.yar
@@ -8,10 +8,10 @@ rule aws_cli
     strings:
         $a1 = "aws s3 " ascii
         $a2 = "aws ec2 " ascii
-        $a3 = "aws ecr " ascii
-        $a4 = "aws cognito-identity" ascii
-        $a5 = "aws iam " ascii
-        $a6 - "aws waf " ascii
+        $a3 = "aws ecr " ascii
+        $a4 = "aws cognito-identity" ascii
+        $a5 = "aws iam " ascii
+        $a6 = "aws waf " ascii
 
     condition:
         any of them

From 4872ae5dc24c34aae62bbcc69f53471b8f48201e Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Sat, 3 Aug 2019 21:55:33 +0100
Subject: [PATCH 149/178] add some more error logging to requests

---
 pastehunter.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pastehunter.py b/pastehunter.py
index bf08c85..d53485b 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -181,6 +181,11 @@ def paste_scanner():
         except requests.exceptions.SSLError as e:
             logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
             raw_paste_data = ""
+
+        # General Exception
+        except Exception as e:
+            logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
+            raw_paste_data = ""
 
         # Pastebin Cache
         if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.":
@@ -194,6 +199,12 @@ def paste_scanner():
             except requests.exceptions.SSLError as e:
                 logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
                 raw_paste_data = ""
+
+            # General Exception
+            except Exception as e:
+                logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
+                raw_paste_data = ""
+
 
         # Process the paste data here
         try:

From d74986a8fedf b0500b2b143accb07b20aabc6827 Mon Sep 17 00:00:00 2001
From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Wed, 28 Aug 2019 08:27:55 -0400
Subject: [PATCH 150/178] Set up a root logger

Some frameworks, like Splunk, check to see if a root logger is set and
if not then they proceed to modify the logging. Without this patch,
including the splunk sdk would cause double logging to happen.

This change simply sets up a logging hierarchy with the root logger
actually configured. There should be no difference to the output or
configuration of the pastehunter logging.
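The hierarchy in question, reduced to its essentials (a sketch, not the patch itself):

    import logging

    root = logging.getLogger()                 # handlers/formatting live on the root
    root.addHandler(logging.StreamHandler())

    logger = logging.getLogger('pastehunter')  # named child logger
    logger.setLevel(logging.INFO)
    logger.info('hello')  # propagates up and is emitted once by the root handler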
---
 pastehunter.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pastehunter.py b/pastehunter.py
index d53485b..a6c0f6e 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -24,13 +24,15 @@
 VERSION = 1.0
 
 # Setup Default logging
-logger = logging.getLogger('pastehunter')
-logger.setLevel(logging.INFO)
+root = logging.getLogger()
 ch = logging.StreamHandler()
 ch.setLevel(logging.DEBUG)
 formatter = logging.Formatter('%(levelname)s:%(filename)s:%(message)s')
 ch.setFormatter(formatter)
-logger.addHandler(ch)
+root.addHandler(ch)
+
+logger = logging.getLogger('pastehunter')
+logger.setLevel(logging.INFO)
 
 # Version info
 logger.info("Starting PasteHunter Version: {0}".format(VERSION))

From eef872c8d6894d3d66603eed4ad495df21436897 Mon Sep 17 00:00:00 2001
From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Wed, 28 Aug 2019 08:28:21 -0400
Subject: [PATCH 151/178] Add splunk support

---
 outputs/splunk_output.py | 42 ++++++++++++++++++++++++++++++++++++
 requirements.txt         |  1 +
 settings.json.sample     | 11 +++++++++++
 3 files changed, 54 insertions(+)
 create mode 100644 outputs/splunk_output.py

diff --git a/outputs/splunk_output.py b/outputs/splunk_output.py
new file mode 100644
index 0000000..4d7745c
--- /dev/null
+++ b/outputs/splunk_output.py
@@ -0,0 +1,42 @@
+from common import parse_config
+from datetime import datetime
+import json
+import logging
+import splunklib.client as client
+
+logger = logging.getLogger('pastehunter')
+config = parse_config()
+
+class SplunkOutput():
+    def __init__(self):
+        # Set up the database connection
+        splunk_host = config['outputs']['splunk_output']['splunk_host']
+        splunk_port = config['outputs']['splunk_output']['splunk_port']
+        splunk_user = config['outputs']['splunk_output']['splunk_user']
+        splunk_pass = config['outputs']['splunk_output']['splunk_pass']
+        self.splunk_index = config['outputs']['splunk_output']['splunk_index']
+
+        try:
+            self.service = client.connect(
+                host=splunk_host,
+                port=splunk_port,
+                username=splunk_user,
+                password=splunk_pass,
+                autologin=True)
+
+            self.index = self.service.indexes[self.splunk_index]
+        except Exception as e:
+            logger.error(e)
+            raise Exception('Unable to connect or missing index') from None
+
+    def store_paste(self, paste_data):
+        # Make a copy so we don't affect any other output modules
+        local_data = dict(paste_data)
+        if not config['outputs']['splunk_output']['store_raw']:
+            del local_data['raw_paste']
+
+        try:
+            # The edit_tcp capability is required to access this API
+            self.index.submit(json.dumps(local_data), sourcetype="pastehunter")
+        except Exception as e:
+            logger.error('Error submitting paste_data to splunk')

diff --git a/requirements.txt b/requirements.txt
index a7274c9..8ed65b5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 requests>=2.20.0
 elasticsearch>=5.0.0,<6.0.0
+splunk-sdk
 yara-python
\ No newline at end of file

diff --git a/settings.json.sample b/settings.json.sample
index ac19b9a..02fa620 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -58,6 +58,17 @@
         "elastic_ssl": false,
         "weekly_index": true
     },
+    "splunk_output": {
+        "enabled": false,
+        "module": "outputs.splunk_output",
+        "classname": "SplunkOutput",
+        "splunk_host": "host",
+        "splunk_port": 8089,
+        "splunk_user": "user",
+        "splunk_pass": "pass",
+        "splunk_index": "paste-test",
+        "store_raw": true
+    },
     "json_output": {
         "enabled": false,
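A quick way to eyeball what this output module stored, assuming the sample settings
above (a sketch using splunklib's one-shot search, not part of PasteHunter itself):

    import splunklib.client as client
    import splunklib.results as results

    service = client.connect(host='host', port=8089,
                             username='user', password='pass')
    # pull back a handful of recently indexed pastes
    job = service.jobs.oneshot('search index=paste-test sourcetype=pastehunter | head 5')
    for event in results.ResultsReader(job):
        print(event)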
From b3650442df674961b8bf6bf1b6500e622b4f4dc8 Mon Sep 17 00:00:00 2001
From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Tue, 3 Sep 2019 14:45:27 -0400
Subject: [PATCH 152/178] Update output documentation

---
 docs/outputs.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/outputs.rst b/docs/outputs.rst
index f3ca3a4..8d12262 100644
--- a/docs/outputs.rst
+++ b/docs/outputs.rst
@@ -20,6 +20,17 @@ Elasticsearch was the default output. Storing all pastes and using Kibana as a g
 - **elastic_pass**: Password if using xpack / shield or basic auth.
 - **elastic_ssl**: True or false if Elasticsearch is served over SSL.
 
+Splunk
+-------------
+Splunk output is similar to Elasticsearch. All the data is put into Splunk, which can then be used as a graphical frontend and for querying.
+
+- **splunk_host**: Hostname or IP of your Splunk instance.
+- **splunk_port**: The Splunk management port. (Usually port 8089)
+- **splunk_user**: Username of your Splunk user.
+- **splunk_pass**: Password for your Splunk user.
+- **splunk_index**: The name of the Splunk index to store the data in.
+- **store_raw**: Include the raw paste in the data sent to Splunk.
+
 JSON
 ----
From fc883db195a6529f6d7e1327fd3dff15f0fa3b4d Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Sun, 15 Sep 2019 14:45:23 -0700
Subject: [PATCH 153/178] Initial github support

---
 README.md            |   3 +-
 inputs/github.py     | 127 +++++++++++++++++++++++++++++++++++++++
 settings.json.sample |   9 +++
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 inputs/github.py

diff --git a/README.md b/README.md
index 88f9b0b..84d0e7d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,8 @@ For setup instructions please see the official documentation https://pastehunter
 ## Supported Inputs
 Pastehunter currently has support for the following sites:
  - pastebin.com
- - gist.github.com
+ - gist.github.com   # Gists
+ - github.com        # Public commit activity feed
  - slexy.org
  - stackexchange     # There are about 176!

diff --git a/inputs/github.py b/inputs/github.py
new file mode 100644
index 0000000..ca3b841
--- /dev/null
+++ b/inputs/github.py
@@ -0,0 +1,127 @@
+import logging
+import math
+from datetime import datetime
+
+import requests
+
+# Future work/improvement that can happen here: support PR diffs, they contain a patch URL
+# Set some logging options
+logger = logging.getLogger('pastehunter')
+logging.getLogger('requests').setLevel(logging.ERROR)
+
+api_uri = 'https://api.github.com/events'
+# This event refers to a commit being pushed, and is
+# probably the most significant thing we're concerned about.
+event_types = ['PushEvent']
+api_version = 'application/vnd.github.v3+json'  # Set Accept header to force api v3
+# Important note from github:
+# 'We delay the public events feed by five minutes, which means the most recent event returned by the public events API actually occurred at least five minutes ago.'
+
+# Beware, git diffs can sometimes be very large files, including binaries and zips.
+#                   MB     KB     B
+diff_size_limit = 500 * 1000 * 1000
+
+
+def _make_request(url, headers):
+    req = requests.get(url, headers=headers)
+    reset_date = datetime.utcfromtimestamp(float(req.headers['X-RateLimit-Reset'])).isoformat()
+    logger.info('Remaining Limit: {0}. Resets at {1}'.format(req.headers['X-RateLimit-Remaining'],
+                                                             reset_date))
+
+    if req.status_code == 200:
+        return req.json()
+
+    if req.status_code == 401:
+        logger.error('Auth Failed')
+        return None
+
+    elif req.status_code == 403:
+        logger.error('Login Attempts Exceeded')
+        return None
+
+
+def recent_pastes(conf, input_history):
+    oauth_token = conf['inputs']['github']['api_token']
+    conf_limit = conf['inputs']['github']['api_limit']
+    gh_limit = min(conf_limit, 300)
+    # From GitHub Docs (https://developer.github.com/v3/activity/events/#list-public-events):
+    # Events support pagination, however the per_page option is unsupported. The fixed page size is 30 items. Fetching up to ten pages is supported, for a total of 300 events.
+    # We modify this to be 100 per page, but the limit is still 300.
+    if gh_limit != conf_limit:
+        logger.warning('api_limit exceeds the 300 events available from the public feed. Limiting to 300.')
+    headers = {'user-agent': 'PasteHunter',
+               'Accept': api_version,
+               'Authorization': 'token {0}'.format(oauth_token)}
+
+    # calculate number of pages
+    page_count = int(math.ceil(gh_limit / 100))
+
+    result_pages = []
+    history = []
+    paste_list = []
+
+    gh_file_blacklist = conf['inputs']['github']['file_blacklist']
+    gh_user_blacklist = conf['inputs']['github']['user_blacklist']
+
+    try:
+        # Get the required amount of entries via pagination
+        for page_num in range(1, page_count + 1):
+            url = '{0}?page={1}&per_page=100'.format(api_uri, page_num)
+            logger.debug('Fetching page: {0}'.format(page_num))
+            req = _make_request(url, headers)
+            if req is not None:
+                result_pages.append(req)
+
+        # Parse results
+
+        for page in result_pages:
+            for event_meta in page:
+                # Track paste ids to prevent dupes
+                event_id = event_meta['id']
+                history.append(event_id)
+                if event_id in input_history:
+                    continue
+                if event_meta['type'] not in event_types:
+                    logger.debug('Skipping event {} due to unwanted type "{}"'.format(event_id, event_meta['type']))
+                # Actor may have been deleted or changed
+                if 'actor' in event_meta:
+                    # If the username is None, this will return false, while event_meta['login'] would error.
+                    if event_meta.get('actor').get('login') in gh_user_blacklist:
+                        logger.info('Blacklisting GitHub event from user: {0}'.format(event_meta['actor'].get('login')))
+                        continue
+                payload = event_meta.get('payload')
+                if not 'commits' in payload:
+                    logger.info('Skipping event {} due to no commits. Skipping!'.format(event_id))
+                    continue
+                for commit_meta in payload.get('commits'):
+                    commit_url = commit_meta.get('url')
+                    commit_data = _make_request(commit_url, headers)
+                    if commit_data.get('committer') and commit_data.get('committer').get('login') in gh_user_blacklist:
+                        logger.info('Blacklisting GitHub event from user: {0}'.format(commit_data['committer']['login']))
+                        continue
+                    for file in commit_data.get('files'):
+                        file_path = file.get('filename')
+                        file_name = file_path
+                        # Convert path -> filename
+                        if '/' in file_name:
+                            file_name = file_name.split('/')[-1]
+
+                        if file_name in gh_file_blacklist:
+                            logger.info('Blacklisting file {0} from event {1}'.format(file_name, event_id))
+                            continue
+
+                        gist_data = file
+                        gist_data['confname'] = 'github'
+                        gist_data['@timestamp'] = event_meta['created_at']
+                        gist_data['pasteid'] = event_id
+                        gist_data['user'] = event_meta.get('actor').get('login')
+                        gist_data['pastesite'] = 'github.com'
+                        gist_data['scrape_url'] = file.get('raw_url')
+                        # remove some original keys just to keep it a bit cleaner
+                        del gist_data['raw_url']
+                        paste_list.append(gist_data)
+
+        # Return results and history
+        return paste_list, history
+    except Exception as e:
+        logger.error('Unable to parse paste results: {0}'.format(e))
+        return paste_list, history

diff --git a/settings.json.sample b/settings.json.sample
index ac19b9a..8b8e18d 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -26,6 +26,15 @@
         "user_blacklist": [],
         "file_blacklist": ["grahamcofborg-eval-package-list", "Changed Paths"]
     },
+    "github": {
+        "enabled": false,
+        "module": "inputs.github",
+        "api_token": "",
+        "api_limit": 100,
+        "store_all": false,
+        "user_blacklist": [],
+        "file_blacklist": []
+    },
     "slexy":{
         "enabled": true,
         "module": "inputs.slexy",
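The next two patches thread a filename external through the Yara scan; the
yara-python mechanism they rely on looks roughly like this (illustrative rule,
not one from the repo):

    import yara

    # the external must be declared at compile time with a default value...
    rules = yara.compile(source='rule dotenv { condition: filename matches /\\.env$/ }',
                         externals={'filename': ''})
    # ...and can then be set per scan
    print(rules.match(data='DB_USERNAME=admin', externals={'filename': 'prod/.env'}))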
From 35970ead32cafca3ea42ff481d7233c7468275dd Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Sun, 15 Sep 2019 14:51:03 -0700
Subject: [PATCH 154/178] Added support for filenames in rules

---
 pastehunter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pastehunter.py b/pastehunter.py
index d53485b..2d61803 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -208,7 +208,7 @@ def paste_scanner():
         # Process the paste data here
         try:
             # Scan with yara
-            matches = rules.match(data=raw_paste_data)
+            matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('paste_data')})
         except Exception as e:
             logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
             continue
@@ -311,7 +311,7 @@ def paste_scanner():
         index_file = os.path.join(conf['yara']['rule_path'], 'index.yar')
         rules = yara.compile(index_file)
     except Exception as e:
-        print("Unable to Create Yara index: ", e)
+        logger.exception("Unable to Create Yara index: {0}".format(e))
         sys.exit()

From 1508a3ce400839b0ed6b625bc52cba4b560a1fd4 Mon Sep 17 00:00:00 2001
From: Dylan Katz Date: Sun, 15 Sep 2019 16:10:44 -0700
Subject: [PATCH 155/178] Added filename external, initial dork rules, added
 glob for ignore; also added default ignores to settings.json.sample

---
 YaraRules/github_dorks.yar | 251 +++++++++++++++++++++++++++
 inputs/github.py           |  10 +++++++---
 pastehunter.py             |   4 ++--
 settings.json.sample       |   2 +-
 4 files changed, 261 insertions(+), 6 deletions(-)
 create mode 100644 YaraRules/github_dorks.yar

diff --git a/YaraRules/github_dorks.yar b/YaraRules/github_dorks.yar
new file mode 100644
index 0000000..59a7c64
--- /dev/null
+++ b/YaraRules/github_dorks.yar
@@ -0,0 +1,251 @@
+/*
+    These are rules derived from 
github-dorks (https://github.com/techgaun/github-dorks) + github-dorks is under the Apache License 2.0: + https://github.com/techgaun/github-dorks/blob/master/LICENSE +*/ +rule NPMRegistry { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "NPM Registry files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "_auth" nocase + condition: + all of them and filename matches /.*\.npmrc$/is +} + +rule DockerCfg { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Docker config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "auth" nocase + condition: + all of them and filename matches /.*\.dockercfg$/is +} +rule PrivateKeys { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Private key files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "private" nocase + condition: + all of them and (filename matches /.*\.pem$/is or filename matches /\.ppk$/is + or filename matches /(\/|^)id_(r|d)sa$/is) +} +rule SQLDump { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "SQL dumps (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "mysql" nocase + $ = "dump" nocase + condition: + all of them and (filename matches /.*\.sql$/is) +} +rule S3Credentials { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "S3 Credentials (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "aws_access_key_id" nocase + condition: + filename matches /(\/|^)\.s3cfg$/is or filename matches /(\/|^)credentials$/is and all of them +} +rule WPConfig { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Wordpress config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)wp-config.php$/is +} +rule HTPasswd { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "htpasswd files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)\.htpasswd$/is +} +rule EnvFile { + meta: + author = "Dylan Katz (@Plazmaz)" + description = ".env files, Matches laravel, mailservers, and various CI and config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $db_usr = "DB_USERNAME" + $mail_host = "MAIL_HOST=smtp." 
+ $excl = "homestead" nocase + condition: + filename matches /(\/|^)\.env/is and any of ($db_usr, $mail_host) and not $excl +} +rule GitCredentials { + meta: + author = "Dylan Katz (@Plazmaz)" + description = ".git-credentials files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)\.git-credentials$/is +} +rule PivotalToken { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "PivotalTracker token (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "PT_TOKEN" + condition: + any of them +} + +rule BashProfile { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Sensitive info in profile files, specifically .bashrc and .bash_profile (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "password" nocase + $ = "mailchimp" nocase + $ = "aws" nocase + $ = "secret" nocase + condition: + filename matches /(\/|^)\.bash(rc|_profile)$/is and any of them +} +rule AmazonCredentials { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Generic AWS credentials for RDS or EC2 (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $rds = "rds.amazonaws.com" nocase + $ec2 = "ec2.amazonaws.com" nocase + $pass = "password" nocase + condition: + $pass and ($rds or $ec2) +} +rule MongoLab { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "MongoLab Credentials (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "mongolab.com" nocase + condition: + filename matches /.*(\.conf|\.yaml|\.yml|\.json)$/is and all of them +} +rule RoboMongo { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "RoboMongo Credentials (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)robomongo\.json$/is +} +rule JSForce { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Salesforce Credentials for JSForce (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "jsforce" nocase + $ = "conn.login" nocase + condition: + filename matches /.*js$/is and all of them +} +rule Salesforce { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Generic salesforce Credentials (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "SF_USERNAME" nocase + $ = "salesforce" nocase + condition: + all of them +} +rule Tugboat { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "DigitalOcean Tugboat Configurations (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "_tugboat" + condition: + filename matches /(\/|^)\.tugboat$/is and not any of them +} +rule Hub { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Hub files that contain oauth tokens (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = /oauth_token: [a-zA-Z0-9]+/ nocase + condition: + filename matches /(\/|^)hub$/is and any of them +} +rule NetRC { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Netrc files that contain 
'password' or 'key' (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "password"
+        $ = "key"
+    condition:
+        filename matches /(\/|^)\.?_?netrc/is and any of them
+}
+rule Filezilla {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Filezilla configuration files with passwords (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "Pass"
+    condition:
+        (filename matches /(\/|^)filezilla\.xml$/is or filename matches /(\/|^)recentservers.xml$/is) and any of them
+}
+rule Docker {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Docker authentication config (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "auths"
+    condition:
+        filename matches /(\/|^)config\.json$/is and any of them
+}
+rule IdeaKey {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "License Keys for IDEA IDEs (IntelliJ, PyCharm, etc) (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    condition:
+        filename matches /(\/|^)idea[0-9]{0,}\.key$/is
+}
\ No newline at end of file
diff --git a/inputs/github.py b/inputs/github.py
index ca3b841..605705a 100644
--- a/inputs/github.py
+++ b/inputs/github.py
@@ -1,5 +1,6 @@
 import logging
 import math
+import pathlib
 from datetime import datetime
 
 import requests
@@ -90,7 +91,8 @@ def recent_pastes(conf, input_history):
                     continue
                 payload = event_meta.get('payload')
                 if not 'commits' in payload:
-                    logger.info('Skipping event {} due to no commits. Skipping!'.format(event_id))
+                    # Debug, because this is high output
+                    logger.debug('Skipping event {} due to no commits.'.format(event_id))
                     continue
                 for commit_meta in payload.get('commits'):
                     commit_url = commit_meta.get('url')
@@ -104,7 +106,10 @@ def recent_pastes(conf, input_history):
                         # Convert path -> filename
                         if '/' in file_name:
                             file_name = file_name.split('/')[-1]
-
+                        for pattern in gh_file_blacklist:
+                            if pathlib.PurePath(file_path).match(pattern):
+                                file_name = pattern  # treat a glob hit like an exact blacklist hit
+                                break
                         if file_name in gh_file_blacklist:
                             logger.info('Blacklisting file {0} from event {1}'.format(file_name, event_id))
                             continue
@@ -123,5 +128,5 @@ def recent_pastes(conf, input_history):
         # Return results and history
         return paste_list, history
     except Exception as e:
-        logger.error('Unable to parse paste results: {0}'.format(e))
+        logger.exception('Unable to parse paste results: {0}'.format(e))
         return paste_list, history
diff --git a/pastehunter.py b/pastehunter.py
index 2d61803..c48ce7a 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -208,7 +208,7 @@ def paste_scanner():
         # Process the paste data here
         try:
             # Scan with yara
-            matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('paste_data')})
+            matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('filename')})
         except Exception as e:
             logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e))
             continue
@@ -309,7 +309,7 @@ def paste_scanner():
 
         # Compile the yara rules we will use to match pastes
         index_file = os.path.join(conf['yara']['rule_path'], 'index.yar')
-        rules = yara.compile(index_file)
+        rules = yara.compile(index_file, externals={'filename': ''})
     except Exception as e:
         logger.exception("Unable to Create Yara index: {0}".format(e))
         sys.exit()
diff --git a/settings.json.sample b/settings.json.sample
--- a/settings.json.sample
+++ b/settings.json.sample @@ -33,7 +33,7 @@ "api_limit": 100, "store_all": false, "user_blacklist": [], - "file_blacklist": [] + "file_blacklist": ["node_modules/*", "__pycache__/*", "*/grahamcofborg-eval-package-list", "*/yarn.lock", "*.3ds", "*.3g2", "*.3gp", "*.7z", "*.DS_Store", "*.a", "*.aac", "*.adp", "*.ai", "*.aif", "*.aiff", "*.alz", "*.ape", "*.apk", "*.ar", "*.arj", "*.asf", "*.au", "*.avi", "*.bak", "*.bh", "*.bin", "*.bk", "*.bmp", "*.btif", "*.bz2", "*.bzip2", "*.cab", "*.caf", "*.cgm", "*.class", "*.cmx", "*.cpio", "*.cr2", "*.csv", "*.cur", "*.dat", "*.deb", "*.dex", "*.djvu", "*.dll", "*.dmg", "*.dng", "*.doc", "*.docm", "*.docx", "*.dot", "*.dotm", "*.dra", "*.dsk", "*.dts", "*.dtshd", "*.dvb", "*.dwg", "*.dxf", "*.ecelp4800", "*.ecelp7470", "*.ecelp9600", "*.egg", "*.eol", "*.eot", "*.epub", "*.exe", "*.f4v", "*.fbs", "*.fh", "*.fla", "*.flac", "*.fli", "*.flv", "*.fpx", "*.fst", "*.fvt", "*.g3", "*.gif", "*.graffle", "*.gz", "*.gzip", "*.h261", "*.h263", "*.h264", "*.ico", "*.ief", "*.img", "*.ipa", "*.iso", "*.jar", "*.jpeg", "*.jpg", "*.jpgv", "*.jpm", "*.jxr", "*.key", "*.ktx", "*.lha", "*.lvp", "*.lz", "*.lzh", "*.lzma", "*.lzo", "*.m3u", "*.m4a", "*.m4v", "*.mar", "*.mdi", "*.mht", "*.mid", "*.midi", "*.mj2", "*.mka", "*.mkv", "*.mmr", "*.mng", "*.mobi", "*.mov", "*.movie", "*.mp3", "*.mp4", "*.mp4a", "*.mpeg", "*.mpg", "*.mpga", "*.mxu", "*.nef", "*.npx", "*.numbers", "*.o", "*.oga", "*.ogg", "*.ogv", "*.otf", "*.pages", "*.pbm", "*.pcx", "*.pdf", "*.pea", "*.pgm", "*.pic", "*.png", "*.pnm", "*.pot", "*.potm", "*.potx", "*.ppa", "*.ppam", "*.ppm", "*.pps", "*.ppsm", "*.ppsx", "*.ppt", "*.pptm", "*.pptx", "*.psd", "*.pya", "*.pyc", "*.pyo", "*.pyv", "*.qt", "*.rar", "*.ras", "*.raw", "*.rgb", "*.rip", "*.rlc", "*.rmf", "*.rmvb", "*.rtf", "*.rz", "*.s3m", "*.s7z", "*.scpt", "*.sgi", "*.shar", "*.sil", "*.sketch", "*.slk", "*.smv", "*.so", "*.sub", "*.swf", "*.tar", "*.tbz", "*.tbz2", "*.tga", "*.tgz", "*.thmx", "*.tif", "*.tiff", "*.tlz", "*.ttc", "*.ttf", "*.txz", "*.udf", "*.uvh", "*.uvi", "*.uvm", "*.uvp", "*.uvs", "*.uvu", "*.viv", "*.vob", "*.war", "*.wav", "*.wax", "*.wbmp", "*.wdp", "*.weba", "*.webm", "*.webp", "*.whl", "*.wim", "*.wm", "*.wma", "*.wmv", "*.wmx", "*.woff", "*.woff2", "*.wvx", "*.xbm", "*.xif", "*.xla", "*.xlam", "*.xls", "*.xlsb", "*.xlsm", "*.xlsx", "*.xlt", "*.xltm", "*.xltx", "*.xm", "*.xmind", "*.xpi", "*.xpm", "*.xwd", "*.xz", "*.z", "*.zip", "*.zipx"] }, "slexy":{ "enabled": true, From 7464eea5e51f0d69fce67d30038efbb4e5046548 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sun, 15 Sep 2019 16:11:14 -0700 Subject: [PATCH 156/178] Increased timeout (github can return large files) --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index c48ce7a..cdae07d 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -155,7 +155,7 @@ def paste_scanner(): sleep(0.5) else: paste_data = q.get() - with timeout(seconds=5): + with timeout(seconds=10): # Start a timer start_time = time.time() logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) From 1260ebad07e3212f4421a6eb7dd661c0631e975b Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sun, 15 Sep 2019 16:59:49 -0700 Subject: [PATCH 157/178] Added remaining dorks, added some error cases --- YaraRules/github_dorks.yar | 300 +++++++++++++++++++++++++++++++++++++ YaraRules/index.yar | 16 +- inputs/github.py | 3 + pastehunter.py | 7 +- 4 files changed, 318 insertions(+), 8 deletions(-) diff --git 
a/YaraRules/github_dorks.yar b/YaraRules/github_dorks.yar index 59a7c64..227e83b 100644 --- a/YaraRules/github_dorks.yar +++ b/YaraRules/github_dorks.yar @@ -248,4 +248,304 @@ rule IdeaKey { date = "09/15/19" condition: filename matches /(\/|^)idea[0-9]{0,}\.key$/is +} +rule DBServers { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Database servers (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)connections\.xml$/is + or filename matches /(\/|^)\.pgpass$/is +} +rule Proftpd { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Proftpd configuration files created by cpanel (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)proftpdpasswd$/is +} +rule Ventrilo { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Ventrilo server configuration files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)ventrilo_srv\.ini/is +} +rule WinFrameClient { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "WinFrame-Client configuration used to connect to Citrix Application Servers (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "[WFClient] Password=" + condition: + all of them and filename matches /.*\.ica/is +} +rule CounterStrikeRCON { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "RCON Credentials for CounterStrike servers (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "rcon" nocase + $ = "password" nocase + condition: + all of them and filename matches /(\/|^)server\.cfg/is +} +rule JekyllGitHub { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Jekyll Token for GitHub (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "JEKYLL_GITHUB_TOKEN" nocase + condition: + all of them +} +rule SshdConfig { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "SSHD config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)sshd_config/is +} +rule DhcpConfig { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "DHCP Config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)dhcpd\.conf/is +} +rule Phoenix { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Phoenix prod config and secret files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "prod.secret.exs" + condition: + filename matches /(\/|^)prod\.secret\.exs/is or (filename matches /(\/|^)prod\.exs/is and not any of them) +} +rule JoomlaConfig { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Joomla config files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "JConfig" nocase + $ = "password" nocase + condition: + filename matches /(\/|^)configuration.php/is and all of them +} +rule PasswdFile { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Unix /etc/passwd 
files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "/bin/bash" nocase
+        $ = "/bin/sh" nocase
+        $ = "/usr/sbin/nologin" nocase
+    condition:
+        filename matches /(\/|^)passwd$/is and any of them
+}
+rule ShadowFile {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Unix /etc/shadow files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = ":17737:0:99999" nocase
+        $ = "root:*:" nocase
+        // MD5
+        $ = "$1" nocase
+        // SHA-256
+        $ = "$5" nocase
+        // SHA-512
+        $ = "$6" nocase
+    condition:
+        filename matches /(\/|^)shadow$/is and any of them
+}
+rule Shodan {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Shodan API Keys (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = /shodan_api_key: [a-zA-Z0-9]+/ nocase
+        $ = /shodan_api_key=[a-zA-Z0-9]+/ nocase
+    condition:
+        any of them
+}
+rule Avast {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Avast license files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "support.avast.com" nocase
+    condition:
+        all of them and (filename matches /.*\.avastlic$/is)
+}
+rule DBeaver {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "DBeaver configuration files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    condition:
+        filename matches /(\/|^)dbeaver-data-sources.xml$/is
+}
+rule ESmtp {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = ".esmtprc files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "password" nocase
+    condition:
+        filename matches /(\/|^)\.esmtprc$/is and all of them
+}
+rule Homebrew {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Homebrew github tokens (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "HOMEBREW_GITHUB_API_TOKEN" nocase
+    condition:
+        all of them
+}
+rule MLab {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "MLab mongodb credentials (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = ".mlab.com" nocase
+        $ = "password" nocase
+    condition:
+        all of them
+}
+rule Firefox {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Firefox saved passwords (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    condition:
+        filename matches /(\/|^)logins\.json$/is
+}
+rule CCCam {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "CCCam server config files (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    condition:
+        filename matches /(\/|^)CCCam\.cfg$/is
+}
+rule IRC {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Nickserv auth configs (Created as part of PasteHunter)"
+        reference = "https://github.com/techgaun/github-dorks"
+        date = "09/15/19"
+    strings:
+        $ = "msg nickserv identify" nocase
+    condition:
+        filename matches /(\/|^)config$/is and all of them
+}
+rule Django {
+    meta:
+        author = "Dylan Katz (@Plazmaz)"
+        description = "Django secret keys (Created as part of PasteHunter)"
+        reference = 
"https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "SECRET_KEY" nocase + condition: + filename matches /(\/|^)settings.py$/is and all of them +} +rule RailsSecrets { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Ruby on rails secrets.yml files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "password" nocase + condition: + filename matches /(\/|^)secrets\.yml$/is and all of them +} +rule RailsMasterKey { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Rails master key files (used for decrypting credentials.yml.enc for Rails 5.2+) (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + strings: + $ = "password" nocase + condition: + filename matches /(\/|^)config\/master\.key$/is and all of them +} +rule AtomDeployments { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Multiple files created by different atom extensions for authentication (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)deployment-config\.json$/is or + filename matches /(\/|^)remote-sync\.json$/is or + filename matches /(\/|^)\.ftpconfig$/is +} +rule VscodeSftp { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "VSCode SFTP files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)\.vscode\/sftp\.json$/is +} +rule SublimeSftp { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Sublime SFTP files (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)sftp-config\.json$/is +} +rule JetbrainsCreds { + meta: + author = "Dylan Katz (@Plazmaz)" + description = "Jetbrains IDE webserver credentials with encoded passwords (Created as part of PasteHunter)" + reference = "https://github.com/techgaun/github-dorks" + date = "09/15/19" + condition: + filename matches /(\/|^)WebServers\.xml$/is } \ No newline at end of file diff --git a/YaraRules/index.yar b/YaraRules/index.yar index 2319b39..d09326e 100644 --- a/YaraRules/index.yar +++ b/YaraRules/index.yar @@ -1,12 +1,14 @@ -include "powershell.yar" include "api_keys.yar" -include "email_filter.yar" -include "blacklist.yar" +include "aws.yar" include "base64.yar" -include "database.yar" +include "blacklist.yar" +include "certificates.yar" +include "core_keywords.yar" include "CryptoExchangeApi.yar" +include "database.yar" +include "email_filter.yar" +include "general.yar" +include "github_dorks.yar" include "hak5.yar" -include "core_keywords.yar" include "password_leak.yar" -include "general.yar" -include "certificates.yar" +include "powershell.yar" diff --git a/inputs/github.py b/inputs/github.py index 605705a..251ee3e 100644 --- a/inputs/github.py +++ b/inputs/github.py @@ -97,6 +97,9 @@ def recent_pastes(conf, input_history): for commit_meta in payload.get('commits'): commit_url = commit_meta.get('url') commit_data = _make_request(commit_url, headers) + if not commit_data: + logger.info('No data returned for url {}. 
Skipping...'.format(commit_url))
+                        continue
                     if commit_data.get('committer') and commit_data.get('committer').get('login') in gh_user_blacklist:
                         logger.info('Blacklisting GitHub event from user: {0}'.format(commit_data['committer']['login']))
                         continue
diff --git a/pastehunter.py b/pastehunter.py
index cdae07d..6271a73 100644
--- a/pastehunter.py
+++ b/pastehunter.py
@@ -175,7 +175,12 @@ def paste_scanner():
             else:
                 raw_paste_uri = paste_data['scrape_url']
-                raw_paste_data = requests.get(raw_paste_uri).text
+                if not raw_paste_uri:
+                    logger.info('Unable to retrieve paste, no uri found.')
+                    logger.debug(json.dumps(paste_data))
+                    raw_paste_data = ""
+                else:
+                    raw_paste_data = requests.get(raw_paste_uri).text
 
         # Cover fetch site SSLErrors
         except requests.exceptions.SSLError as e:

From 15f213faf4036a32adcf7fe4318a8f90509502b4 Mon Sep 17 00:00:00 2001
From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Sun, 22 Sep 2019 12:44:09 -0400
Subject: [PATCH 158/178] Make sourcetype configurable

---
 outputs/splunk_output.py | 5 +++--
 settings.json.sample     | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/outputs/splunk_output.py b/outputs/splunk_output.py
index 4d7745c..a825f63 100644
--- a/outputs/splunk_output.py
+++ b/outputs/splunk_output.py
@@ -37,6 +37,7 @@ def store_paste(self, paste_data):
         try:
             # The edit_tcp capability is required to access this API
-            self.index.submit(json.dumps(local_data), sourcetype="pastehunter")
+            sourcetype = config['outputs']['splunk_output']['splunk_sourcetype']
+            self.index.submit(json.dumps(local_data), sourcetype=sourcetype)
         except Exception as e:
-            logger.error('Error submitting paste_data to splunk')
+            logger.exception('Error submitting paste_data to splunk')
diff --git a/settings.json.sample b/settings.json.sample
index 02fa620..3ae5ea2 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -67,6 +67,7 @@
         "splunk_user": "user",
         "splunk_pass": "pass",
         "splunk_index": "paste-test",
+        "splunk_sourcetype": "pastehunter",
         "store_raw": true
     },

From 04a8ba65b6fd8f72fe3d171dd7525c6dc24048fd Mon Sep 17 00:00:00 2001
From: KevTheHermit Date: Tue, 24 Sep 2019 20:07:10 +0100
Subject: [PATCH 159/178] Fix filename yara rules

---
 inputs/pastebin.py      | 1 +
 inputs/stackexchange.py | 1 +
 settings.json.sample    | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/inputs/pastebin.py b/inputs/pastebin.py
index e7d8794..23cb0a0 100644
--- a/inputs/pastebin.py
+++ b/inputs/pastebin.py
@@ -31,6 +31,7 @@ def recent_pastes(conf, input_history):
             # Create a new paste dict for us to normalize
             paste_data = paste
+            paste_data['filename'] = paste['key']
             paste_data['confname'] = 'pastebin'
             paste_data['pasteid'] = paste['key']
             paste_data['pastesite'] = 'pastebin.com'
diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py
index aa5ef07..1d9e6f7 100644
--- a/inputs/stackexchange.py
+++ b/inputs/stackexchange.py
@@ -58,6 +58,7 @@ def recent_pastes(conf, input_history):
             # Create a new question dict for us to normalize
             question_data = question
+            question_data['filename'] = ''
             question_data['confname'] = "stackexchange"
             # Force type to string else it breaks ES Index mappings
             question_data['pasteid'] = str(question['question_id'])
diff --git a/settings.json.sample b/settings.json.sample
index a9b5316..f006a2e 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -182,7 +182,7 @@
             "rule_list": ["ALL"]
         },
         "post_compress": {
-            "enabled": true,
+            "enabled": false,
             "module": "postprocess.post_compress",
             "rule_list": ["ALL"]
         }
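For reference, the input modules touched above and below share one small contract;
a sketch (the fetch helper is hypothetical):

    # every module in inputs/ exposes recent_pastes(conf, input_history) and
    # returns (paste_list, history): normalized paste dicts plus the ids seen
    # this run, which pastehunter feeds back in as input_history next time.
    def recent_pastes(conf, input_history):
        paste_list, history = [], []
        for item in fetch_feed(conf):      # hypothetical fetch helper
            history.append(item['id'])
            if item['id'] in input_history:
                continue                   # already processed last run
            paste_list.append({
                'pasteid': item['id'],
                'pastesite': 'example.com',
                'scrape_url': item['raw_url'],
                'filename': '',            # now expected by the yara externals
            })
        return paste_list, history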
From c598dc326ddd6d6a91eaa7386075b49635844121 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Wed, 25 Sep 2019 19:37:37 -0600 Subject: [PATCH 160/178] Fixed scrape_uri for stackoverflow --- inputs/stackexchange.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inputs/stackexchange.py b/inputs/stackexchange.py index 1d9e6f7..244e1f5 100644 --- a/inputs/stackexchange.py +++ b/inputs/stackexchange.py @@ -63,6 +63,8 @@ def recent_pastes(conf, input_history): # Force type to string else it breaks ES Index mappings question_data['pasteid'] = str(question['question_id']) question_data['pastesite'] = site + # Set the raw uri to avoid breaking other things. Defaults to empty if not found + question_data['scrape_url'] = question.get('link', '') # Get the author and then trim the data we store. question_data['username'] = question['owner']['display_name'] del question_data['owner'] From ff2a2f45bde191bc86ae3ec5ee1094bfd8f1d05c Mon Sep 17 00:00:00 2001 From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Wed, 28 Aug 2019 08:23:00 -0400 Subject: [PATCH 161/178] Allow process_timeout configuration Based on the sandboxes being used the per process timeout might need to be longer than 5 seconds. This change allows the settings to configure this timeout. --- pastehunter.py | 2 +- settings.json.sample | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 25877d2..13314c6 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -157,7 +157,7 @@ def paste_scanner(): sleep(0.5) else: paste_data = q.get() - with timeout(seconds=10): + with timeout(seconds=conf['general']['process_timeout']): # Start a timer start_time = time.time() logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) diff --git a/settings.json.sample b/settings.json.sample index f006a2e..062155d 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -153,7 +153,9 @@ "format": "%(asctime)s [%(threadName)-12.12s] %(levelname)s:%(message)s" }, "general": { - "run_frequency": 300 + "run_frequency": 300, + "process_timeout": 5 + }, }, "post_process": { "post_email": { From c86f2b54eb9792645b22ad6520f817323d9cc5a9 Mon Sep 17 00:00:00 2001 From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Wed, 28 Aug 2019 08:24:29 -0400 Subject: [PATCH 162/178] Ensure all paste_data is available to post processing Previously things like hashes and triggered YaraRules were not available in the post processing steps. This change moves the setting of these values earlier so post processing actions can use them in their logic --- pastehunter.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index 13314c6..8a1f796 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -237,6 +237,17 @@ def paste_scanner(): # Else use the rule name else: results.append(match.rule) + + # Store additional fields for passing on to post processing + encoded_paste_data = raw_paste_data.encode('utf-8') + md5 = hashlib.md5(encoded_paste_data).hexdigest() + sha256 = hashlib.sha256(encoded_paste_data).hexdigest() + paste_data['MD5'] = md5 + paste_data['SHA256'] = sha256 + paste_data['raw_paste'] = raw_paste_data + paste_data['YaraRule'] = results + # Set the size for all pastes - This will override any size set by the source + paste_data['size'] = len(raw_paste_data) # Store all OverRides other options. 
paste_site = paste_data['confname'] @@ -282,21 +293,6 @@ def paste_scanner(): results.append('no_match') if len(results) > 0: - - encoded_paste_data = raw_paste_data.encode('utf-8') - md5 = hashlib.md5(encoded_paste_data).hexdigest() - sha256 = hashlib.sha256(encoded_paste_data).hexdigest() - paste_data['MD5'] = md5 - paste_data['SHA256'] = sha256 - # It is possible a post module modified or set this field. - if not paste_data.get('raw_paste'): - paste_data['raw_paste'] = raw_paste_data - paste_data['size'] = len(raw_paste_data) - else: - # Set size based on modified value - paste_data['size'] = len(paste_data['raw_paste']) - - paste_data['YaraRule'] = results for output in outputs: try: output.store_paste(paste_data) From 40d1ccf5a81904d5599de1609900090d9acf8fe8 Mon Sep 17 00:00:00 2001 From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Wed, 28 Aug 2019 08:25:50 -0400 Subject: [PATCH 163/178] Modularize sandbox support Sandboxes can now be created by simply making a new python file and adding their config info into settings. --- postprocess/post_b64.py | 45 ++++++++--------------------------------- sandboxes/__init__.py | 0 sandboxes/cuckoo.py | 36 +++++++++++++++++++++++++++++++++ sandboxes/viper.py | 19 +++++++++++++++++ settings.json.sample | 25 +++++++++++++---------- 5 files changed, 77 insertions(+), 48 deletions(-) create mode 100644 sandboxes/__init__.py create mode 100644 sandboxes/cuckoo.py create mode 100644 sandboxes/viper.py diff --git a/postprocess/post_b64.py b/postprocess/post_b64.py index 150d671..260e16e 100644 --- a/postprocess/post_b64.py +++ b/postprocess/post_b64.py @@ -1,9 +1,8 @@ -import io import re import hashlib +import importlib import gzip import logging -import requests from base64 import b64decode # This gets the raw paste and the paste_data json object from common import parse_config @@ -45,6 +44,7 @@ def run(results, raw_paste_data, paste_object): paste_object["decompressed_stream"] = encoded except Exception as e: logger.error("Unable to decompress gzip stream") + if rule == 'b64_exe': try: raw_exe = b64decode(raw_paste_data) @@ -55,47 +55,18 @@ def run(results, raw_paste_data, paste_object): # We are guessing that the sample has been submitted, and crafting a URL paste_object["VT"] = 'https://www.virustotal.com/#/file/{0}'.format(paste_object["exe_md5"]) - # Cuckoo - if conf["post_process"]["post_b64"]["cuckoo"]["enabled"]: - logger.info("Submitting to Cuckoo") - try: - task_id = send_to_cuckoo(raw_exe, paste_object["pasteid"]) - paste_object["Cuckoo Task ID"] = task_id - logger.info("exe submitted to Cuckoo with task id {0}".format(task_id)) - except Exception as e: - logger.error("Unabled to submit sample to cuckoo") - - # Viper - if conf["post_process"]["post_b64"]["viper"]["enabled"]: - send_to_cuckoo(raw_exe, paste_object["pasteid"]) - - # VirusTotal + # If sandbox modules are enabled then submit the file + for sandbox, sandbox_values in conf["sandboxes"].items(): + if sandbox_values["enabled"]: + logger.info("Uploading file {0} using {1}".format(paste_object["pasteid"], sandbox_values["module"])) + sandbox_module = importlib.import_module(sandbox_values["module"]) + paste_object = sandbox_module.upload_file(raw_exe, paste_object) except Exception as e: logger.error("Unable to decode exe file") - # Get unique domain count # Update the json # Send the updated json back return paste_object - - -def send_to_cuckoo(raw_exe, pasteid): - cuckoo_ip = conf["post_process"]["post_b64"]["cuckoo"]["api_host"] - cuckoo_port = 
conf["post_process"]["post_b64"]["cuckoo"]["api_port"] - cuckoo_host = 'http://{0}:{1}'.format(cuckoo_ip, cuckoo_port) - submit_file_url = '{0}/tasks/create/file'.format(cuckoo_host) - files = {'file': ('{0}.exe'.format(pasteid), io.BytesIO(raw_exe))} - submit_file = requests.post(submit_file_url, files=files).json() - task_id = None - try: - task_id = submit_file['task_id'] - except KeyError: - try: - task_id = submit_file['task_ids'][0] - except KeyError: - logger.error(submit_file) - - return task_id diff --git a/sandboxes/__init__.py b/sandboxes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sandboxes/cuckoo.py b/sandboxes/cuckoo.py new file mode 100644 index 0000000..7685646 --- /dev/null +++ b/sandboxes/cuckoo.py @@ -0,0 +1,36 @@ +import io +import logging +import requests +from common import parse_config +conf = parse_config() + +logger = logging.getLogger('pastehunter') + +def upload_file(raw_file, paste_object): + try: + task_id = send_to_cuckoo(raw_file, paste_object["pasteid"]) + paste_object["Cuckoo Task ID"] = task_id + logger.info("exe submitted to Cuckoo with task id {0}".format(task_id)) + except Exception as e: + logger.error("Unabled to submit sample to cuckoo") + + # Send any updated json back + return paste_object + +def send_to_cuckoo(raw_exe, pasteid): + cuckoo_ip = conf["sandboxes"]["cuckoo"]["api_host"] + cuckoo_port = conf["sandboxes"]["cuckoo"]["api_port"] + cuckoo_host = 'http://{0}:{1}'.format(cuckoo_ip, cuckoo_port) + submit_file_url = '{0}/tasks/create/file'.format(cuckoo_host) + files = {'file': ('{0}.exe'.format(pasteid), io.BytesIO(raw_exe))} + submit_file = requests.post(submit_file_url, files=files).json() + task_id = None + try: + task_id = submit_file['task_id'] + except KeyError: + try: + task_id = submit_file['task_ids'][0] + except KeyError: + logger.error(submit_file) + + return task_id diff --git a/sandboxes/viper.py b/sandboxes/viper.py new file mode 100644 index 0000000..b7f085d --- /dev/null +++ b/sandboxes/viper.py @@ -0,0 +1,19 @@ +import io +import logging +import requests +from common import parse_config +conf = parse_config() + +logger = logging.getLogger('pastehunter') + +def upload_file(raw_file, paste_object): + viper_ip = conf["sandboxes"]["viper"]["api_host"] + viper_port = conf["sandboxes"]["viper"]["api_port"] + viper_host = 'http://{0}:{1}'.format(viper_ip, viper_port) + + submit_file_url = '{0}/tasks/create/file'.format(viper_host) + files = {'file': ('{0}.exe'.format(paste_object["pasteid"]), io.BytesIO(raw_file))} + submit_file = requests.post(submit_file_url, files=files).json() + + # Send any updated json back + return paste_object diff --git a/settings.json.sample b/settings.json.sample index 062155d..f2cc1b0 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -156,6 +156,19 @@ "run_frequency": 300, "process_timeout": 5 }, + "sandboxes": { + "cuckoo": { + "enabled": false, + "module": "sandboxes.cuckoo", + "api_host": "127.0.0.1", + "api_port": 8080 + }, + "viper": { + "enabled": false, + "module": "sandboxes.viper", + "api_host": "127.0.0.1", + "api_port": 8080 + } }, "post_process": { "post_email": { @@ -166,17 +179,7 @@ "post_b64": { "enabled": true, "module": "postprocess.post_b64", - "rule_list": ["b64_exe", "b64_rar", "b64_zip", "b64_gzip"], - "cuckoo": { - "enabled": false, - "api_host": "127.0.0.1", - "api_port": 8080 - }, - "viper": { - "enabled": false, - "api_host": "127.0.0.1", - "api_port": 8080 - } + "rule_list": ["b64_exe", "b64_rar", "b64_zip", "b64_gzip"] }, "post_entropy": { 
"enabled": false, From c33ab4318e3094a0bb5c069118148d83de4166fa Mon Sep 17 00:00:00 2001 From: Scott Knight <4534275+knightsc@users.noreply.github.com> Date: Tue, 3 Sep 2019 14:50:46 -0400 Subject: [PATCH 164/178] Add documentation around the new sandboxes module and config --- README.md | 5 +++++ docs/postprocess.rst | 15 +-------------- docs/sandboxes.rst | 25 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 docs/sandboxes.rst diff --git a/README.md b/README.md index 84d0e7d..9b696e3 100644 --- a/README.md +++ b/README.md @@ -22,4 +22,9 @@ Pastehunter supports several output modules: - Dump to CSV file. - Send to syslog. + ## Supported Sandboxes + Pastehunter supports several sandboxes that decoded data can be sent to: + - Cuckoo + - Viper + For examples of data discovered using pastehunter check out my posts https://techanarchy.net/blog/hunting-pastebin-with-pastehunter and https://techanarchy.net/blog/pastehunter-the-results diff --git a/docs/postprocess.rst b/docs/postprocess.rst index eaac1d5..15512d9 100644 --- a/docs/postprocess.rst +++ b/docs/postprocess.rst @@ -29,20 +29,7 @@ when the full paste is a base64 blob, i.e. it will not extract base64 code that - **rule_list**: List of rules that will trigger the postprocess module. - -Cuckoo -^^^^^^ -If the samples match a binary file format you can optionaly send the file for analysis by a Cuckoo Sandbox. - -- **api_host**: IP or hostname for a Cuckoo API endpoint. -- **api_port**: Port number for a Cuckoo API endpoint. - -Viper -^^^^^ -If the samples match a binary file format you can optionaly send the file to a Viper instance for further analysis. - -- **api_host**: IP or hostname for a Cuckoo API endpoint. -- **api_port**: Port number for a Cuckoo API endpoint. +See the `Sandboxes documentation `_ for information on how to configure the sandboxes used for scanning decoded base64 data. Entropy diff --git a/docs/sandboxes.rst b/docs/sandboxes.rst new file mode 100644 index 0000000..07d099c --- /dev/null +++ b/docs/sandboxes.rst @@ -0,0 +1,25 @@ +Sandboxes +========= + +There are a few sandboxes that can be configured and used in various post process steps. + +There are a few generic options for each input. + +- **enabled**: This turns the sandbox on and off. +- **module**: This is used internally by pastehunter. + +Cuckoo +------ + +If the samples match a binary file format you can optionaly send the file for analysis by a Cuckoo Sandbox. + +- **api_host**: IP or hostname for a Cuckoo API endpoint. +- **api_port**: Port number for a Cuckoo API endpoint. + +Viper +----- + +If the samples match a binary file format you can optionaly send the file to a Viper instance for further analysis. + +- **api_host**: IP or hostname for a Viper API endpoint. +- **api_port**: Port number for a Viper API endpoint. 
From 71f5f32d0bbada6c903d2b2db205e80a2c668463 Mon Sep 17 00:00:00 2001 From: Alesandro Ortiz Date: Mon, 25 Nov 2019 20:54:18 -0500 Subject: [PATCH 165/178] Update Slexy input to use HTTPS, unique UA --- inputs/slexy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inputs/slexy.py b/inputs/slexy.py index c95b377..856be13 100644 --- a/inputs/slexy.py +++ b/inputs/slexy.py @@ -10,7 +10,7 @@ class SlexySite(object): def __init__(self): self.site = "slexy.org" - url_slexy = "http://" + self.site + url_slexy = "https://" + self.site self.url_recent = url_slexy + "/recent" self.url_view = url_slexy + "/view" self.url_raw = url_slexy + "/raw" @@ -27,7 +27,7 @@ def create_req(self, url): data=None, headers={ 'Referer': self.url_recent, - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36' + 'User-Agent': 'PasteHunter' } ) From 4c7204a2145bd92097bb52980e98247c7f9b613b Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sun, 8 Dec 2019 16:38:17 -0800 Subject: [PATCH 166/178] Fixed error when running on pastes without filenames Should fix #96 --- pastehunter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pastehunter.py b/pastehunter.py index 8a1f796..aeaf2d1 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -215,7 +215,7 @@ def paste_scanner(): # Process the paste data here try: # Scan with yara - matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('filename')}) + matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('filename', '')}) except Exception as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) continue From 6e394a3a796767e275e43af956f53f5dd0ad7caf Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Tue, 24 Dec 2019 17:59:47 -0700 Subject: [PATCH 167/178] Switched to using fnmatch --- inputs/github.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/inputs/github.py b/inputs/github.py index 251ee3e..cb4130f 100644 --- a/inputs/github.py +++ b/inputs/github.py @@ -1,8 +1,8 @@ import logging import math -import pathlib from datetime import datetime +import fnmatch import requests # Future work/improvement that can happen here: support PR diffs, they contain a patch URL @@ -105,15 +105,10 @@ def recent_pastes(conf, input_history): continue for file in commit_data.get('files'): file_path = file.get('filename') - file_name = file_path - # Convert path -> filename - if '/' in file_name: - file_name = file_name.split('/')[-1] for pattern in gh_file_blacklist: - pathlib.PurePath(file_path).match(pattern) - if file_name in gh_file_blacklist: - logger.info('Blacklisting file {0} from event {1}'.format(file_name, event_id)) - continue + if fnmatch.fnmatch(file_path, pattern): + logger.info('Blacklisting file {0} from event {1} (matched pattern "{2}")'.format(file_path, event_id, pattern)) + continue gist_data = file gist_data['confname'] = 'github' From 427b30a85a07c9c6b04d99a8cd929b5123d0557f Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Tue, 24 Dec 2019 18:00:32 -0700 Subject: [PATCH 168/178] Switched to using mutliprocessing pool and updated ignore list --- pastehunter.py | 371 +++++++++++++++++++++---------------------- settings.json.sample | 2 +- 2 files changed, 179 insertions(+), 194 deletions(-) diff --git a/pastehunter.py b/pastehunter.py index aeaf2d1..05aa30e 100644 --- a/pastehunter.py +++ b/pastehunter.py @@ -1,25 +1,23 @@ #!/usr/bin/python3 - -import os 
-import sys -import yara -import json +import errno import hashlib -import requests -import multiprocessing import importlib +import json import logging -from logging import handlers -import time -import errno +import multiprocessing +import os import signal +import sys +import time +from io import StringIO, BytesIO +from logging import handlers from time import sleep from urllib.parse import unquote_plus -from common import parse_config -from postprocess import post_email +import requests +import yara -from multiprocessing import Queue +from common import parse_config VERSION = 1.0 @@ -53,8 +51,7 @@ def __init__(self, seconds=1, error_message='Timeout'): self.seconds = seconds self.error_message = error_message def handle_timeout(self, signum, frame): - print("Process timeout: {0}".format(self.error_message)) - sys.exit(0) + raise TimeoutError("Process timeout: {0}".format(self.error_message)) def __enter__(self): signal.signal(signal.SIGALRM, self.handle_timeout) signal.alarm(self.seconds) @@ -146,168 +143,167 @@ def yara_index(rule_path, blacklist, test_rules): yar.write(include) -def paste_scanner(): - # Get a paste URI from the Queue +def paste_scanner(paste_data, rules_buff): + # Grab yara rules from passed buffer # Fetch the raw paste # scan the Paste # Store the Paste - while True: - if q.empty(): - # Queue was empty, sleep to prevent busy loop - sleep(0.5) - else: - paste_data = q.get() - with timeout(seconds=conf['general']['process_timeout']): - # Start a timer - start_time = time.time() - logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - # get raw paste and hash them - try: - - # Stack questions dont have a raw endpoint - if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): - # The body is already included in the first request so we do not need a second call to the API. - - # Unescape the code block strings in the json body. - raw_body = paste_data['body'] - raw_paste_data = unquote_plus(raw_body) - - # now remove the old body key as we dont need it any more - del paste_data['body'] - + + rules_buff.seek(0) + rules = yara.load(file=rules_buff) + try: + with timeout(seconds=conf['general']['process_timeout']): + # Start a timer + start_time = time.time() + logger.debug("Found New {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + # get raw paste and hash them + try: + + # Stack questions dont have a raw endpoint + if ('stackexchange' in conf['inputs']) and (paste_data['pastesite'] in conf['inputs']['stackexchange']['site_list']): + # The body is already included in the first request so we do not need a second call to the API. + + # Unescape the code block strings in the json body. 
+ raw_body = paste_data['body'] + raw_paste_data = unquote_plus(raw_body) + + # now remove the old body key as we dont need it any more + del paste_data['body'] + + else: + raw_paste_uri = paste_data['scrape_url'] + if not raw_paste_uri: + logger.info('Unable to retrieve paste, no uri found.') + logger.debug(json.dumps(paste_data)) + raw_paste_data = "" else: - raw_paste_uri = paste_data['scrape_url'] - if not raw_paste_uri: - logger.info('Unable to retrieve paste, no uri found.') - logger.debug(json.dumps(paste_data)) - raw_paste_data = "" - else: - raw_paste_data = requests.get(raw_paste_uri).text - + raw_paste_data = requests.get(raw_paste_uri).text + + # Cover fetch site SSLErrors + except requests.exceptions.SSLError as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + raw_paste_data = "" + + # General Exception + except Exception as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + raw_paste_data = "" + + # Pastebin Cache + if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.": + logger.info("Paste is still cached sleeping to try again") + sleep(45) + # get raw paste and hash them + raw_paste_uri = paste_data['scrape_url'] # Cover fetch site SSLErrors + try: + raw_paste_data = requests.get(raw_paste_uri).text except requests.exceptions.SSLError as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" - - # General Exception + + # General Exception except Exception as e: logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) raw_paste_data = "" - - # Pastebin Cache - if raw_paste_data == "File is not ready for scraping yet. Try again in 1 minute.": - logger.info("Paste is still cached sleeping to try again") - sleep(45) - # get raw paste and hash them - raw_paste_uri = paste_data['scrape_url'] - # Cover fetch site SSLErrors - try: - raw_paste_data = requests.get(raw_paste_uri).text - except requests.exceptions.SSLError as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - raw_paste_data = "" - # General Exception - except Exception as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - raw_paste_data = "" - - # Process the paste data here - try: - # Scan with yara - matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('filename', '')}) - except Exception as e: - logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) - continue - + # Process the paste data here + try: + # Scan with yara + matches = rules.match(data=raw_paste_data, externals={'filename': paste_data.get('filename', '')}) + except Exception as e: + logger.error("Unable to scan raw paste : {0} - {1}".format(paste_data['pasteid'], e)) + return False + + results = [] + for match in matches: + # For keywords get the word from the matched string + if match.rule == 'core_keywords' or match.rule == 'custom_keywords': + for s in match.strings: + rule_match = s[1].lstrip('$') + if rule_match not in results: + results.append(rule_match) + results.append(str(match.rule)) + + # But a break in here for the base64. Will use it later. 
+ elif match.rule.startswith('b64'): + results.append(match.rule) + + # Else use the rule name + else: + results.append(match.rule) + + # Store additional fields for passing on to post processing + encoded_paste_data = raw_paste_data.encode('utf-8') + md5 = hashlib.md5(encoded_paste_data).hexdigest() + sha256 = hashlib.sha256(encoded_paste_data).hexdigest() + paste_data['MD5'] = md5 + paste_data['SHA256'] = sha256 + paste_data['raw_paste'] = raw_paste_data + paste_data['YaraRule'] = results + # Set the size for all pastes - This will override any size set by the source + paste_data['size'] = len(raw_paste_data) + + # Store all OverRides other options. + paste_site = paste_data['confname'] + store_all = conf['inputs'][paste_site]['store_all'] + # remove the confname key as its not really needed past this point + del paste_data['confname'] + + + # Blacklist Check + # If any of the blacklist rules appear then empty the result set + blacklisted = False + if conf['yara']['blacklist'] and 'blacklist' in results: results = [] - for match in matches: - # For keywords get the word from the matched string - if match.rule == 'core_keywords' or match.rule == 'custom_keywords': - for s in match.strings: - rule_match = s[1].lstrip('$') - if rule_match not in results: - results.append(rule_match) - results.append(str(match.rule)) - - # But a break in here for the base64. Will use it later. - elif match.rule.startswith('b64'): - results.append(match.rule) - - # Else use the rule name - else: - results.append(match.rule) - - # Store additional fields for passing on to post processing - encoded_paste_data = raw_paste_data.encode('utf-8') - md5 = hashlib.md5(encoded_paste_data).hexdigest() - sha256 = hashlib.sha256(encoded_paste_data).hexdigest() - paste_data['MD5'] = md5 - paste_data['SHA256'] = sha256 - paste_data['raw_paste'] = raw_paste_data - paste_data['YaraRule'] = results - # Set the size for all pastes - This will override any size set by the source - paste_data['size'] = len(raw_paste_data) - - # Store all OverRides other options. - paste_site = paste_data['confname'] - store_all = conf['inputs'][paste_site]['store_all'] - # remove the confname key as its not really needed past this point - del paste_data['confname'] - - - # Blacklist Check - # If any of the blacklist rules appear then empty the result set - blacklisted = False - if conf['yara']['blacklist'] and 'blacklist' in results: - results = [] - blacklisted = True - logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) - - - # Post Process - - # If post module is enabled and the paste has a matching rule. - post_results = paste_data - for post_process, post_values in conf["post_process"].items(): - if post_values["enabled"]: - if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: - if not blacklisted: - logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) - post_module = importlib.import_module(post_values["module"]) - post_results = post_module.run(results, - raw_paste_data, - paste_data - ) - - # Throw everything back to paste_data for ease. - paste_data = post_results - - - # If we have a result add some meta data and send to storage - # If results is empty, ie no match, and store_all is True, - # then append "no_match" to results. This will then force output. 
- - if store_all is True: - if len(results) == 0: - results.append('no_match') - - if len(results) > 0: - for output in outputs: - try: - output.store_paste(paste_data) - except Exception as e: - logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) - - end_time = time.time() - logger.debug("Processing Finished for {0} in {1} seconds".format( - paste_data["pasteid"], - (end_time - start_time) - )) - - - -if __name__ == "__main__": + blacklisted = True + logger.info("Blacklisted {0} paste {1}".format(paste_data['pastesite'], paste_data['pasteid'])) + + + # Post Process + + # If post module is enabled and the paste has a matching rule. + post_results = paste_data + for post_process, post_values in conf["post_process"].items(): + if post_values["enabled"]: + if any(i in results for i in post_values["rule_list"]) or "ALL" in post_values["rule_list"]: + if not blacklisted: + logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) + post_module = importlib.import_module(post_values["module"]) + post_results = post_module.run(results, + raw_paste_data, + paste_data + ) + + # Throw everything back to paste_data for ease. + paste_data = post_results + + + # If we have a result add some meta data and send to storage + # If results is empty, ie no match, and store_all is True, + # then append "no_match" to results. This will then force output. + + if store_all is True: + if len(results) == 0: + results.append('no_match') + + if len(results) > 0: + for output in outputs: + try: + output.store_paste(paste_data) + except Exception as e: + logger.error("Unable to store {0} to {1} with error {2}".format(paste_data["pasteid"], output, e)) + + end_time = time.time() + logger.debug("Processing Finished for {0} in {1} seconds".format( + paste_data["pasteid"], + (end_time - start_time) + )) + return True + except TimeoutError: + return False + +def main(): logger.info("Compile Yara Rules") try: # Update the yara rules index @@ -318,37 +314,24 @@ def paste_scanner(): # Compile the yara rules we will use to match pastes index_file = os.path.join(conf['yara']['rule_path'], 'index.yar') rules = yara.compile(index_file, externals={'filename': ''}) + + # Used for sharing across processes + rules_buff = BytesIO() + rules.save(file=rules_buff) + except Exception as e: logger.exception("Unable to Create Yara index: ", e) sys.exit() # Create Queue to hold paste URI's - q = Queue() - processes = [] + pool = multiprocessing.Pool(processes=5) + results = [] # Now Fill the Queue try: while True: queue_count = 0 - counter = 0 - if len(processes) < 5: - for i in range(5-len(processes)): - logger.warning("Creating New Process") - m = multiprocessing.Process(target=paste_scanner) - # Add new process to list so we can run join on them later. - processes.append(m) - m.start() - for process in processes: - if not process.is_alive(): - logger.warning("Restarting Dead Process") - del processes[counter] - m = multiprocessing.Process(target=paste_scanner) - # Add new process to list so we can run join on them later. 
- processes.append(m) - m.start() - counter += 1 - - # Check if the processors are active + # Paste History logger.info("Populating Queue") if os.path.exists('paste_history.tmp'): @@ -362,7 +345,7 @@ def paste_scanner(): input_history = paste_history[input_name] else: input_history = [] - + try: i = importlib.import_module(input_name) @@ -370,7 +353,8 @@ def paste_scanner(): logger.info("Fetching paste list from {0}".format(input_name)) paste_list, history = i.recent_pastes(conf, input_history) for paste in paste_list: - q.put(paste) + # Create a new async job for the existing pool and apply it to "results" + results.append(pool.apply_async(paste_scanner, (paste, rules_buff))) queue_count += 1 paste_history[input_name] = history except Exception as e: @@ -382,18 +366,19 @@ def paste_scanner(): json.dump(paste_history, outfile) logger.info("Added {0} Items to the queue".format(queue_count)) - for proc in processes: - proc.join(2) + # Wait for all work to finish + [result.wait() for result in results] # Slow it down a little logger.info("Sleeping for " + str(conf['general']['run_frequency']) + " Seconds") sleep(conf['general']['run_frequency']) - + except KeyboardInterrupt: logger.info("Stopping Processes") - for proc in processes: - proc.terminate() - proc.join() + pool.terminate() + pool.join() +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/settings.json.sample b/settings.json.sample index f2cc1b0..3f3cad0 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -33,7 +33,7 @@ "api_limit": 100, "store_all": false, "user_blacklist": [], - "file_blacklist": ["node_modules/*", "__pycache__/*", "*/grahamcofborg-eval-package-list", "*/yarn.lock", "*.3ds", "*.3g2", "*.3gp", "*.7z", "*.DS_Store", "*.a", "*.aac", "*.adp", "*.ai", "*.aif", "*.aiff", "*.alz", "*.ape", "*.apk", "*.ar", "*.arj", "*.asf", "*.au", "*.avi", "*.bak", "*.bh", "*.bin", "*.bk", "*.bmp", "*.btif", "*.bz2", "*.bzip2", "*.cab", "*.caf", "*.cgm", "*.class", "*.cmx", "*.cpio", "*.cr2", "*.csv", "*.cur", "*.dat", "*.deb", "*.dex", "*.djvu", "*.dll", "*.dmg", "*.dng", "*.doc", "*.docm", "*.docx", "*.dot", "*.dotm", "*.dra", "*.dsk", "*.dts", "*.dtshd", "*.dvb", "*.dwg", "*.dxf", "*.ecelp4800", "*.ecelp7470", "*.ecelp9600", "*.egg", "*.eol", "*.eot", "*.epub", "*.exe", "*.f4v", "*.fbs", "*.fh", "*.fla", "*.flac", "*.fli", "*.flv", "*.fpx", "*.fst", "*.fvt", "*.g3", "*.gif", "*.graffle", "*.gz", "*.gzip", "*.h261", "*.h263", "*.h264", "*.ico", "*.ief", "*.img", "*.ipa", "*.iso", "*.jar", "*.jpeg", "*.jpg", "*.jpgv", "*.jpm", "*.jxr", "*.key", "*.ktx", "*.lha", "*.lvp", "*.lz", "*.lzh", "*.lzma", "*.lzo", "*.m3u", "*.m4a", "*.m4v", "*.mar", "*.mdi", "*.mht", "*.mid", "*.midi", "*.mj2", "*.mka", "*.mkv", "*.mmr", "*.mng", "*.mobi", "*.mov", "*.movie", "*.mp3", "*.mp4", "*.mp4a", "*.mpeg", "*.mpg", "*.mpga", "*.mxu", "*.nef", "*.npx", "*.numbers", "*.o", "*.oga", "*.ogg", "*.ogv", "*.otf", "*.pages", "*.pbm", "*.pcx", "*.pdf", "*.pea", "*.pgm", "*.pic", "*.png", "*.pnm", "*.pot", "*.potm", "*.potx", "*.ppa", "*.ppam", "*.ppm", "*.pps", "*.ppsm", "*.ppsx", "*.ppt", "*.pptm", "*.pptx", "*.psd", "*.pya", "*.pyc", "*.pyo", "*.pyv", "*.qt", "*.rar", "*.ras", "*.raw", "*.rgb", "*.rip", "*.rlc", "*.rmf", "*.rmvb", "*.rtf", "*.rz", "*.s3m", "*.s7z", "*.scpt", "*.sgi", "*.shar", "*.sil", "*.sketch", "*.slk", "*.smv", "*.so", "*.sub", "*.swf", "*.tar", "*.tbz", "*.tbz2", "*.tga", "*.tgz", "*.thmx", "*.tif", "*.tiff", "*.tlz", "*.ttc", "*.ttf", "*.txz", "*.udf", "*.uvh", "*.uvi", "*.uvm", "*.uvp", "*.uvs", 
"*.uvu", "*.viv", "*.vob", "*.war", "*.wav", "*.wax", "*.wbmp", "*.wdp", "*.weba", "*.webm", "*.webp", "*.whl", "*.wim", "*.wm", "*.wma", "*.wmv", "*.wmx", "*.woff", "*.woff2", "*.wvx", "*.xbm", "*.xif", "*.xla", "*.xlam", "*.xls", "*.xlsb", "*.xlsm", "*.xlsx", "*.xlt", "*.xltm", "*.xltx", "*.xm", "*.xmind", "*.xpi", "*.xpm", "*.xwd", "*.xz", "*.z", "*.zip", "*.zipx"] + "file_blacklist": ["node_modules/*", "__pycache__/*", "*/grahamcofborg-eval-package-list", "*.lock", "*.3ds", "*.3g2", "*.3gp", "*.7z", "*.DS_Store", "*.a", "*.aac", "*.adp", "*.ai", "*.aif", "*.aiff", "*.alz", "*.ape", "*.apk", "*.ar", "*.arj", "*.asf", "*.au", "*.avi", "*.bak", "*.bh", "*.bin", "*.bk", "*.bmp", "*.btif", "*.bz2", "*.bzip2", "*.cab", "*.caf", "*.cgm", "*.class", "*.cmx", "*.cpio", "*.cr2", "*.cur", "*.dat", "*.deb", "*.dex", "*.djvu", "*.dll", "*.dmg", "*.dng", "*.doc", "*.docm", "*.docx", "*.dot", "*.dotm", "*.dra", "*.dsk", "*.dts", "*.dtshd", "*.dvb", "*.dwg", "*.dxf", "*.ecelp4800", "*.ecelp7470", "*.ecelp9600", "*.egg", "*.eol", "*.eot", "*.epub", "*.exe", "*.f4v", "*.fbs", "*.fh", "*.fla", "*.flac", "*.fli", "*.flv", "*.fpx", "*.fst", "*.fvt", "*.g3", "*.gif", "*.graffle", "*.gz", "*.gzip", "*.h261", "*.h263", "*.h264", "*.ico", "*.ief", "*.img", "*.ipa", "*.iso", "*.jar", "*.jpeg", "*.jpg", "*.jpgv", "*.jpm", "*.jxr","*.ktx", "*.lha", "*.lvp", "*.lz", "*.lzh", "*.lzma", "*.lzo", "*.m3u", "*.m4a", "*.m4v", "*.mar", "*.mdi", "*.mht", "*.mid", "*.midi", "*.mj2", "*.mka", "*.mkv", "*.mmr", "*.mng", "*.mobi", "*.mov", "*.movie", "*.mp3", "*.mp4", "*.mp4a", "*.mpeg", "*.mpg", "*.mpga", "*.mxu", "*.nef", "*.npx", "*.numbers", "*.o", "*.oga", "*.ogg", "*.ogv", "*.otf", "*.pages", "*.pbm", "*.pcx", "*.pdf", "*.pea", "*.pgm", "*.pic", "*.png", "*.pnm", "*.pot", "*.potm", "*.potx", "*.ppa", "*.ppam", "*.ppm", "*.pps", "*.ppsm", "*.ppsx", "*.ppt", "*.pptm", "*.pptx", "*.psd", "*.pya", "*.pyc", "*.pyo", "*.pyv", "*.qt", "*.rar", "*.ras", "*.raw", "*.rgb", "*.rip", "*.rlc", "*.rmf", "*.rmvb", "*.rtf", "*.rz", "*.s3m", "*.s7z", "*.scpt", "*.sgi", "*.shar", "*.sil", "*.sketch", "*.slk", "*.smv", "*.so", "*.sub", "*.swf", "*.tar", "*.tbz", "*.tbz2", "*.tga", "*.tgz", "*.thmx", "*.tif", "*.tiff", "*.tlz", "*.ttc", "*.ttf", "*.txz", "*.udf", "*.uvh", "*.uvi", "*.uvm", "*.uvp", "*.uvs", "*.uvu", "*.viv", "*.vob", "*.war", "*.wav", "*.wax", "*.wbmp", "*.wdp", "*.weba", "*.webm", "*.webp", "*.whl", "*.wim", "*.wm", "*.wma", "*.wmv", "*.wmx", "*.woff", "*.woff2", "*.wvx", "*.xbm", "*.xif", "*.xla", "*.xlam", "*.xls", "*.xlsb", "*.xlsm", "*.xlsx", "*.xlt", "*.xltm", "*.xltx", "*.xm", "*.xmind", "*.xpi", "*.xpm", "*.xwd", "*.xz", "*.z", "*.zip", "*.zipx"] }, "slexy":{ "enabled": true, From 1ec55ce9763bd3c9b2b3968063926208fefc82d4 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Wed, 25 Dec 2019 13:25:48 -0700 Subject: [PATCH 169/178] Began ignoring data uris in base64 to improve accuracy --- YaraRules/base64.yar | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index 6b3b6f0..d0c4317 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -88,7 +88,11 @@ rule b64_url $a4 = "V1dXLg" // WWW. // ignore vendor certs in this rule. 
The certs rule will pick them up if we want them - $not1 = "GlobalSign Root CA" nocase + $not1 = "GlobalSign Root CA" nocase + $not2 = /data:[a-z\/]+;aHR0cDov/ nocase + $not3 = /data:[a-z\/]+;SFRUUDov/ nocase + $not4 = /data:[a-z\/]+;d3d3Lg/ nocase + $not5 = /data:[a-z\/]+;V1dXLg/ nocase condition: any of ($a*) and not any of ($not*) From b38f312133cc3e4c70894dd730dd3bc2c4571dd7 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Wed, 25 Dec 2019 15:33:23 -0700 Subject: [PATCH 170/178] Added support for ignoring bots, minor fix to b64 urls --- YaraRules/base64.yar | 11 +++++++---- inputs/github.py | 5 +++++ settings.json.sample | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/YaraRules/base64.yar b/YaraRules/base64.yar index d0c4317..913d12c 100644 --- a/YaraRules/base64.yar +++ b/YaraRules/base64.yar @@ -89,10 +89,13 @@ rule b64_url // ignore vendor certs in this rule. The certs rule will pick them up if we want them $not1 = "GlobalSign Root CA" nocase - $not2 = /data:[a-z\/]+;aHR0cDov/ nocase - $not3 = /data:[a-z\/]+;SFRUUDov/ nocase - $not4 = /data:[a-z\/]+;d3d3Lg/ nocase - $not5 = /data:[a-z\/]+;V1dXLg/ nocase + + // Ignore data: uris. These are common in html and svg files. + $not2 = /data:[a-z\/]+;(base64,)?aHR0cDov/ nocase + $not3 = /data:[a-z\/]+;(base64,)?SFRUUDov/ nocase + $not4 = /data:[a-z\/]+;(base64,)?d3d3Lg/ nocase + $not5 = /data:[a-z\/]+;(base64,)?V1dXLg/ nocase + condition: any of ($a*) and not any of ($not*) diff --git a/inputs/github.py b/inputs/github.py index cb4130f..25b44ec 100644 --- a/inputs/github.py +++ b/inputs/github.py @@ -62,6 +62,7 @@ def recent_pastes(conf, input_history): gh_file_blacklist = conf['inputs']['github']['file_blacklist'] gh_user_blacklist = conf['inputs']['github']['user_blacklist'] + ignore_bots = conf['inputs']['github']['ignore_bots'] try: # Get the required amount of entries via pagination @@ -89,6 +90,10 @@ def recent_pastes(conf, input_history): if event_meta.get('actor').get('login') in gh_user_blacklist: logger.info('Blacklisting GitHub event from user: {0}'.format(event_meta.get('login'))) continue + if ignore_bots and event_meta.get('actor').get('login').endswith("[bot]"): + logger.info('Ignoring GitHub event from bot user: {0}'.format(event_meta.get('login'))) + continue + payload = event_meta.get('payload') if not 'commits' in payload: # Debug, because this is high output diff --git a/settings.json.sample b/settings.json.sample index 3f3cad0..938293f 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -32,6 +32,7 @@ "api_token": "", "api_limit": 100, "store_all": false, + "ignore_bots": false, "user_blacklist": [], "file_blacklist": ["node_modules/*", "__pycache__/*", "*/grahamcofborg-eval-package-list", "*.lock", "*.3ds", "*.3g2", "*.3gp", "*.7z", "*.DS_Store", "*.a", "*.aac", "*.adp", "*.ai", "*.aif", "*.aiff", "*.alz", "*.ape", "*.apk", "*.ar", "*.arj", "*.asf", "*.au", "*.avi", "*.bak", "*.bh", "*.bin", "*.bk", "*.bmp", "*.btif", "*.bz2", "*.bzip2", "*.cab", "*.caf", "*.cgm", "*.class", "*.cmx", "*.cpio", "*.cr2", "*.cur", "*.dat", "*.deb", "*.dex", "*.djvu", "*.dll", "*.dmg", "*.dng", "*.doc", "*.docm", "*.docx", "*.dot", "*.dotm", "*.dra", "*.dsk", "*.dts", "*.dtshd", "*.dvb", "*.dwg", "*.dxf", "*.ecelp4800", "*.ecelp7470", "*.ecelp9600", "*.egg", "*.eol", "*.eot", "*.epub", "*.exe", "*.f4v", "*.fbs", "*.fh", "*.fla", "*.flac", "*.fli", "*.flv", "*.fpx", "*.fst", "*.fvt", "*.g3", "*.gif", "*.graffle", "*.gz", "*.gzip", "*.h261", "*.h263", "*.h264", "*.ico", "*.ief", "*.img", "*.ipa", "*.iso", 
"*.jar", "*.jpeg", "*.jpg", "*.jpgv", "*.jpm", "*.jxr","*.ktx", "*.lha", "*.lvp", "*.lz", "*.lzh", "*.lzma", "*.lzo", "*.m3u", "*.m4a", "*.m4v", "*.mar", "*.mdi", "*.mht", "*.mid", "*.midi", "*.mj2", "*.mka", "*.mkv", "*.mmr", "*.mng", "*.mobi", "*.mov", "*.movie", "*.mp3", "*.mp4", "*.mp4a", "*.mpeg", "*.mpg", "*.mpga", "*.mxu", "*.nef", "*.npx", "*.numbers", "*.o", "*.oga", "*.ogg", "*.ogv", "*.otf", "*.pages", "*.pbm", "*.pcx", "*.pdf", "*.pea", "*.pgm", "*.pic", "*.png", "*.pnm", "*.pot", "*.potm", "*.potx", "*.ppa", "*.ppam", "*.ppm", "*.pps", "*.ppsm", "*.ppsx", "*.ppt", "*.pptm", "*.pptx", "*.psd", "*.pya", "*.pyc", "*.pyo", "*.pyv", "*.qt", "*.rar", "*.ras", "*.raw", "*.rgb", "*.rip", "*.rlc", "*.rmf", "*.rmvb", "*.rtf", "*.rz", "*.s3m", "*.s7z", "*.scpt", "*.sgi", "*.shar", "*.sil", "*.sketch", "*.slk", "*.smv", "*.so", "*.sub", "*.swf", "*.tar", "*.tbz", "*.tbz2", "*.tga", "*.tgz", "*.thmx", "*.tif", "*.tiff", "*.tlz", "*.ttc", "*.ttf", "*.txz", "*.udf", "*.uvh", "*.uvi", "*.uvm", "*.uvp", "*.uvs", "*.uvu", "*.viv", "*.vob", "*.war", "*.wav", "*.wax", "*.wbmp", "*.wdp", "*.weba", "*.webm", "*.webp", "*.whl", "*.wim", "*.wm", "*.wma", "*.wmv", "*.wmx", "*.woff", "*.woff2", "*.wvx", "*.xbm", "*.xif", "*.xla", "*.xlam", "*.xls", "*.xlsb", "*.xlsm", "*.xlsx", "*.xlt", "*.xltm", "*.xltx", "*.xm", "*.xmind", "*.xpi", "*.xpm", "*.xwd", "*.xz", "*.z", "*.zip", "*.zipx"] }, From 545ec67470f1af5c08e5225a3a6798212ae86f41 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Fri, 27 Dec 2019 09:36:29 -0700 Subject: [PATCH 171/178] Moved source to folder --- {inputs => src}/__init__.py | 0 common.py => src/common.py | 0 {outputs => src/inputs}/__init__.py | 0 {inputs => src/inputs}/dumpz.py | 0 {inputs => src/inputs}/gists.py | 0 {inputs => src/inputs}/github.py | 0 {inputs => src/inputs}/pastebin.py | 0 {inputs => src/inputs}/slexy.py | 0 {inputs => src/inputs}/stackexchange.py | 0 {postprocess => src/outputs}/__init__.py | 0 {outputs => src/outputs}/csv_output.py | 8 +++++--- {outputs => src/outputs}/elastic_output.py | 0 {outputs => src/outputs}/json_output.py | 0 {outputs => src/outputs}/slack_output.py | 3 --- {outputs => src/outputs}/smtp_output.py | 0 {outputs => src/outputs}/splunk_output.py | 1 - {outputs => src/outputs}/syslog_output.py | 0 {outputs => src/outputs}/twilio_output.py | 4 ++-- pastehunter.py => src/pastehunter.py | 0 {sandboxes => src/postprocess}/__init__.py | 0 {postprocess => src/postprocess}/post_b64.py | 1 - {postprocess => src/postprocess}/post_compress.py | 0 {postprocess => src/postprocess}/post_email.py | 0 {postprocess => src/postprocess}/post_entropy.py | 0 src/sandboxes/__init__.py | 0 {sandboxes => src/sandboxes}/cuckoo.py | 0 {sandboxes => src/sandboxes}/viper.py | 0 27 files changed, 7 insertions(+), 10 deletions(-) rename {inputs => src}/__init__.py (100%) rename common.py => src/common.py (100%) rename {outputs => src/inputs}/__init__.py (100%) rename {inputs => src/inputs}/dumpz.py (100%) rename {inputs => src/inputs}/gists.py (100%) rename {inputs => src/inputs}/github.py (100%) rename {inputs => src/inputs}/pastebin.py (100%) rename {inputs => src/inputs}/slexy.py (100%) rename {inputs => src/inputs}/stackexchange.py (100%) rename {postprocess => src/outputs}/__init__.py (100%) rename {outputs => src/outputs}/csv_output.py (82%) rename {outputs => src/outputs}/elastic_output.py (100%) rename {outputs => src/outputs}/json_output.py (100%) rename {outputs => src/outputs}/slack_output.py (97%) rename {outputs => src/outputs}/smtp_output.py (100%) rename 
{outputs => src/outputs}/splunk_output.py (98%) rename {outputs => src/outputs}/syslog_output.py (100%) rename {outputs => src/outputs}/twilio_output.py (97%) rename pastehunter.py => src/pastehunter.py (100%) rename {sandboxes => src/postprocess}/__init__.py (100%) rename {postprocess => src/postprocess}/post_b64.py (99%) rename {postprocess => src/postprocess}/post_compress.py (100%) rename {postprocess => src/postprocess}/post_email.py (100%) rename {postprocess => src/postprocess}/post_entropy.py (100%) create mode 100644 src/sandboxes/__init__.py rename {sandboxes => src/sandboxes}/cuckoo.py (100%) rename {sandboxes => src/sandboxes}/viper.py (100%) diff --git a/inputs/__init__.py b/src/__init__.py similarity index 100% rename from inputs/__init__.py rename to src/__init__.py diff --git a/common.py b/src/common.py similarity index 100% rename from common.py rename to src/common.py diff --git a/outputs/__init__.py b/src/inputs/__init__.py similarity index 100% rename from outputs/__init__.py rename to src/inputs/__init__.py diff --git a/inputs/dumpz.py b/src/inputs/dumpz.py similarity index 100% rename from inputs/dumpz.py rename to src/inputs/dumpz.py diff --git a/inputs/gists.py b/src/inputs/gists.py similarity index 100% rename from inputs/gists.py rename to src/inputs/gists.py diff --git a/inputs/github.py b/src/inputs/github.py similarity index 100% rename from inputs/github.py rename to src/inputs/github.py diff --git a/inputs/pastebin.py b/src/inputs/pastebin.py similarity index 100% rename from inputs/pastebin.py rename to src/inputs/pastebin.py diff --git a/inputs/slexy.py b/src/inputs/slexy.py similarity index 100% rename from inputs/slexy.py rename to src/inputs/slexy.py diff --git a/inputs/stackexchange.py b/src/inputs/stackexchange.py similarity index 100% rename from inputs/stackexchange.py rename to src/inputs/stackexchange.py diff --git a/postprocess/__init__.py b/src/outputs/__init__.py similarity index 100% rename from postprocess/__init__.py rename to src/outputs/__init__.py diff --git a/outputs/csv_output.py b/src/outputs/csv_output.py similarity index 82% rename from outputs/csv_output.py rename to src/outputs/csv_output.py index d79cf67..d9def77 100644 --- a/outputs/csv_output.py +++ b/src/outputs/csv_output.py @@ -1,11 +1,13 @@ +import logging import os import datetime from common import parse_config +logger = logging.getLogger('pastehunter') config = parse_config() -class CSVOutput(): +class CSVOutput(object): def __init__(self): base_path = config['outputs']['csv_output']['output_path'] # Get todays CSV @@ -18,7 +20,7 @@ def __init__(self): os.makedirs(base_path) self.test = True except OSError as e: - print("Unable to create CSV Path: {0}".format(e)) + logger.error("Unable to create CSV Path: {}".format(e)) self.test = False else: self.test = True @@ -34,4 +36,4 @@ def store_paste(self, paste_data): with open(self.csv_path, 'a') as out: out.write('{0}\n'.format(csv_line)) else: - print("CSV Output Error") + logging.error("CSV Output Error. 
Output path '{}' was never created.".format(self.csv_path)) diff --git a/outputs/elastic_output.py b/src/outputs/elastic_output.py similarity index 100% rename from outputs/elastic_output.py rename to src/outputs/elastic_output.py diff --git a/outputs/json_output.py b/src/outputs/json_output.py similarity index 100% rename from outputs/json_output.py rename to src/outputs/json_output.py diff --git a/outputs/slack_output.py b/src/outputs/slack_output.py similarity index 97% rename from outputs/slack_output.py rename to src/outputs/slack_output.py index 1b003a2..30d1fcf 100644 --- a/outputs/slack_output.py +++ b/src/outputs/slack_output.py @@ -1,6 +1,3 @@ -import os -import datetime -import json import logging import requests from common import parse_config diff --git a/outputs/smtp_output.py b/src/outputs/smtp_output.py similarity index 100% rename from outputs/smtp_output.py rename to src/outputs/smtp_output.py diff --git a/outputs/splunk_output.py b/src/outputs/splunk_output.py similarity index 98% rename from outputs/splunk_output.py rename to src/outputs/splunk_output.py index a825f63..d0aaede 100644 --- a/outputs/splunk_output.py +++ b/src/outputs/splunk_output.py @@ -1,5 +1,4 @@ from common import parse_config -from datetime import datetime import json import logging import splunklib.client as client diff --git a/outputs/syslog_output.py b/src/outputs/syslog_output.py similarity index 100% rename from outputs/syslog_output.py rename to src/outputs/syslog_output.py diff --git a/outputs/twilio_output.py b/src/outputs/twilio_output.py similarity index 97% rename from outputs/twilio_output.py rename to src/outputs/twilio_output.py index 3068bae..8f6a034 100644 --- a/outputs/twilio_output.py +++ b/src/outputs/twilio_output.py @@ -5,7 +5,7 @@ logger = logging.getLogger('pastehunter') config = parse_config() -class TwilioOutput(): +class TwilioOutput(object): def __init__(self): self.account_sid = config['outputs']['twilio_output']['account_sid'] self.auth_token = config['outputs']['twilio_output']['auth_token'] @@ -38,7 +38,7 @@ def store_paste(self, paste_data): paste_data['scrape_url'] ) - print("Sending Twilio Message") + logger.debug("Sending Twilio Message") if self.message_type == 'sms': for recipient in self.recipient_list: try: diff --git a/pastehunter.py b/src/pastehunter.py similarity index 100% rename from pastehunter.py rename to src/pastehunter.py diff --git a/sandboxes/__init__.py b/src/postprocess/__init__.py similarity index 100% rename from sandboxes/__init__.py rename to src/postprocess/__init__.py diff --git a/postprocess/post_b64.py b/src/postprocess/post_b64.py similarity index 99% rename from postprocess/post_b64.py rename to src/postprocess/post_b64.py index 260e16e..8930164 100644 --- a/postprocess/post_b64.py +++ b/src/postprocess/post_b64.py @@ -1,4 +1,3 @@ -import re import hashlib import importlib import gzip diff --git a/postprocess/post_compress.py b/src/postprocess/post_compress.py similarity index 100% rename from postprocess/post_compress.py rename to src/postprocess/post_compress.py diff --git a/postprocess/post_email.py b/src/postprocess/post_email.py similarity index 100% rename from postprocess/post_email.py rename to src/postprocess/post_email.py diff --git a/postprocess/post_entropy.py b/src/postprocess/post_entropy.py similarity index 100% rename from postprocess/post_entropy.py rename to src/postprocess/post_entropy.py diff --git a/src/sandboxes/__init__.py b/src/sandboxes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/sandboxes/cuckoo.py b/src/sandboxes/cuckoo.py similarity index 100% rename from sandboxes/cuckoo.py rename to src/sandboxes/cuckoo.py diff --git a/sandboxes/viper.py b/src/sandboxes/viper.py similarity index 100% rename from sandboxes/viper.py rename to src/sandboxes/viper.py From b120f8fa896fcf3565417e0a470423c02fe08600 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Fri, 27 Dec 2019 09:37:46 -0700 Subject: [PATCH 172/178] Fixed file formatter This fixes an error that would've occurred if the format string was empty in the config for file logging. Silly interpreted languages! --- src/pastehunter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pastehunter.py b/src/pastehunter.py index 05aa30e..878d6b0 100644 --- a/src/pastehunter.py +++ b/src/pastehunter.py @@ -9,7 +9,7 @@ import signal import sys import time -from io import StringIO, BytesIO +from io import BytesIO from logging import handlers from time import sleep from urllib.parse import unquote_plus @@ -80,7 +80,7 @@ def __exit__(self, type, value, traceback): fileFormatter = logging.Formatter("{0}".format(conf["log"]["format"])) fileHandler.setFormatter(fileFormatter) else: - fileHandler.setFormatter(logFormatter) + fileHandler.setFormatter(formatter) fileHandler.setLevel(conf["log"]["logging_level"]) logger.addHandler(fileHandler) logger.info("Enabled Log File: {0}".format(logfile)) From bb377128690ce5971412d9125cb0b77c7e06ced4 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sat, 28 Dec 2019 20:26:30 +0000 Subject: [PATCH 173/178] Adjust repo for pypi installation --- .travis.yml | 36 ++++++++++++++++ CHANGELOG.md | 16 +++++++ MANIFEST.in | 1 + src/pastehunter.py => pastehunter-cli | 2 +- .../YaraRules}/CryptoExchangeApi.yar | 0 .../YaraRules}/api_keys.yar | 0 {YaraRules => pastehunter/YaraRules}/aws.yar | 0 .../YaraRules}/base64.yar | 0 .../YaraRules}/blacklist.yar | 0 .../YaraRules}/certificates.yar | 0 .../YaraRules}/core_keywords.yar | 0 .../YaraRules}/database.yar | 0 .../YaraRules}/email_filter.yar | 0 .../YaraRules}/general.yar | 0 .../YaraRules}/github_dorks.yar | 0 {YaraRules => pastehunter/YaraRules}/hak5.yar | 0 .../YaraRules}/index.yar | 17 ++++---- .../YaraRules}/password_leak.yar | 0 .../YaraRules}/powershell.yar | 0 .../YaraRules}/test_rules.yar | 0 {src => pastehunter}/__init__.py | 0 {src => pastehunter}/common.py | 0 {src => pastehunter}/inputs/__init__.py | 0 {src => pastehunter}/inputs/dumpz.py | 0 {src => pastehunter}/inputs/gists.py | 0 {src => pastehunter}/inputs/github.py | 0 {src => pastehunter}/inputs/pastebin.py | 0 {src => pastehunter}/inputs/slexy.py | 0 {src => pastehunter}/inputs/stackexchange.py | 0 {src => pastehunter}/outputs/__init__.py | 0 {src => pastehunter}/outputs/csv_output.py | 2 +- .../outputs/elastic_output.py | 2 +- {src => pastehunter}/outputs/json_output.py | 2 +- {src => pastehunter}/outputs/slack_output.py | 2 +- {src => pastehunter}/outputs/smtp_output.py | 2 +- {src => pastehunter}/outputs/splunk_output.py | 2 +- {src => pastehunter}/outputs/syslog_output.py | 2 +- {src => pastehunter}/outputs/twilio_output.py | 2 +- {src => pastehunter}/postprocess/__init__.py | 0 {src => pastehunter}/postprocess/post_b64.py | 2 +- .../postprocess/post_compress.py | 2 +- .../postprocess/post_email.py | 0 .../postprocess/post_entropy.py | 0 {src => pastehunter}/sandboxes/__init__.py | 0 {src => pastehunter}/sandboxes/cuckoo.py | 2 +- {src => pastehunter}/sandboxes/viper.py | 2 +- settings.json.sample | 42 +++++++++---------- setup.py | 28 +++++++++++++ 48 files 
changed, 124 insertions(+), 42 deletions(-)
 create mode 100644 .travis.yml
 create mode 100644 CHANGELOG.md
 create mode 100644 MANIFEST.in
 rename src/pastehunter.py => pastehunter-cli (99%)
 rename {YaraRules => pastehunter/YaraRules}/CryptoExchangeApi.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/api_keys.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/aws.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/base64.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/blacklist.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/certificates.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/core_keywords.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/database.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/email_filter.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/general.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/github_dorks.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/hak5.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/index.yar (91%)
 rename {YaraRules => pastehunter/YaraRules}/password_leak.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/powershell.yar (100%)
 rename {YaraRules => pastehunter/YaraRules}/test_rules.yar (100%)
 rename {src => pastehunter}/__init__.py (100%)
 rename {src => pastehunter}/common.py (100%)
 rename {src => pastehunter}/inputs/__init__.py (100%)
 rename {src => pastehunter}/inputs/dumpz.py (100%)
 rename {src => pastehunter}/inputs/gists.py (100%)
 rename {src => pastehunter}/inputs/github.py (100%)
 rename {src => pastehunter}/inputs/pastebin.py (100%)
 rename {src => pastehunter}/inputs/slexy.py (100%)
 rename {src => pastehunter}/inputs/stackexchange.py (100%)
 rename {src => pastehunter}/outputs/__init__.py (100%)
 rename {src => pastehunter}/outputs/csv_output.py (96%)
 rename {src => pastehunter}/outputs/elastic_output.py (98%)
 rename {src => pastehunter}/outputs/json_output.py (95%)
 rename {src => pastehunter}/outputs/slack_output.py (97%)
 rename {src => pastehunter}/outputs/smtp_output.py (99%)
 rename {src => pastehunter}/outputs/splunk_output.py (97%)
 rename {src => pastehunter}/outputs/syslog_output.py (94%)
 rename {src => pastehunter}/outputs/twilio_output.py (98%)
 rename {src => pastehunter}/postprocess/__init__.py (100%)
 rename {src => pastehunter}/postprocess/post_b64.py (98%)
 rename {src => pastehunter}/postprocess/post_compress.py (95%)
 rename {src => pastehunter}/postprocess/post_email.py (100%)
 rename {src => pastehunter}/postprocess/post_entropy.py (100%)
 rename {src => pastehunter}/sandboxes/__init__.py (100%)
 rename {src => pastehunter}/sandboxes/cuckoo.py (96%)
 rename {src => pastehunter}/sandboxes/viper.py (93%)
 create mode 100644 setup.py

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..19a05d5
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,36 @@
+language: python
+sudo: required
+dist: bionic
+group: edge
+cache:
+  pip: true
+python:
+- 3.6
+- 3.6-dev
+before_install:
+- sudo apt-get update -qq
+- sudo apt-get install automake libtool make gcc libmagic-dev -yqq python3-pip unzip
+- wget https://github.com/VirusTotal/yara/archive/v3.10.0.tar.gz
+- tar -xzvf v3.10.0.tar.gz
+- cd yara-3.10.0/ && ./bootstrap.sh && ./configure --enable-dotnet --enable-magic
+  && make && sudo make install && cd ../
+- git clone --recursive https://github.com/VirusTotal/yara-python
+- pip3 install pytest codecov pytest-cov
+- cd yara-python
+- python setup.py build --enable-magic --enable-dotnet
+- python setup.py install && cd ../ && rm -rf yara-python && rm -rf yara-3.10.0/
+install:
+- pip install -r requirements.txt
+- pip install -e .
+script:
+- pastehunter-cli
+after_success:
+- python setup.py sdist
+deploy:
+  provider: pypi
+  user: __token__
+  password:
+    secure: ZYILSwAsPcCWa4Ccslu2F+HVw02Rafdf4HqnQla3uCCTlEQQ+cFyuTKxQB46xytgblFQv/99oxq3SwVTUX4C6cIa8D+zHm/6lR4Tu+YPthYZX9IashF/AMKkyKks8bxbB0x/3t7hBX+7w++OcC1wwCXUyX7btsiOBa28k1NZCsB26NgdpBn02wF/GwqDhkxKkW9Bi7KDjb58GdiyhgVXxOOaOYbRyKiNZqUKQx504zmc0aGSPYCs0gSPwoA0T3FUet4IBcjjTP9DsjjkyQ7K6iMWYNGsAP91HnZe5J4sZYqwrGs++vndJVa/bYpiyMCjUrG4c6okdS0zpSmfbrqJay12wH5qroqqLxwuLtrXcHK+ChlyvhsGHMN51rqX811zdt/IzDwi+hXz84e8Y8/YgUTx7j0/HPEdrHjIIbMoIEd9Wy42+TcRCHJOULjsg7Kc7KLd1ILvxxyV+REnkfaazeqmgSNlqFxM2A65dkq3xNt9CDtYQlX/IhTDBy2/qY3m60uOh92ptd5f5eHF28W89APnkRAHD2JSEVRym1fHNrvPl1NCJT8NavbdYup/dH8hQadMx72X022lmyFASHN92G78O3uA0fZ8B/hzCpVQ4KTTIT4/LqkAXuWlfW4z9wC62V2ZdL6E76lqbMPokeXfH8Tf+chAaw/XHr7Wk6bWkOQ=
+  on:
+    branch: master
+  skip_existing: true

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..2f62573
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,16 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [1.2.0] - 2019-12-28
+### Added
+- Changelog
+- Travis CI
+- PyPI installation
+
+### Changed
+- File paths to enable pip installation

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..fac9ba1
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include pastehunter/YaraRules *.yar
\ No newline at end of file

diff --git a/src/pastehunter.py b/pastehunter-cli
similarity index 99%
rename from src/pastehunter.py
rename to pastehunter-cli
index 878d6b0..d1f1147 100644
--- a/src/pastehunter.py
+++ b/pastehunter-cli
@@ -17,7 +17,7 @@
 import requests
 import yara
 
-from common import parse_config
+from pastehunter.common import parse_config
 
 VERSION = 1.0
 

diff --git a/YaraRules/CryptoExchangeApi.yar b/pastehunter/YaraRules/CryptoExchangeApi.yar
similarity index 100%
rename from YaraRules/CryptoExchangeApi.yar
rename to pastehunter/YaraRules/CryptoExchangeApi.yar

diff --git a/YaraRules/api_keys.yar b/pastehunter/YaraRules/api_keys.yar
similarity index 100%
rename from YaraRules/api_keys.yar
rename to pastehunter/YaraRules/api_keys.yar

diff --git a/YaraRules/aws.yar b/pastehunter/YaraRules/aws.yar
similarity index 100%
rename from YaraRules/aws.yar
rename to pastehunter/YaraRules/aws.yar

diff --git a/YaraRules/base64.yar b/pastehunter/YaraRules/base64.yar
similarity index 100%
rename from YaraRules/base64.yar
rename to pastehunter/YaraRules/base64.yar

diff --git a/YaraRules/blacklist.yar b/pastehunter/YaraRules/blacklist.yar
similarity index 100%
rename from YaraRules/blacklist.yar
rename to pastehunter/YaraRules/blacklist.yar

diff --git a/YaraRules/certificates.yar b/pastehunter/YaraRules/certificates.yar
similarity index 100%
rename from YaraRules/certificates.yar
rename to pastehunter/YaraRules/certificates.yar

diff --git a/YaraRules/core_keywords.yar b/pastehunter/YaraRules/core_keywords.yar
similarity index 100%
rename from YaraRules/core_keywords.yar
rename to pastehunter/YaraRules/core_keywords.yar

diff --git a/YaraRules/database.yar b/pastehunter/YaraRules/database.yar
similarity index 100%
rename from YaraRules/database.yar
rename to pastehunter/YaraRules/database.yar

diff --git a/YaraRules/email_filter.yar b/pastehunter/YaraRules/email_filter.yar
similarity index 100%
rename from YaraRules/email_filter.yar
rename to pastehunter/YaraRules/email_filter.yar

diff --git a/YaraRules/general.yar b/pastehunter/YaraRules/general.yar
similarity index 100%
rename from YaraRules/general.yar
rename to pastehunter/YaraRules/general.yar

diff --git a/YaraRules/github_dorks.yar b/pastehunter/YaraRules/github_dorks.yar
similarity index 100%
rename from YaraRules/github_dorks.yar
rename to pastehunter/YaraRules/github_dorks.yar

diff --git a/YaraRules/hak5.yar b/pastehunter/YaraRules/hak5.yar
similarity index 100%
rename from YaraRules/hak5.yar
rename to pastehunter/YaraRules/hak5.yar

diff --git a/YaraRules/index.yar b/pastehunter/YaraRules/index.yar
similarity index 91%
rename from YaraRules/index.yar
rename to pastehunter/YaraRules/index.yar
index d09326e..9836632 100644
--- a/YaraRules/index.yar
+++ b/pastehunter/YaraRules/index.yar
@@ -1,14 +1,15 @@
-include "api_keys.yar"
-include "aws.yar"
 include "base64.yar"
+include "api_keys.yar"
+include "database.yar"
+include "github_dorks.yar"
 include "blacklist.yar"
 include "certificates.yar"
 include "core_keywords.yar"
-include "CryptoExchangeApi.yar"
-include "database.yar"
-include "email_filter.yar"
-include "general.yar"
-include "github_dorks.yar"
-include "hak5.yar"
+include "custom_keywords.yar"
 include "password_leak.yar"
 include "powershell.yar"
+include "email_filter.yar"
+include "hak5.yar"
+include "aws.yar"
+include "general.yar"
+include "CryptoExchangeApi.yar"

diff --git a/YaraRules/password_leak.yar b/pastehunter/YaraRules/password_leak.yar
similarity index 100%
rename from YaraRules/password_leak.yar
rename to pastehunter/YaraRules/password_leak.yar

diff --git a/YaraRules/powershell.yar b/pastehunter/YaraRules/powershell.yar
similarity index 100%
rename from YaraRules/powershell.yar
rename to pastehunter/YaraRules/powershell.yar

diff --git a/YaraRules/test_rules.yar b/pastehunter/YaraRules/test_rules.yar
similarity index 100%
rename from YaraRules/test_rules.yar
rename to pastehunter/YaraRules/test_rules.yar

diff --git a/src/__init__.py b/pastehunter/__init__.py
similarity index 100%
rename from src/__init__.py
rename to pastehunter/__init__.py

diff --git a/src/common.py b/pastehunter/common.py
similarity index 100%
rename from src/common.py
rename to pastehunter/common.py

diff --git a/src/inputs/__init__.py b/pastehunter/inputs/__init__.py
similarity index 100%
rename from src/inputs/__init__.py
rename to pastehunter/inputs/__init__.py

diff --git a/src/inputs/dumpz.py b/pastehunter/inputs/dumpz.py
similarity index 100%
rename from src/inputs/dumpz.py
rename to pastehunter/inputs/dumpz.py

diff --git a/src/inputs/gists.py b/pastehunter/inputs/gists.py
similarity index 100%
rename from src/inputs/gists.py
rename to pastehunter/inputs/gists.py

diff --git a/src/inputs/github.py b/pastehunter/inputs/github.py
similarity index 100%
rename from src/inputs/github.py
rename to pastehunter/inputs/github.py

diff --git a/src/inputs/pastebin.py b/pastehunter/inputs/pastebin.py
similarity index 100%
rename from src/inputs/pastebin.py
rename to pastehunter/inputs/pastebin.py

diff --git a/src/inputs/slexy.py b/pastehunter/inputs/slexy.py
similarity index 100%
rename from src/inputs/slexy.py
rename to pastehunter/inputs/slexy.py

diff --git a/src/inputs/stackexchange.py b/pastehunter/inputs/stackexchange.py
similarity index 100%
rename from src/inputs/stackexchange.py
rename to pastehunter/inputs/stackexchange.py

diff --git a/src/outputs/__init__.py b/pastehunter/outputs/__init__.py
similarity index 100%
rename from src/outputs/__init__.py
rename to pastehunter/outputs/__init__.py

diff --git a/src/outputs/csv_output.py b/pastehunter/outputs/csv_output.py
similarity index 96%
rename from src/outputs/csv_output.py
rename to pastehunter/outputs/csv_output.py
index d9def77..7cebe1b 100644
--- a/src/outputs/csv_output.py
+++ b/pastehunter/outputs/csv_output.py
@@ -1,7 +1,7 @@
 import logging
 import os
 import datetime
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 config = parse_config()

diff --git a/src/outputs/elastic_output.py b/pastehunter/outputs/elastic_output.py
similarity index 98%
rename from src/outputs/elastic_output.py
rename to pastehunter/outputs/elastic_output.py
index 12eccbe..66ce467 100644
--- a/src/outputs/elastic_output.py
+++ b/pastehunter/outputs/elastic_output.py
@@ -1,5 +1,5 @@
 from elasticsearch import Elasticsearch
-from common import parse_config
+from pastehunter.common import parse_config
 from datetime import datetime
 import logging
 

diff --git a/src/outputs/json_output.py b/pastehunter/outputs/json_output.py
similarity index 95%
rename from src/outputs/json_output.py
rename to pastehunter/outputs/json_output.py
index d98d279..e578a53 100644
--- a/src/outputs/json_output.py
+++ b/pastehunter/outputs/json_output.py
@@ -2,7 +2,7 @@
 import logging
 import os
 
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 

diff --git a/src/outputs/slack_output.py b/pastehunter/outputs/slack_output.py
similarity index 97%
rename from src/outputs/slack_output.py
rename to pastehunter/outputs/slack_output.py
index 30d1fcf..8e0abac 100644
--- a/src/outputs/slack_output.py
+++ b/pastehunter/outputs/slack_output.py
@@ -1,6 +1,6 @@
 import logging
 import requests
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 

diff --git a/src/outputs/smtp_output.py b/pastehunter/outputs/smtp_output.py
similarity index 99%
rename from src/outputs/smtp_output.py
rename to pastehunter/outputs/smtp_output.py
index 6090511..dd549ec 100644
--- a/src/outputs/smtp_output.py
+++ b/pastehunter/outputs/smtp_output.py
@@ -9,7 +9,7 @@
 import json
 import logging
 
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 config = parse_config()

diff --git a/src/outputs/splunk_output.py b/pastehunter/outputs/splunk_output.py
similarity index 97%
rename from src/outputs/splunk_output.py
rename to pastehunter/outputs/splunk_output.py
index d0aaede..9406c42 100644
--- a/src/outputs/splunk_output.py
+++ b/pastehunter/outputs/splunk_output.py
@@ -1,4 +1,4 @@
-from common import parse_config
+from pastehunter.common import parse_config
 import json
 import logging
 import splunklib.client as client

diff --git a/src/outputs/syslog_output.py b/pastehunter/outputs/syslog_output.py
similarity index 94%
rename from src/outputs/syslog_output.py
rename to pastehunter/outputs/syslog_output.py
index 6618eba..88df0a4 100644
--- a/src/outputs/syslog_output.py
+++ b/pastehunter/outputs/syslog_output.py
@@ -1,5 +1,5 @@
 import socket
-from common import parse_config
+from pastehunter.common import parse_config
 
 config = parse_config()
 

diff --git a/src/outputs/twilio_output.py b/pastehunter/outputs/twilio_output.py
similarity index 98%
rename from src/outputs/twilio_output.py
rename to pastehunter/outputs/twilio_output.py
index 8f6a034..c9012af 100644
--- a/src/outputs/twilio_output.py
+++ b/pastehunter/outputs/twilio_output.py
@@ -1,6 +1,6 @@
 import logging
 from twilio.rest import Client
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 config = parse_config()

diff --git a/src/postprocess/__init__.py b/pastehunter/postprocess/__init__.py
similarity index 100%
rename from src/postprocess/__init__.py
rename to pastehunter/postprocess/__init__.py

diff --git a/src/postprocess/post_b64.py b/pastehunter/postprocess/post_b64.py
similarity index 98%
rename from src/postprocess/post_b64.py
rename to pastehunter/postprocess/post_b64.py
index 8930164..7a45941 100644
--- a/src/postprocess/post_b64.py
+++ b/pastehunter/postprocess/post_b64.py
@@ -4,7 +4,7 @@
 import logging
 from base64 import b64decode
 # This gets the raw paste and the paste_data json object
-from common import parse_config
+from pastehunter.common import parse_config
 
 conf = parse_config()
 logger = logging.getLogger('pastehunter')

diff --git a/src/postprocess/post_compress.py b/pastehunter/postprocess/post_compress.py
similarity index 95%
rename from src/postprocess/post_compress.py
rename to pastehunter/postprocess/post_compress.py
index 645ac5e..d47f9fb 100644
--- a/src/postprocess/post_compress.py
+++ b/pastehunter/postprocess/post_compress.py
@@ -1,7 +1,7 @@
 import lzma
 import base64
 import logging
-from common import parse_config
+from pastehunter.common import parse_config
 
 logger = logging.getLogger('pastehunter')
 config = parse_config()

diff --git a/src/postprocess/post_email.py b/pastehunter/postprocess/post_email.py
similarity index 100%
rename from src/postprocess/post_email.py
rename to pastehunter/postprocess/post_email.py

diff --git a/src/postprocess/post_entropy.py b/pastehunter/postprocess/post_entropy.py
similarity index 100%
rename from src/postprocess/post_entropy.py
rename to pastehunter/postprocess/post_entropy.py

diff --git a/src/sandboxes/__init__.py b/pastehunter/sandboxes/__init__.py
similarity index 100%
rename from src/sandboxes/__init__.py
rename to pastehunter/sandboxes/__init__.py

diff --git a/src/sandboxes/cuckoo.py b/pastehunter/sandboxes/cuckoo.py
similarity index 96%
rename from src/sandboxes/cuckoo.py
rename to pastehunter/sandboxes/cuckoo.py
index 7685646..fda3dbd 100644
--- a/src/sandboxes/cuckoo.py
+++ b/pastehunter/sandboxes/cuckoo.py
@@ -1,7 +1,7 @@
 import io
 import logging
 import requests
-from common import parse_config
+from pastehunter.common import parse_config
 
 conf = parse_config()
 logger = logging.getLogger('pastehunter')

diff --git a/src/sandboxes/viper.py b/pastehunter/sandboxes/viper.py
similarity index 93%
rename from src/sandboxes/viper.py
rename to pastehunter/sandboxes/viper.py
index b7f085d..f77fb2d 100644
--- a/src/sandboxes/viper.py
+++ b/pastehunter/sandboxes/viper.py
@@ -1,7 +1,7 @@
 import io
 import logging
 import requests
-from common import parse_config
+from pastehunter.common import parse_config
 
 conf = parse_config()
 logger = logging.getLogger('pastehunter')

diff --git a/settings.json.sample b/settings.json.sample
index 938293f..1a6f3f3 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -2,7 +2,7 @@
     "inputs": {
         "pastebin":{
             "enabled": true,
-            "module": "inputs.pastebin",
+            "module": "pastehunter.inputs.pastebin",
             "api_scrape": "https://scrape.pastebin.com/api_scraping.php",
             "api_raw": "https://scrape.pastebin.com/api_scrape_item.php?i=",
             "paste_limit": 200,
@@ -11,7 +11,7 @@
         "dumpz": {
             "enabled": false,
             "comment": "This api endpoint has been removed.",
-            "module": "inputs.dumpz",
+            "module": "pastehunter.inputs.dumpz",
             "api_scrape": "https://dumpz.org/api/recent",
             "api_raw": "https://dumpz.org/api/dump",
             "paste_limit": 200,
@@ -19,7 +19,7 @@
         },
         "gists": {
             "enabled": true,
-            "module": "inputs.gists",
+            "module": "pastehunter.inputs.gists",
             "api_token": "",
             "api_limit": 100,
             "store_all": false,
@@ -28,7 +28,7 @@
         },
         "github": {
             "enabled": false,
-            "module": "inputs.github",
+            "module": "pastehunter.inputs.github",
             "api_token": "",
             "api_limit": 100,
             "store_all": false,
@@ -38,7 +38,7 @@
         },
         "slexy":{
             "enabled": true,
-            "module": "inputs.slexy",
+            "module": "pastehunter.inputs.slexy",
             "store_all": false,
             "api_scrape": "http://slexy.org/recent",
             "api_raw": "http://slexy.org/raw",
@@ -46,7 +46,7 @@
         },
         "stackexchange":{
             "enabled": false,
-            "module": "inputs.stackexchange",
+            "module": "pastehunter.inputs.stackexchange",
             "site_list": ["stackoverflow","serverfault", "superuser", "webapps", "webmasters", "dba"],
             "api_key": "",
             "store_filter": "!)r_ttsG0v3bE1vo3*8Ki",
@@ -58,7 +58,7 @@
     "outputs": {
         "elastic_output": {
             "enabled": true,
-            "module": "outputs.elastic_output",
+            "module": "pastehunter.outputs.elastic_output",
             "classname": "ElasticOutput",
             "elastic_index": "paste-test",
             "elastic_host": "172.16.10.10",
@@ -70,7 +70,7 @@
         },
         "splunk_output": {
             "enabled": false,
-            "module": "outputs.splunk_output",
+            "module": "pastehunter.outputs.splunk_output",
             "classname": "SplunkOutput",
             "splunk_host": "host",
             "splunk_port": 8089,
@@ -82,7 +82,7 @@
         },
         "json_output": {
             "enabled": false,
-            "module": "outputs.json_output",
+            "module": "pastehunter.outputs.json_output",
             "classname": "JsonOutput",
             "output_path": "logs/json/",
             "store_raw": true,
@@ -90,20 +90,20 @@
         },
         "csv_output": {
             "enabled": false,
-            "module": "outputs.csv_output",
+            "module": "pastehunter.outputs.csv_output",
             "classname": "CSVOutput",
             "output_path": "logs/csv/"
         },
         "syslog_output": {
             "enabled": false,
-            "module": "outputs.syslog_output",
+            "module": "pastehunter.outputs.syslog_output",
             "classname": "SyslogOutput",
             "host": "192.168.1.1",
             "port": 514
         },
         "smtp_output": {
             "enabled": false,
-            "module": "outputs.smtp_output",
+            "module": "pastehunter.outputs.smtp_output",
             "classname": "SMTPOutput",
             "smtp_host": "smtp.server.com",
             "smtp_port": 25,
@@ -125,14 +125,14 @@
         },
         "slack_output": {
             "enabled": false,
-            "module": "outputs.slack_output",
+            "module": "pastehunter.outputs.slack_output",
             "classname": "SlackOutput",
             "webhook_url": "",
             "rule_list": ["custom_keywords"]
        },
        "twilio_output": {
            "enabled": false,
-            "module": "outputs.twilio_output",
+            "module": "pastehunter.outputs.twilio_output",
            "classname": "TwilioOutput",
            "account_sid": "",
            "auth_token": "",
@@ -142,7 +142,7 @@
         }
     },
     "yara": {
-        "rule_path": "YaraRules",
+        "rule_path": "pastehunter/YaraRules",
         "blacklist": true,
         "test_rules": false
     },
@@ -160,13 +160,13 @@
     "sandboxes": {
         "cuckoo": {
             "enabled": false,
-            "module": "sandboxes.cuckoo",
+            "module": "pastehunter.sandboxes.cuckoo",
             "api_host": "127.0.0.1",
             "api_port": 8080
         },
         "viper": {
             "enabled": false,
-            "module": "sandboxes.viper",
+            "module": "pastehunter.sandboxes.viper",
             "api_host": "127.0.0.1",
             "api_port": 8080
         }
@@ -174,22 +174,22 @@
     "post_process": {
         "post_email": {
             "enabled": true,
-            "module": "postprocess.post_email",
+            "module": "pastehunter.postprocess.post_email",
             "rule_list": ["email_list"]
         },
         "post_b64": {
             "enabled": true,
-            "module": "postprocess.post_b64",
+            "module": "pastehunter.postprocess.post_b64",
             "rule_list": ["b64_exe", "b64_rar", "b64_zip", "b64_gzip"]
         },
         "post_entropy": {
             "enabled": false,
-            "module": "postprocess.post_entropy",
+            "module": "pastehunter.postprocess.post_entropy",
             "rule_list": ["ALL"]
         },
         "post_compress": {
             "enabled": false,
-            "module": "postprocess.post_compress",
+            "module": "pastehunter.postprocess.post_compress",
             "rule_list": ["ALL"]
         }
     }

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..96fb483
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+from setuptools import setup, find_packages
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setup(
+    name='pastehunter',
+    version='1.2.0',
+    author='@kevthehermit @Plazmaz',
+    author_email='info@pastehunter.com',
+    description="Pastehunter",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url='https://pastehunter.com',
+    license='GNU V3',
+    zip_safe=False,
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=[
+        'yara-python',
+        'requests',
+        'elasticsearch',
+        'splunk-sdk'
+    ],
+    scripts=['pastehunter-cli'],
+    package_data={'': ['*.yar', 'README.md', 'LICENSE']}
+)
\ No newline at end of file
["b64_exe", "b64_rar", "b64_zip", "b64_gzip"] }, "post_entropy": { "enabled": false, - "module": "postprocess.post_entropy", + "module": "pastehunter.postprocess.post_entropy", "rule_list": ["ALL"] }, "post_compress": { "enabled": false, - "module": "postprocess.post_compress", + "module": "pastehunter.postprocess.post_compress", "rule_list": ["ALL"] } } diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..96fb483 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +from setuptools import setup, find_packages + +with open("README.md", "r") as fh: + long_description = fh.read() + +setup( + name='pastehunter', + version='1.2.0', + author='@kevthehermit @Plazmaz', + author_email='info@pastehunter.com', + description="Pastehunter", + long_description=long_description, + long_description_content_type="text/markdown", + url='https://pastehunter.com', + license='GNU V3', + zip_safe=False, + packages=find_packages(), + include_package_data=True, + install_requires=[ + 'yara-python', + 'requests', + 'elasticsearch', + 'splunk-sdk' + ], + scripts=['pastehunter-cli'], + package_data={'': ['*.yar', 'README.md, LICENSE']} +) \ No newline at end of file From 4bddb714f6d0e6141192698ca8db25f64f672715 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sat, 28 Dec 2019 20:35:49 +0000 Subject: [PATCH 174/178] Add badges to readme --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9b696e3..d84601b 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ by an organisation or a researcher. For setup instructions please see the official documentation https://pastehunter.readthedocs.io/en/latest/installation.html +[![PyPI version](https://badge.fury.io/py/pastehunter.svg)](https://badge.fury.io/py/pastehunter) + +[![Build Status](https://travis-ci.org/kevthehermit/PasteHunter.svg?branch=master)](https://travis-ci.org/kevthehermit/PasteHunter) + + ## Supported Inputs Pastehunter currently has support for the following sites: - pastebin.com From efe5fed78dd7266ec8cee35797492f117aad2eb1 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 29 Dec 2019 01:53:56 +0000 Subject: [PATCH 175/178] Add user config file --- pastehunter/common.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pastehunter/common.py b/pastehunter/common.py index 1ea162b..54fd4a3 100644 --- a/pastehunter/common.py +++ b/pastehunter/common.py @@ -1,16 +1,28 @@ import json import logging +import os.path logger = logging.getLogger('pastehunter') +home = os.path.expanduser("~") # Parse the config file in to a dict def parse_config(): - conf_file = 'settings.json' conf = None - try: - with open(conf_file, 'r') as read_conf: - conf = json.load(read_conf) - except Exception as e: - logger.error("Unable to parse config file: {0}".format(e)) + settings_file = os.path.join(home, ".config", "pastehunter.json") + + if os.path.exists(settings_file): + conf_file = settings_file + else: + #ToDo: Copy base settings to the settings file + conf_file = None + + if conf_file: + try: + with open(conf_file, 'r') as read_conf: + conf = json.load(read_conf) + except Exception as e: + logger.error("Unable to parse config file: {0}".format(e)) + else: + logger.error("Unable to read config file '~/.config/pastehunter.json'") return conf From e80bf6f80589bb74603c6899ca5fcc287ce32c29 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 29 Dec 2019 02:02:56 +0000 Subject: [PATCH 176/178] Refactor Yara Rules --- .gitignore | 3 +- pastehunter-cli | 75 
From e80bf6f80589bb74603c6899ca5fcc287ce32c29 Mon Sep 17 00:00:00 2001
From: KevTheHermit
Date: Sun, 29 Dec 2019 02:02:56 +0000
Subject: [PATCH 176/178] Refactor Yara Rules

---
 .gitignore           |  3 +-
 pastehunter-cli      | 75 ++++++++++++++++++++++++++++----------------
 settings.json.sample |  4 ++-
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index e12df7d..e7dab63 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,4 +108,5 @@
 ENV/
 .vscode/
 logs/
-.c9
\ No newline at end of file
+.c9
+pastehunter/YaraRules/custom_keywords.yar

diff --git a/pastehunter-cli b/pastehunter-cli
index d1f1147..8cab6e2 100644
--- a/pastehunter-cli
+++ b/pastehunter-cli
@@ -16,7 +16,7 @@ from urllib.parse import unquote_plus
 
 import requests
 import yara
-
+import pastehunter
 from pastehunter.common import parse_config
 
 VERSION = 1.0
@@ -124,23 +124,32 @@
     outputs.append(instance)
 
 
-def yara_index(rule_path, blacklist, test_rules):
-    index_file = os.path.join(rule_path, 'index.yar')
-    with open(index_file, 'w') as yar:
-        for filename in os.listdir(rule_path):
-            if filename.endswith('.yar') and filename != 'index.yar':
-                if filename == 'blacklist.yar':
-                    if blacklist:
-                        logger.info("Enable Blacklist Rules")
-                    else:
-                        continue
-                if filename == 'test_rules.yar':
-                    if test_rules:
-                        logger.info("Enable Test Rules")
-                    else:
-                        continue
-                include = 'include "{0}"\n'.format(filename)
-                yar.write(include)
+def yara_index(default_rules, custom_rules, exclude_rules, blacklist, test_rules):
+    rules_list = {}
+    counter = 0
+    if default_rules:
+        for filename in os.listdir(default_rules):
+            if filename in exclude_rules:
+                continue
+            if filename == 'blacklist.yar':
+                if blacklist:
+                    logger.info("Enable Blacklist Rules")
+                else:
+                    continue
+            if filename == 'test_rules.yar':
+                if test_rules:
+                    logger.info("Enable Test Rules")
+                else:
+                    continue
+            rules_list['namespace{0}'.format(counter)] = os.path.join(default_rules, filename)
+            logger.info("Adding rules from {0}".format(filename))
+            counter += 1
+    if custom_rules:
+        for filename in os.listdir(custom_rules):
+            rules_list['namespace{0}'.format(counter)] = os.path.join(custom_rules, filename)
+            logger.info("Adding custom rules from {0}".format(filename))
+            counter += 1
+    return rules_list
 
 
 def paste_scanner(paste_data, rules_buff):
@@ -306,21 +315,33 @@ def main():
     logger.info("Compile Yara Rules")
     try:
-        # Update the yara rules index
-        yara_index(conf['yara']['rule_path'],
-                   conf['yara']['blacklist'],
-                   conf['yara']['test_rules'])
-
-        # Compile the yara rules we will use to match pastes
-        index_file = os.path.join(conf['yara']['rule_path'], 'index.yar')
-        rules = yara.compile(index_file, externals={'filename': ''})
+        if conf['yara']['default_rules']:
+            pastehunter_path = pastehunter.__path__[0]
+            default_rules = os.path.join(pastehunter_path, "YaraRules")
+        else:
+            default_rules = False
+
+        if conf["yara"]["custom_rules"] != "none":
+            custom_rules = conf["yara"]["custom_rules"]
+        else:
+            custom_rules = False
+
+        rule_files = yara_index(
+            default_rules,
+            custom_rules,
+            conf['yara']['exclude_rules'],
+            conf['yara']['blacklist'],
+            conf['yara']['test_rules']
+        )
+
+        rules = yara.compile(filepaths=rule_files, externals={'filename': ''})
 
         # Used for sharing across processes
         rules_buff = BytesIO()
         rules.save(file=rules_buff)
 
     except Exception as e:
-        logger.exception("Unable to Create Yara index: ", e)
+        logger.exception("Unable to Create Yara index: {0}".format(e))
         sys.exit()
 
     # Create Queue to hold paste URI's

diff --git a/settings.json.sample b/settings.json.sample
index 1a6f3f3..138878d 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -142,7 +142,9 @@
         }
     },
     "yara": {
-        "rule_path": "pastehunter/YaraRules",
+        "default_rules": true,
+        "custom_rules": "none",
+        "exclude_rules": [],
         "blacklist": true,
         "test_rules": false
     },
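The refactor above replaces the generated index.yar include file with a filepaths dictionary, so each rule file compiles into its own Yara namespace and duplicate rule names across files no longer collide. A standalone sketch of that compile path, using two of the bundled rule files (the relative paths assume a source checkout):

# Sketch: compile rule files into separate namespaces, the way the refactored
# pastehunter-cli now does, then match some sample data. Paths assume a
# source checkout of the repository.
import yara

rule_files = {
    'namespace0': 'pastehunter/YaraRules/base64.yar',
    'namespace1': 'pastehunter/YaraRules/database.yar',
}
rules = yara.compile(filepaths=rule_files, externals={'filename': ''})

sample = ("CREATE TABLE users (id int NOT NULL, name varchar(255), PRIMARY KEY (id)) "
          "ENGINE=InnoDB DEFAULT CHARSET=utf8; INSERT INTO users VALUES (1, 'a');")
matches = rules.match(data=sample, externals={'filename': ''})
print([m.rule for m in matches])  # expect db_structure from database.yar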
"yara": { - "rule_path": "pastehunter/YaraRules", + "default_rules": true, + "custom_rules": "none", + "exclude_rules": [], "blacklist": true, "test_rules": false }, From abed77688d0ee25f231f22eacebbbe6fb7bf243d Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 29 Dec 2019 02:09:47 +0000 Subject: [PATCH 177/178] bump version and changelog --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f62573..41e3ab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.2.1] - 2019-12-29 +### Changed +- move config file to ~/.config +- move custom yara rules +- refactor yara rules location + ## [1.2.0] - 2019-12-28 ### Added - Changelog diff --git a/setup.py b/setup.py index 96fb483..84e2751 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='pastehunter', - version='1.2.0', + version='1.2.1', author='@kevthehermit @Plazmaz', author_email='info@pastehunter.com', description="Pastehunter", From 1fe59a724bd296103d9d09b2776a94d9fc92ab76 Mon Sep 17 00:00:00 2001 From: KevTheHermit Date: Sun, 29 Dec 2019 12:01:08 +0000 Subject: [PATCH 178/178] remove old index.yar --- pastehunter/YaraRules/index.yar | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 pastehunter/YaraRules/index.yar diff --git a/pastehunter/YaraRules/index.yar b/pastehunter/YaraRules/index.yar deleted file mode 100644 index 9836632..0000000 --- a/pastehunter/YaraRules/index.yar +++ /dev/null @@ -1,15 +0,0 @@ -include "base64.yar" -include "api_keys.yar" -include "database.yar" -include "github_dorks.yar" -include "blacklist.yar" -include "certificates.yar" -include "core_keywords.yar" -include "custom_keywords.yar" -include "password_leak.yar" -include "powershell.yar" -include "email_filter.yar" -include "hak5.yar" -include "aws.yar" -include "general.yar" -include "CryptoExchangeApi.yar"