crawl

#!/usr/bin/env python

import sys
import re
from crawler import Crawler

# URL validation pattern, taken from
# http://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
# The pattern is anchored at both ends and compiled case-insensitively, so the
# upper-case character classes below also accept lower-case input.
url_regex = re.compile(
    ''.join([
        # scheme: http, https, ftp or ftps
        r'^(?:http|ftp)s?://',
        # host, alternative 1: dot-separated labels ending in a TLD-like label
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',
        # host, alternative 2: the literal name "localhost"
        r'localhost|',
        # host, alternative 3: four dot-separated groups of 1-3 digits
        # (IPv4-shaped; the numeric range of each group is not checked)
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        # optional ":port"
        r'(?::\d+)?',
        # optional trailing "/", or "/" or "?" followed by non-whitespace
        r'(?:/?|[/?]\S+)$',
    ]),
    flags=re.IGNORECASE)

def usage():
  """Print the command-line usage message and terminate the process.

  This is only called when the URL argument is missing or malformed, so the
  message goes to stderr (stdout is reserved for the JSON result) and the
  process exits with status 2 to signal a usage error to the caller.

  Raises:
    SystemExit: always, with exit code 2.
  """
  sys.stderr.write("usage:\n./crawl <url>\n")
  # sys.exit rather than the bare `exit` helper: the latter is injected by the
  # `site` module for interactive use and is not guaranteed to exist.
  sys.exit(2)

def crawl(url):
  """Run a crawl starting at `url` and return the collected assets.

  Builds a Crawler for `url`, invokes its crawl() method, and returns
  whatever its assets_json() method produces (presumably a JSON string;
  confirm against the Crawler implementation).
  """
  site_crawler = Crawler(url)
  site_crawler.crawl()
  result = site_crawler.assets_json()
  return result

if __name__ == "__main__":
  # Script entry point: expects the start URL as the first command-line
  # argument. A missing or malformed URL shows the usage text; otherwise the
  # crawl result is printed to stdout.
  cli_args = sys.argv[1:]
  if cli_args and url_regex.match(cli_args[0]) is not None:
    url = cli_args[0]
    assets_json = crawl(url)
    print(assets_json)
  else:
    usage()