-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl
executable file
·31 lines (26 loc) · 880 Bytes
/
crawl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/env python
import sys
import re
from crawler import Crawler
# regex obtained from http://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
# Validator for the URL given on the command line. The pattern is assembled
# from ordered fragments; matching is case-insensitive, so the upper-case
# character classes also accept lower-case input.
url_regex = re.compile(
    ''.join((
        # scheme: http, https, ftp or ftps, followed by "://"
        r'^(?:http|ftp)s?://',
        # host, first alternative: dot-separated labels ending in a TLD-like part
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',
        # host, second alternative: the literal name "localhost"
        r'localhost|',
        # host, third alternative: four dot-separated groups of 1-3 digits
        # (the groups are not range-checked)
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        # optional ":port"
        r'(?::\d+)?',
        # optional "/" or a "/"- or "?"-led tail without whitespace, to the end
        r'(?:/?|[/?]\S+)$',
    )),
    flags=re.IGNORECASE,
)
def usage():
    """Print the command-line usage message and terminate the process.

    This is invoked when the script receives no URL argument or one that
    fails validation, so the process ends with exit status 2 to let calling
    shells and scripts detect the failed invocation.

    Raises:
        SystemExit: always, with code 2.
    """
    # A parenthesised single-argument print emits the same text whether
    # ``print`` is a statement or a function.
    print("usage:\n./crawl <url>")
    # Use sys.exit rather than the bare ``exit`` name: the latter is only
    # installed by the ``site`` module as an interactive convenience and may
    # be absent (e.g. ``python -S``).
    sys.exit(2)
def crawl(url):
    """Crawl ``url`` and return the crawler's asset report.

    Builds a ``Crawler`` for ``url``, runs its ``crawl()`` method, and then
    returns whatever its ``assets_json()`` method produces (presumably a
    JSON-formatted string -- confirm against the ``crawler`` module).
    """
    site_crawler = Crawler(url)
    site_crawler.crawl()
    report = site_crawler.assets_json()
    return report
if __name__ == "__main__":
    # Script entry point: the URL to crawl is expected as the first argument.
    cli_args = sys.argv
    # The length check comes first so the index access is skipped when no
    # argument was supplied.
    url_ok = len(cli_args) >= 2 and url_regex.match(cli_args[1]) is not None
    if url_ok:
        # Crawl the validated URL and write the result to standard output.
        print(crawl(cli_args[1]))
    else:
        # Missing or malformed URL: show the usage message and exit.
        usage()