Skip to content

Commit

Permalink
Merge branch 'master' into 228
Browse files Browse the repository at this point in the history
  • Loading branch information
monperrus committed Apr 4, 2024
2 parents 1cee036 + a9b2d45 commit d1085dd
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 12 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci-validation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# CI validation: runs the Python and PHP checks for crawler-user-agents.

name: CI validation

on:
  # Trigger on pushes to every branch, see
  # https://stackoverflow.com/questions/64635032/github-actions-run-on-push-to-all-branches
  push:
    branches: ['**']
  # ...and on pull requests that target master.
  pull_request:
    branches: [master]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      # Python packages needed by the pytest run and by validate.py.
      - run: pip3 install jsonschema pytest
      - run: py.test -vv
      - run: python3 validate.py
      - run: php validate.php
36 changes: 36 additions & 0 deletions .github/workflows/npm-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Continuous Delivery to https://www.npmjs.com/package/crawler-user-agents
# for each commit, with provenance

name: Deploy to NPM

on:
  # push:
  #   branches:
  #     - master
  workflow_run:
    workflows: [CI validation]
    branches: [master]
    types:
      - completed

jobs:
  publish-npm:
    # `workflow_run` with type `completed` fires whatever the outcome of the
    # triggering run, so only publish when "CI validation" actually succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    permissions:
      # Declaring `permissions` sets every unlisted scope to none; grant
      # read access explicitly so actions/checkout can fetch the repository.
      contents: read
      id-token: write # To attach provenance to the published package
    environment:
      name: npm_token
      url: https://www.npmjs.com/package/crawler-user-agents
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          registry-url: https://registry.npmjs.org/
      - run: npm install [email protected]
      # Discard the package.json / lockfile changes made by the install above.
      - run: git checkout .
      # Set the local version to the one currently published on npm, then
      # bump the patch number so every publish gets a fresh version.
      - run: npm version --no-git-tag-version `node -e 'pacote=require("pacote");pacote.manifest("crawler-user-agents").then(pkgJson => { console.log(pkgJson.version); });'`
      - run: npm version --no-git-tag-version patch
      - run: npm publish --provenance
        env:
          NODE_AUTH_TOKEN: ${{secrets.NPM_TOKEN}}
10 changes: 0 additions & 10 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,3 @@ script:
- git checkout .
- npm version `node -e 'pacote=require("pacote");pacote.manifest("crawler-user-agents").then(pkgJson => { console.log(pkgJson.version); });'`
- npm version patch

deploy:
provider: npm
email: [email protected]
skip_cleanup: true
api_key:
secure: DAS1IgbrU085kpY4acrByrU2oWQq2HbP8aNye/Wb7nJIjT2Hv+DrbGoZ38Qiz8szHGmOy0DwtwJGPL6dhM7cC+2PBn5Pu6GzeORCSh+KlrRFIVI09A9BRc6TGNkJIr7ddP0H69U+OcdL53noBeMK4q2nxM6neCZM4Aa45r1jptI=
on:
repo: monperrus/crawler-user-agents
branch: master
71 changes: 69 additions & 2 deletions crawler-user-agents.json
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@
,
{
"pattern": "httpx",
"addition_date":" 2019/12/23",
"addition_date": "2019/12/23",
"instances": [
"python-httpx/0.16.1",
"python-httpx/0.13.0.dev1"
Expand Down Expand Up @@ -265,7 +265,7 @@
,
{
"pattern": "phpcrawl",
"addition_date": "2012-09/17",
"addition_date": "2012/09/17",
"url": "http://phpcrawl.cuab.de/",
"instances": [
"phpcrawl"
Expand Down Expand Up @@ -5223,5 +5223,72 @@
"WordPress/X.X.X; https://example.com"
],
"url": "https://wordpress.org"
},
{
"pattern": "PhxBot",
"addition_date": "2024/01/06",
"instances": [
"PhxBot/0.1 ([email protected])"
]
},
{
"pattern": "ImagesiftBot",
"addition_date": "2024/01/06",
"instances": [
"Mozilla/5.0 (compatible; ImagesiftBot; +imagesift.com)"
],
"url": "https://imagesift.com"
},
{
"pattern": "Expanse",
"addition_date": "2024/02/01",
"instances": [
"Expanse, a Palo Alto Networks company, searches across the global IPv4 space multiple times per day to identify customers' presences on the Internet. If you would like to be excluded from our scans, please send IP addresses/domains to: [email protected]"
],
"url": "https://www.paloaltonetworks.com/cortex/cortex-xpanse"
},
{
"pattern": "InternetMeasurement",
"addition_date": "2024/02/01",
"instances": [
"Mozilla/5.0 (compatible; InternetMeasurement/1.0; +https://internet-measurement.com/)"
],
"url": "https://internet-measurement.com"
},
{
"pattern": "^BW\\/",
"addition_date": "2024/02/08",
"instances": [
"BW/1.1; bit.ly/3eZNDnO",
"BW/1.1; rb.gy/oupwis"
],
"url": "https://builtwith.com/biup"
},
{
"pattern": "GeedoBot",
"addition_date": "2024/02/11",
"instances": [
"Mozilla/5.0 (compatible; GeedoBot; +http://www.geedo.com/bot.html)"
],
"url": "http://www.geedo.com"
},
{
"pattern": "Audisto Crawler",
"addition_date": "2024/03/14",
"instances": [
"Audisto Crawler (mobile; +https://audisto.com/bot)",
"Audisto Crawler (desktop; +https://audisto.com/bot)",
"Audisto Crawler (mobile; essential; +https://audisto.com/bot)",
"Audisto Crawler (desktop; essential; +https://audisto.com/bot)"
],
"url": "https://audisto.com/help/crawler/bot/"
},
{
"pattern": "PerplexityBot\\/",
"addition_date": "2024/03/14",
"instances": [
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)"
],
"url": "https://docs.perplexity.ai/docs/perplexitybot"
}
]
8 changes: 8 additions & 0 deletions validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import re
from collections import Counter
import datetime

from jsonschema import validate

Expand Down Expand Up @@ -65,6 +66,13 @@ def main():
for entry in json_data:
pattern = entry['pattern']

# assert that field "addition_date" has format "2019/12/23",
if 'addition_date' in entry:
if not re.match(r'\d{4}/\d{2}/\d{2}', entry['addition_date']):
raise ValueError('addition_date {!r} has invalid format'.format(entry['addition_date']))
# parse the date with datetime
datetime.datetime.strptime(entry['addition_date'], '%Y/%m/%d')

# canonicalize entry
if 'depends_on' not in entry: entry['depends_on'] = []

Expand Down

0 comments on commit d1085dd

Please sign in to comment.