featurize.py
#!/usr/bin/env python
#
# Tokenizes product nut features.
#
#
# Either import in Python, or pipe a jsonlines file with product nuts, like
#
# cat data/product_nuts.jsonl | python featurize.py >data/product_nut_features.jsonl
#
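# When imported, tokenize() and tokenize_dict() are the entry points. An
# illustrative sketch (the field values below are made up):
#
#   from featurize import tokenize
#   tokenize({'name': 'Volle Melk', 'brand_name': 'Jumbo'})
#   # -> ['volle', 'melk', 'BRN:Jumbo']
#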
import json
import re
import sys
from unidecode import unidecode

STOPWORDS = '''
het de deze
en of om te hier nog ook al
in van voor mee per als tot uit bij
waar waardoor waarvan wanneer
je uw ze zelf jezelf
ca bijv bijvoorbeeld
is bevat hebben kunnen mogen zullen willen
gemaakt aanbevolen
belangrijk belangrijke heerlijk heerlijke handig handige dagelijks dagelijkse
gebruik allergieinformatie bijdrage smaak hoeveelheid
'''.split()

def clean(s):
    if s is None: return None
    # @todo keep '0.50%' and the like (or extract separately) - relevant for alcohol-free
    s = unidecode(s).strip()
    s = re.sub(r'[^A-Za-z0-9\'\s]', '', s, flags=re.MULTILINE)
    s = re.sub(r'\s+', ' ', s, flags=re.MULTILINE)
    return s
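
# Illustrative example (made-up input): clean('Crème fraîche 30%') returns
# 'Creme fraiche 30' - accents are transliterated, punctuation is dropped and
# whitespace is collapsed.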

def get_brand_name(j):
    '''Return brand name from brand_name or brand_url'''
    s = j.get('brand_name', '').strip()
    if s == '':
        s = j.get('brand_url', '').strip()
        s = re.sub(r'(\Ahttps?://(www\.)?|\Awww\.|\.\w{2,3}\/?\Z)', '', s, flags=re.MULTILINE|re.IGNORECASE)
    return s
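
# Illustrative example (made-up input): {'brand_name': '', 'brand_url': 'https://www.example.nl/'}
# yields 'example' - the scheme, 'www.' and the trailing TLD are stripped from the URL.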

def f_name(j):
    f = clean(j.get('name', '').lower())
    # strip brand from front of name, would be twice featurized
    brand_name_clean = clean(get_brand_name(j).lower())
    if brand_name_clean != '' and f.startswith(brand_name_clean):
        f = f[len(brand_name_clean):].strip()
    if f == '': return []
    return f.split()

def f_brand(j):
    f = clean(get_brand_name(j))
    if f == '': return []
    return ['BRN:' + f]

def f_first_ingredient(j):
    if 'ingredients' not in j or len(j['ingredients']) == 0: return []
    f = j['ingredients'][0].strip().lower()
    # we're more interested in whether the ingredient is composed than in its exact content
    if re.search(r'[({:;,\n]', f, flags=re.MULTILINE):
        f = '(COMPOSED)'
    f = clean(f)
    if f == '': return []
    return ['ING:' + f]
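
# Illustrative examples (made-up input): {'ingredients': ['suiker (glucosestroop, fructose)']}
# yields ['ING:COMPOSED'], while {'ingredients': ['melk']} yields ['ING:melk'].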

def f_description(j):
    f = clean(j.get('description', '')).lower()
    if f == '': return []
    return ['DSC:' + s for s in f.split()]

def tokenize(j, parts=['name', 'brand', 'first_ingredient']):
    '''Returns array of tokens for product nut dict'''
    tokens = []
    if 'name' in parts: tokens.extend(f_name(j))
    if 'brand' in parts: tokens.extend(f_brand(j))
    if 'first_ingredient' in parts: tokens.extend(f_first_ingredient(j))
    if 'description' in parts: tokens.extend(f_description(j))
    tokens = filter(lambda s: s.split(':', 1)[-1] not in STOPWORDS, tokens)
    tokens = filter(lambda s: len(s) > 1, tokens)
    # materialize to a list so the result is JSON-serializable (filter is lazy in Python 3)
    return list(tokens)
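
# Illustrative example with a made-up product nut:
#   tokenize({'name': 'Jumbo Volle Melk', 'brand_name': 'Jumbo',
#             'ingredients': ['volle melk']})
#   # -> ['volle', 'melk', 'BRN:Jumbo', 'ING:volle melk']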

def tokenize_dict(j, parts=['name', 'brand', 'first_ingredient']):
    '''Returns a dict with id, tokens and optional usage and product_id'''
    d = {'id': j['id'], 'tokens': tokenize(j, parts)}
    if 'usage' in j: d['usage'] = j['usage']
    if 'product_id' in j: d['product_id'] = j['product_id']
    return d
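
# Illustrative output line for the made-up product above, assuming 'id': 1:
#   {"id": 1, "tokens": ["volle", "melk", "BRN:Jumbo", "ING:volle melk"]}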

if __name__ == '__main__':
    for line in map(str.rstrip, sys.stdin):
        j = json.loads(line)
        d = tokenize_dict(j)
        print(json.dumps(d))