From d5330468d1664f009a6ea279696850b9bd1decfc Mon Sep 17 00:00:00 2001 From: Michail Alexakis Date: Sun, 7 Jun 2015 15:24:36 -0400 Subject: [PATCH] Rewrite load/normalize/lookup logic for INSPIRE vocabularies This should resolve #138. It also addresses #137 but tests have to be added. --- .../lib/metadata/types/_common.py | 11 ++++++ .../lib/metadata/types/inspire_metadata.py | 4 +- .../lib/metadata/types/thesaurus.py | 39 ++++++++++++------- .../lib/metadata/vocabularies/__init__.py | 8 ++-- .../lib/metadata/vocabularies/json_loader.py | 26 +++++++------ 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/ckanext/publicamundi/lib/metadata/types/_common.py b/ckanext/publicamundi/lib/metadata/types/_common.py index 68e06b9..8746eee 100644 --- a/ckanext/publicamundi/lib/metadata/types/_common.py +++ b/ckanext/publicamundi/lib/metadata/types/_common.py @@ -68,6 +68,17 @@ class FreeKeyword(Object): reference_date = None date_type = None + @classmethod + def normalize_keyword(cls, s): + from inflection import dasherize, underscore + return dasherize(underscore(unicode(s))) + + def __init__(self, **kwargs): + value = kwargs.get('value') + if value: + kwargs['value'] = self.normalize_keyword(value) + super(FreeKeyword, self).__init__(**kwargs) + @object_null_adapter() class GeographicBoundingBox(Object): diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py index 808fe98..575052e 100644 --- a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py +++ b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py @@ -192,10 +192,9 @@ def to_responsible_party(alist): thes_version = None else: thes_version = re.sub(r'^[ ]*version[ ]+(\d\.\d)$', r'\1', thes_version) - thes_name = 'keywords-' + vocabularies.munge(thes_title) # Note thes_version can be used to enforce a specific thesaurus version try: - thes = Thesaurus.lookup(name=thes_name) + thes = Thesaurus.lookup(title=thes_title, for_keywords=True) except ValueError: thes = None # Treat present keywords depending on if they belong to a thesaurus @@ -214,7 +213,6 @@ def to_responsible_party(alist): vocab_date = to_date(it['thesaurus']['date']) vocab_datetype = it['thesaurus']['datetype'] for keyword in it['keywords']: - # Todo Maybe convert keyword to a canonical form (e.g. munge) free_keywords.append(FreeKeyword( value = keyword, reference_date = vocab_date, diff --git a/ckanext/publicamundi/lib/metadata/types/thesaurus.py b/ckanext/publicamundi/lib/metadata/types/thesaurus.py index 6fe7fca..eeef5ed 100644 --- a/ckanext/publicamundi/lib/metadata/types/thesaurus.py +++ b/ckanext/publicamundi/lib/metadata/types/thesaurus.py @@ -22,31 +22,40 @@ class Thesaurus(Object): @property def vocabulary(self): - spec = vocabularies.get_by_name(self.name) - return spec.get('vocabulary') if spec else None + vocab = vocabularies.get_by_name(self.name) + return vocab.get('vocabulary') if vocab else None # Factory for Thesaurus @classmethod - def lookup(cls, name): - '''Lookup a thesaurus by it's name and return a Thesaurus instance. - The metadata for a newly created thesaurus are queried from vocabularies - module. + def lookup(cls, name=None, title=None, for_keywords=False): + '''Lookup by name or title and return a Thesaurus instance. + + This is a factory method that tries to instantiate a Thesaurus object + from a collection of well-known (mostly related to INSPIRE) vocabularies. ''' + + vocab = None + + if (name is None) and title: + name = vocabularies.normalize_thesaurus_title(title, for_keywords) + + if name: + vocab = vocabularies.get_by_name(name) + else: + raise ValueError('Expected a name/title lookup') - spec = vocabularies.get_by_name(name) - if spec: + if vocab: kwargs = { - 'title': spec.get('title'), - 'name': spec.get('name'), - 'reference_date': spec.get('reference_date'), - 'version' : spec.get('version'), - 'date_type': spec.get('date_type'), + 'title': vocab.get('title'), + 'name': vocab.get('name'), + 'reference_date': vocab.get('reference_date'), + 'version' : vocab.get('version'), + 'date_type': vocab.get('date_type'), } return cls(**kwargs) else: - raise ValueError( - 'Cannot find an INSPIRE thesaurus named "%s"' %(name)) + raise ValueError('Cannot find a thesaurus named "%s"' %(name)) @object_null_adapter() class ThesaurusTerms(Object): diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py index 2efb3ff..611d5bb 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py @@ -8,15 +8,13 @@ # Import loader -from ckanext.publicamundi.lib.metadata.vocabularies import json_loader - -munge = json_loader.munge +from ckanext.publicamundi.lib.metadata.vocabularies.json_loader import ( + make_vocabularies, normalize_keyword, normalize_thesaurus_title) def _update(data_file, name_prefix='', overwrite=False): '''Update the module-global vocabularies from external JSON data. ''' - - for name, desc in json_loader.make_vocabularies(data_file): + for name, desc in make_vocabularies(data_file): assert overwrite or not (name in vocabularies), ( 'A vocabulary named %r is allready loaded' % (name)) vocabularies[name_prefix + name] = desc diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py index cb7e33a..58c7a50 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py @@ -6,10 +6,8 @@ import zope.schema from zope.schema.vocabulary import SimpleVocabulary, SimpleTerm -def munge(name): - '''Convert human-friendly to machine-friendly terms. - - Needed when a machine-friendly version is not supplied. +def _munge(name): + '''Convert human-friendly to machine-friendly names. ''' re_bad = re.compile('[\(\),]+') @@ -23,6 +21,15 @@ def munge(name): return name +def normalize_keyword(name): + return _munge(name) + +def normalize_thesaurus_title(name, for_keywords=False): + if not for_keywords: + return _munge(name) + else: + return _munge('keywords' + ' ' + name) + def make_vocabulary(data): '''Convert raw data to a SimpleVocabulary instance. @@ -30,17 +37,15 @@ def make_vocabulary(data): * a list of human-readable terms or a * a dict that maps machine-readable to human-readable terms. ''' - - # Note: A SimpleTerm is a tuple (value, token, title) terms = [] if isinstance(data, list): for t in data: - k = munge(t) + k = normalize_keyword(t) terms.append(SimpleTerm(k, t, t)) elif isinstance(data, dict): for k, t in data.items(): - #k = munge(k) + #k = normalize_keyword(k) terms.append(SimpleTerm(k, t, t)) return SimpleVocabulary(terms, swallow_duplicates=True) @@ -55,7 +60,7 @@ def make_vocabularies(data_file): data = json.loads(fp.read()) for title in (set(data.keys()) - set(['Keywords'])): - name = munge(title) + name = normalize_thesaurus_title(title) desc = { 'name': name, 'title': title, @@ -67,8 +72,7 @@ def make_vocabularies(data_file): for title in keywords_data.keys(): keywords = keywords_data.get(title) keywords_terms = make_vocabulary(keywords.get('terms')) - - name = munge('Keywords-' + title) + name = normalize_thesaurus_title(title, for_keywords=True) desc = { 'name': name, 'title': title,