From 0a64a896f902116bef29e9d77291b3b812e781d9 Mon Sep 17 00:00:00 2001 From: "smanousopoulos@gmail.com" Date: Fri, 5 Jun 2015 22:57:19 +0300 Subject: [PATCH 1/3] made necessary free keywords additions in schema, types, to/from xml --- .../lib/metadata/schemata/inspire_metadata.py | 9 ++++++ .../lib/metadata/types/inspire_metadata.py | 25 +++++++++++++-- .../templates/package/inspire_iso.xml | 32 +++++++++++++++++++ ckanext/publicamundi/tests/test_inspire.py | 4 +++ 4 files changed, 67 insertions(+), 3 deletions(-) diff --git a/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py index 8e023e9..468610d 100644 --- a/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py +++ b/ckanext/publicamundi/lib/metadata/schemata/inspire_metadata.py @@ -119,6 +119,15 @@ def check_keywords(obj): raise zope.interface.Invalid( 'You need to select at least one keyword from INSPIRE data themes') + free_keywords = zope.schema.List( + title= u'Free Keywords', + description = u"The keyword value is a commonly used word, formalised word or phrase used to describe the subject. While the topic category is too coarse for detailed queries, keywords help narrowing a full text search and they allow for structured keyword search.", + required = False, + max_length = 10, + value_type = zope.schema.Object(IFreeKeyword, + title = u'Free Keyword')) + free_keywords.setTaggedValue('format:markup', { 'descend-if-dictized': False }) + # Geographic bounding_box = zope.schema.List( diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py index 40a7289..c0f4668 100644 --- a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py +++ b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py @@ -63,6 +63,7 @@ class InspireMetadata(BaseMetadata): topic_category = list keywords = KeywordsFactory() + free_keywords = list bounding_box = list @@ -177,9 +178,18 @@ def to_resp_party(alist): topic_list.append(topic) keywords_dict = {} + free_keywords = [] + print 'start here' for it in md.identification.keywords: + print it thes_title = it['thesaurus']['title'] - if thes_title is not None: + if thes_title is None: + date = to_date(it['thesaurus']['date']) + datetype= it['thesaurus']['datetype'] + title = it['thesaurus']['title'] + for t in it['keywords']: + free_keywords.append(FreeKeyword(value=t, reference_date=date, date_type=datetype, originating_vocabulary=title )) + else: thes_split = thes_title.split(',') # TODO thes_split[1] (=version) can be used in a get_by_title_and_version() # to enforce a specific thesaurus version. @@ -193,8 +203,15 @@ def to_resp_party(alist): if thes: kw = ThesaurusTerms(thesaurus=thes, terms=term_list) keywords_dict.update({thes_name:kw}) - except: - pass + + except ValueError: + print 'free keywords with name' + date = to_date(it['thesaurus']['date']) + datetype= it['thesaurus']['datetype'] + title = it['thesaurus']['title'] + for t in it['keywords']: + free_keywords.append(FreeKeyword(value=t, reference_date=date, date_type=datetype, originating_vocabulary=title )) + temporal_extent = [] if md.identification.temporalextent_start or md.identification.temporalextent_end: temporal_extent = [TemporalExtent( @@ -301,6 +318,8 @@ def to_resp_party(alist): #obj.resource_language = md.identification.resourcelanguage obj.topic_category = topic_list obj.keywords = keywords_dict + print free_keywords + obj.free_keywords = free_keywords obj.bounding_box = bbox obj.temporal_extent = temporal_extent obj.creation_date = creation_date diff --git a/ckanext/publicamundi/templates/package/inspire_iso.xml b/ckanext/publicamundi/templates/package/inspire_iso.xml index ecce79b..acb831c 100644 --- a/ckanext/publicamundi/templates/package/inspire_iso.xml +++ b/ckanext/publicamundi/templates/package/inspire_iso.xml @@ -175,6 +175,38 @@ + {% endfor -%} + {% for k in data.free_keywords -%} + + + + {{ k['value'] }} + + {% if k['originating_vocabulary'] -%} + + + + {{ k['originating_vocabulary'] }} + + + + + {{ k['reference_date'] }} + + + {{ k['date_type'] }} + + + + + + {% endif -%} + + + {% endfor -%} {% for rc in data.access_constraints -%} diff --git a/ckanext/publicamundi/tests/test_inspire.py b/ckanext/publicamundi/tests/test_inspire.py index f820128..5173c3f 100644 --- a/ckanext/publicamundi/tests/test_inspire.py +++ b/ckanext/publicamundi/tests/test_inspire.py @@ -253,6 +253,10 @@ insp10 = copy.deepcopy(fixtures.inspire1) insp10.keywords = {} +# Keywords completely empty +insp11 = copy.deepcopy(fixtures.inspire1) +insp11.free_keywords = FreeKeywords(value="free-keyword") + # # Tests # From 89b3b70ba7d6bf3da0b5f816a64b3a3ede8266cd Mon Sep 17 00:00:00 2001 From: Michail Alexakis Date: Sat, 6 Jun 2015 20:10:20 -0400 Subject: [PATCH 2/3] Rebuild INSPIRE vocabularies as of #138 (wip) Rewrite the `from_xml()` part to properly store keywords either as thesaurus terms or as free keywords. --- ckanext/publicamundi/controllers/api.py | 8 +-- ckanext/publicamundi/lib/metadata/base.py | 4 +- .../publicamundi/lib/metadata/types/baz.py | 2 +- .../lib/metadata/types/inspire_metadata.py | 55 +++++++++++++------ .../lib/metadata/types/thesaurus.py | 10 ++-- .../metadata/vocabularies/babel_extractors.py | 2 +- .../lib/metadata/vocabularies/json_loader.py | 7 ++- ckanext/publicamundi/tests/fixtures.py | 4 +- 8 files changed, 57 insertions(+), 35 deletions(-) diff --git a/ckanext/publicamundi/controllers/api.py b/ckanext/publicamundi/controllers/api.py index 8565de5..e638af4 100644 --- a/ckanext/publicamundi/controllers/api.py +++ b/ckanext/publicamundi/controllers/api.py @@ -121,15 +121,15 @@ def vocabulary_get(self, name): name = str(name) r = None - vocab = vocabularies.get_by_name(name) + vocab = vocabularies.get_by_name(name) if vocab: - terms = vocab['vocabulary'].by_value r = { 'date_type': vocab.get('date_type'), 'reference_date': vocab.get('reference_date'), 'title': vocab.get('title'), 'name': vocab.get('name'), - 'terms': [{ 'value': k, 'title': terms[k].title } for k in terms], + 'terms': [{'token': t.token, 'value': t.value, 'title': t.title} + for t in vocab['vocabulary']], } response.headers['Content-Type'] = content_types['json'] @@ -149,7 +149,7 @@ def dataset_export(self, name_or_id): return def dataset_import(self): - + post = request.params # Forward to the dataset_import action diff --git a/ckanext/publicamundi/lib/metadata/base.py b/ckanext/publicamundi/lib/metadata/base.py index a05e0b9..2caf6bd 100644 --- a/ckanext/publicamundi/lib/metadata/base.py +++ b/ckanext/publicamundi/lib/metadata/base.py @@ -58,9 +58,9 @@ def flatten_field(field): 'Only zope.schema.Choice supported for key_type' res = {} res1 = flatten_field(field.value_type) - for v in field.key_type.vocabulary: + for t in field.key_type.vocabulary: for k1, field1 in res1.items(): - res[(v.token,) + k1] = field1 + res[(t.value,) + k1] = field1 else: res = { (): field } diff --git a/ckanext/publicamundi/lib/metadata/types/baz.py b/ckanext/publicamundi/lib/metadata/types/baz.py index 4fe5a0b..99e91e3 100644 --- a/ckanext/publicamundi/lib/metadata/types/baz.py +++ b/ckanext/publicamundi/lib/metadata/types/baz.py @@ -7,7 +7,7 @@ from ckanext.publicamundi.lib.metadata.types import Thesaurus, ThesaurusTerms from ckanext.publicamundi.lib.metadata.types._common import * -thesaurus_gemet_themes = Thesaurus.make('keywords-gemet-themes') +thesaurus_gemet_themes = Thesaurus.lookup('keywords-gemet-themes') class KeywordsFactory(object): diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py index 40a7289..786bc75 100644 --- a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py +++ b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py @@ -1,4 +1,5 @@ import os +import re import uuid import zope.interface import zope.schema @@ -27,7 +28,7 @@ def __init__(self, thesaurus_name='keywords-gemet-inspire-themes'): def __call__(self): keywords = {} keywords[self._name] = ThesaurusTerms( - terms=[], thesaurus=Thesaurus.make(self._name)) + terms=[], thesaurus=Thesaurus.lookup(self._name)) return keywords class TemporalExtentFactory(object): @@ -162,6 +163,8 @@ def to_resp_party(alist): role = it.role)) return result + # Parse object + md = MD_Metadata(e) datestamp = to_date(md.datestamp) @@ -176,25 +179,43 @@ def to_resp_party(alist): for topic in md.identification.topiccategory: topic_list.append(topic) - keywords_dict = {} + free_keywords = [] + keywords = {} for it in md.identification.keywords: thes_title = it['thesaurus']['title'] - if thes_title is not None: - thes_split = thes_title.split(',') - # TODO thes_split[1] (=version) can be used in a get_by_title_and_version() - # to enforce a specific thesaurus version. - thes_title = thes_split[0] + # Lookup and instantiate a named thesaurus + thes = None + if thes_title: try: - thes_name = vocabularies.munge('Keywords-' + thes_title) - term_list = [] - for t in it['keywords']: - term_list.append(t) - thes = Thesaurus.make(thes_name) - if thes: - kw = ThesaurusTerms(thesaurus=thes, terms=term_list) - keywords_dict.update({thes_name:kw}) + thes_title, thes_version = thes_title.split(',') except: - pass + thes_version = None + else: + thes_version = re.sub(r'^[ ]*version[ ]+(\d\.\d)$', r'\1', thes_version) + thes_name = 'keywords-' + vocabularies.munge(thes_title) + # Note thes_version can be used to enforce a specific thesaurus version + try: + thes = Thesaurus.lookup(thes_name) + except ValueError: + thes = None + # Treat present keywords depending on if they belong to a thesaurus + if thes: + # Treat as thesaurus terms; discard unknown terms + terms = [] + for keyword in it['keywords']: + term = thes.vocabulary.by_value.get(keyword) + if not term: + term = thes.vocabulary.by_token.get(keyword) + if term: + terms.append(term.value) + keywords[thes.name] = ThesaurusTerms(thesaurus=thes, terms=terms) + else: + # Treat as free keywords + # Todo Build a list of FreeKeyword items + for keyword in it['keywords']: + # Todo Maybe convert keyword to a canonical form (e.g. munge) + free_keywords.append(keyword) + temporal_extent = [] if md.identification.temporalextent_start or md.identification.temporalextent_end: temporal_extent = [TemporalExtent( @@ -300,7 +321,7 @@ def to_resp_party(alist): obj.locator = url_list #obj.resource_language = md.identification.resourcelanguage obj.topic_category = topic_list - obj.keywords = keywords_dict + obj.keywords = keywords obj.bounding_box = bbox obj.temporal_extent = temporal_extent obj.creation_date = creation_date diff --git a/ckanext/publicamundi/lib/metadata/types/thesaurus.py b/ckanext/publicamundi/lib/metadata/types/thesaurus.py index 4c94817..6fe7fca 100644 --- a/ckanext/publicamundi/lib/metadata/types/thesaurus.py +++ b/ckanext/publicamundi/lib/metadata/types/thesaurus.py @@ -28,12 +28,12 @@ def vocabulary(self): # Factory for Thesaurus @classmethod - def make(cls, name): - '''Create a new Thesaurus instance from it's machine-name name. - The metadata for this thesaurus are queried from vocabularies module. - - Note: Maybe rename this class-method to lookup + def lookup(cls, name): + '''Lookup a thesaurus by it's name and return a Thesaurus instance. + The metadata for a newly created thesaurus are queried from vocabularies + module. ''' + spec = vocabularies.get_by_name(name) if spec: kwargs = { diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py b/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py index d5d1005..3d6a02c 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/babel_extractors.py @@ -8,7 +8,7 @@ # Babel string extraction functions def extract_json(fileobj, keywords, comment_tags, options): - """Extract messages from XXX files. + """Extract messages from files. :param fileobj: the file-like object the messages should be extracted from :param keywords: a list of keywords (i.e. function names) that should be recognized as translation functions :param comment_tags: a list of translator tags to search for and include in the results diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py index 75674eb..cb7e33a 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py @@ -29,18 +29,19 @@ def make_vocabulary(data): The input data can be one of the following: * a list of human-readable terms or a * a dict that maps machine-readable to human-readable terms. - ''' + + # Note: A SimpleTerm is a tuple (value, token, title) terms = [] if isinstance(data, list): for t in data: k = munge(t) - terms.append(SimpleTerm(k, k, t)) + terms.append(SimpleTerm(k, t, t)) elif isinstance(data, dict): for k, t in data.items(): #k = munge(k) - terms.append(SimpleTerm(k, k, t)) + terms.append(SimpleTerm(k, t, t)) return SimpleVocabulary(terms, swallow_duplicates=True) def make_vocabularies(data_file): diff --git a/ckanext/publicamundi/tests/fixtures.py b/ckanext/publicamundi/tests/fixtures.py index f05619a..5e6cce3 100644 --- a/ckanext/publicamundi/tests/fixtures.py +++ b/ckanext/publicamundi/tests/fixtures.py @@ -144,9 +144,9 @@ date_type = 'creation' ) -thesaurus_gemet_themes = Thesaurus.make('keywords-gemet-themes') +thesaurus_gemet_themes = Thesaurus.lookup('keywords-gemet-themes') -thesaurus_gemet_inspire_data_themes = Thesaurus.make('keywords-gemet-inspire-themes') +thesaurus_gemet_inspire_data_themes = Thesaurus.lookup('keywords-gemet-inspire-themes') # Baz From d5330468d1664f009a6ea279696850b9bd1decfc Mon Sep 17 00:00:00 2001 From: Michail Alexakis Date: Sun, 7 Jun 2015 15:24:36 -0400 Subject: [PATCH 3/3] Rewrite load/normalize/lookup logic for INSPIRE vocabularies This should resolve #138. It also addresses #137 but tests have to be added. --- .../lib/metadata/types/_common.py | 11 ++++++ .../lib/metadata/types/inspire_metadata.py | 4 +- .../lib/metadata/types/thesaurus.py | 39 ++++++++++++------- .../lib/metadata/vocabularies/__init__.py | 8 ++-- .../lib/metadata/vocabularies/json_loader.py | 26 +++++++------ 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/ckanext/publicamundi/lib/metadata/types/_common.py b/ckanext/publicamundi/lib/metadata/types/_common.py index 68e06b9..8746eee 100644 --- a/ckanext/publicamundi/lib/metadata/types/_common.py +++ b/ckanext/publicamundi/lib/metadata/types/_common.py @@ -68,6 +68,17 @@ class FreeKeyword(Object): reference_date = None date_type = None + @classmethod + def normalize_keyword(cls, s): + from inflection import dasherize, underscore + return dasherize(underscore(unicode(s))) + + def __init__(self, **kwargs): + value = kwargs.get('value') + if value: + kwargs['value'] = self.normalize_keyword(value) + super(FreeKeyword, self).__init__(**kwargs) + @object_null_adapter() class GeographicBoundingBox(Object): diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py index 808fe98..575052e 100644 --- a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py +++ b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py @@ -192,10 +192,9 @@ def to_responsible_party(alist): thes_version = None else: thes_version = re.sub(r'^[ ]*version[ ]+(\d\.\d)$', r'\1', thes_version) - thes_name = 'keywords-' + vocabularies.munge(thes_title) # Note thes_version can be used to enforce a specific thesaurus version try: - thes = Thesaurus.lookup(name=thes_name) + thes = Thesaurus.lookup(title=thes_title, for_keywords=True) except ValueError: thes = None # Treat present keywords depending on if they belong to a thesaurus @@ -214,7 +213,6 @@ def to_responsible_party(alist): vocab_date = to_date(it['thesaurus']['date']) vocab_datetype = it['thesaurus']['datetype'] for keyword in it['keywords']: - # Todo Maybe convert keyword to a canonical form (e.g. munge) free_keywords.append(FreeKeyword( value = keyword, reference_date = vocab_date, diff --git a/ckanext/publicamundi/lib/metadata/types/thesaurus.py b/ckanext/publicamundi/lib/metadata/types/thesaurus.py index 6fe7fca..eeef5ed 100644 --- a/ckanext/publicamundi/lib/metadata/types/thesaurus.py +++ b/ckanext/publicamundi/lib/metadata/types/thesaurus.py @@ -22,31 +22,40 @@ class Thesaurus(Object): @property def vocabulary(self): - spec = vocabularies.get_by_name(self.name) - return spec.get('vocabulary') if spec else None + vocab = vocabularies.get_by_name(self.name) + return vocab.get('vocabulary') if vocab else None # Factory for Thesaurus @classmethod - def lookup(cls, name): - '''Lookup a thesaurus by it's name and return a Thesaurus instance. - The metadata for a newly created thesaurus are queried from vocabularies - module. + def lookup(cls, name=None, title=None, for_keywords=False): + '''Lookup by name or title and return a Thesaurus instance. + + This is a factory method that tries to instantiate a Thesaurus object + from a collection of well-known (mostly related to INSPIRE) vocabularies. ''' + + vocab = None + + if (name is None) and title: + name = vocabularies.normalize_thesaurus_title(title, for_keywords) + + if name: + vocab = vocabularies.get_by_name(name) + else: + raise ValueError('Expected a name/title lookup') - spec = vocabularies.get_by_name(name) - if spec: + if vocab: kwargs = { - 'title': spec.get('title'), - 'name': spec.get('name'), - 'reference_date': spec.get('reference_date'), - 'version' : spec.get('version'), - 'date_type': spec.get('date_type'), + 'title': vocab.get('title'), + 'name': vocab.get('name'), + 'reference_date': vocab.get('reference_date'), + 'version' : vocab.get('version'), + 'date_type': vocab.get('date_type'), } return cls(**kwargs) else: - raise ValueError( - 'Cannot find an INSPIRE thesaurus named "%s"' %(name)) + raise ValueError('Cannot find a thesaurus named "%s"' %(name)) @object_null_adapter() class ThesaurusTerms(Object): diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py index 2efb3ff..611d5bb 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py @@ -8,15 +8,13 @@ # Import loader -from ckanext.publicamundi.lib.metadata.vocabularies import json_loader - -munge = json_loader.munge +from ckanext.publicamundi.lib.metadata.vocabularies.json_loader import ( + make_vocabularies, normalize_keyword, normalize_thesaurus_title) def _update(data_file, name_prefix='', overwrite=False): '''Update the module-global vocabularies from external JSON data. ''' - - for name, desc in json_loader.make_vocabularies(data_file): + for name, desc in make_vocabularies(data_file): assert overwrite or not (name in vocabularies), ( 'A vocabulary named %r is allready loaded' % (name)) vocabularies[name_prefix + name] = desc diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py index cb7e33a..58c7a50 100644 --- a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py +++ b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py @@ -6,10 +6,8 @@ import zope.schema from zope.schema.vocabulary import SimpleVocabulary, SimpleTerm -def munge(name): - '''Convert human-friendly to machine-friendly terms. - - Needed when a machine-friendly version is not supplied. +def _munge(name): + '''Convert human-friendly to machine-friendly names. ''' re_bad = re.compile('[\(\),]+') @@ -23,6 +21,15 @@ def munge(name): return name +def normalize_keyword(name): + return _munge(name) + +def normalize_thesaurus_title(name, for_keywords=False): + if not for_keywords: + return _munge(name) + else: + return _munge('keywords' + ' ' + name) + def make_vocabulary(data): '''Convert raw data to a SimpleVocabulary instance. @@ -30,17 +37,15 @@ def make_vocabulary(data): * a list of human-readable terms or a * a dict that maps machine-readable to human-readable terms. ''' - - # Note: A SimpleTerm is a tuple (value, token, title) terms = [] if isinstance(data, list): for t in data: - k = munge(t) + k = normalize_keyword(t) terms.append(SimpleTerm(k, t, t)) elif isinstance(data, dict): for k, t in data.items(): - #k = munge(k) + #k = normalize_keyword(k) terms.append(SimpleTerm(k, t, t)) return SimpleVocabulary(terms, swallow_duplicates=True) @@ -55,7 +60,7 @@ def make_vocabularies(data_file): data = json.loads(fp.read()) for title in (set(data.keys()) - set(['Keywords'])): - name = munge(title) + name = normalize_thesaurus_title(title) desc = { 'name': name, 'title': title, @@ -67,8 +72,7 @@ def make_vocabularies(data_file): for title in keywords_data.keys(): keywords = keywords_data.get(title) keywords_terms = make_vocabulary(keywords.get('terms')) - - name = munge('Keywords-' + title) + name = normalize_thesaurus_title(title, for_keywords=True) desc = { 'name': name, 'title': title,