Extract fields from DataCite XML

plebln · Mar 22, 2018 · 8878b78 · 8878b78
1 parent a7a6bc9
commit 8878b78
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -19,3 +19,16 @@ http://www.bibtex.org/Format/
 DataCite is ... the provider and maintainer of DataCite XML,
 a schema for metadata about digital objects.
 
+
+Using xml2bib
+-------------
+
+1. Standalone.
+
+	python xml2bib.py file1.xml [ file2.xml ... ]
+
+    For example, python xml2bib.py xmlfiles/* runs xml2bib on every file in xmlfiles/ (the shell expands the glob into the list of files on the command line).
+
+2. As a Python module. Load your XML, call xml2dict on it, then call the str() method of BibTeXWriter. [add example]
+
+
diff --git a/xml2bib.py b/xml2bib.py
@@ -44,25 +44,27 @@ def xml2dict(x):
     """
 
     r = x.root
-    print(r.attrib)
+    #print('DEBUG', r.attrib)
 
     xmlfields = ('identifier', 'title', 'publisher',
                  'publicationYear', 'resourceType')
-    nsd = {'dc': 'http://datacite.org/schema/kernel-4'}
+    #nsd = {'dc': 'http://datacite.org/schema/kernel-4'}
+    ns = '{http://datacite.org/schema/kernel-4}'
 
     dc = dict.fromkeys(xmlfields)
     for field in xmlfields:
-        for n in r.findall('dc:' + field, nsd):
+        #for n in r.findall('dc:' + field, nsd):
+        for n in r.iter(ns + field):
             if n is not None:
                 v = n.text
-                print('found', field, v)
+                #print('Found', field, v)
                 dc[field] = v
 
         #for n in r.findall(field):
         #    print('found', field)
         #    dc[field] = n.text
 
-    print(dc)
+    #print('DEBUG dc=', dc)
 
     d = dict()
     d['authors'] = 'Abbott, A and Costello, C'