Added renaming of bibtex id.

ruofeidu · Mar 17, 2019 · 785dcfe · 785dcfe
1 parent 5b6f23f
commit 785dcfe
Show file tree

Hide file tree

Showing 4 changed files with 2,593 additions and 32 deletions.
diff --git a/DuBibtex.py b/DuBibtex.py
@@ -31,6 +31,8 @@ class Paras:
   header = {}
   DOI2URL = False
   removeUrl = False
+  # If optimizeBibId is True, renames each Bib ID to the form LastName2000FirstTitleWord.
+  optimizeBibId = False
   fieldRemovalList = []
   autoCommentCredit = '% Automatically generated by DuBibTeX.\n'
   autoCommentUrl = '% https://github.com/ruofeidu/DuBibtex\n'
@@ -82,6 +84,7 @@ def __init__(self):
     Paras.keepComments = config.getboolean(Paras.section, "keepComments")
     Paras.debugBibCrawler = config.getboolean(Paras.section, "debugBibCrawler")
     Paras.debugStatistics = config.getboolean(Paras.section, "debugStatistics")
+    Paras.optimizeBibId = config.getboolean(Paras.section, "optimizeBibId")
     Paras.inputFileList = config.get(Paras.section,
                                      "inputFileList").strip().split(",")
     Paras.doiJsonFile = config.get(Paras.section, "doiJsonFile").strip()
@@ -118,7 +121,27 @@ def fix_doi(self, _doi):
       self.cur['url'] = 'http://doi.org/%s' % _doi
 
   def write_current_item(self):
-    # print(self.cur)
+    # Ensures there is a year field; if it is missing or shorter than 4 characters, falls back to the year matched in the bib id.
+    if 'year' not in self.cur or len(self.cur['year']) < 4:
+      m = Re.year.search(self.bib)
+      if m and m.groups():
+        self.cur['year'] = m.groups()[0]
+
+    # Rewrites self.bib as <author text before the first comma> + <year> + <capitalized first word of the title>.
+    if Paras.optimizeBibId and 'author' in self.cur and 'title' in self.cur and 'year' in self.cur:
+      self.bib = self.cur['author'].split(
+          ',', 1)[0] + self.cur['year'] + self.cur['title'].split(
+              ' ', 1)[0].capitalize()
+
+    if not 'author' in self.cur:
+      print('Error: No author for ' + self.bib)
+
+    if not 'title' in self.cur:
+      print('Error: No title for ' + self.bib)
+
+    if not 'year' in self.cur:
+      print('Error: No year for ' + self.bib)
+
     self.fout.write('@%s{%s,\n' % (self.cur['type'].lower(), self.bib))
 
     if Paras.defaultAddress and 'address' not in self.cur:
@@ -133,25 +156,20 @@ def write_current_item(self):
         self.debug_bib('PUB\t' + self.cur['title'])
         self.cur['publisher'] = 'ACM'
 
-    if 'year' not in self.cur or len(self.cur['year']) < 4:
-      m = Re.year.search(self.bib)
-      if m and m.groups():
-        self.cur['year'] = m.groups()[0]
-
     if self.bib in self.doiDict:
       # self.debug_bib('Missing DOI, but obtained from the local dict JSON.')
       self.fix_doi(self.doiDict[self.bib])
 
-    # remove invalid doi field
+    # Removes invalid DOI field.
     if 'doi' in self.cur and '/' not in self.cur['doi']:
       del self.cur['doi']
     if self.cur['type'].lower() in ['misc', 'book'] and 'doi' in self.cur:
       del self.cur['doi']
 
-    # search doi field if missing
+    # Searches DOI field if the field was missing.
     if Paras.searchDOI and int(self.cur['year']) > Paras.minYear and 'doi' not in self.cur \
             and self.cur['type'].lower() not in ['misc', 'book', 'techreport'] and 'nodoi' not in self.cur:
-      # search for DOI
+      # Searches for DOI.
       self.debug_bib('Missing DOI, search "%s"...' % self.cur['title'])
 
       if 'journal' in self.cur and self.cur['journal'][:5].lower() == 'arxiv':
@@ -170,7 +188,7 @@ def write_current_item(self):
         else:
           self.numMissing += 1
 
-    # fix underscore and store it in the hash table
+    # Fixes underscore and stores it in the hash table.
     if 'doi' in self.cur:
       self.cur['doi'] = fix_underscore(self.cur['doi'])
       self.doiDict[self.bib] = self.cur['doi']
@@ -214,26 +232,26 @@ def parse_line(self, line):
       self.clear()
       return
 
-    # match duplicates
+    # Matches duplicates.
     if self.duplicated:
       if Paras.debugStatistics:
         print("* duplicated %s" % self.bib)
         self.numDuplicated += 1
       return
 
-    # match new bib item
+    # Matches new bib item.
     m = Re.bib.match(line)
     if m and len(m.groups()) > 0:
       self.add_new_bib(m.groups()[1], m.groups()[0])
 
-    # output comments
+    # Outputs comments when required.
     if not self.bib:
       if Paras.keepComments and line != Paras.autoCommentCredit and line != Paras.autoCommentUrl and len(
           line) > 2:
         self.fout.write(line)
       return
 
-    # for each bibtex, first match {{}} or {""}, then match {} or ""
+    # For each bibtex, first matches {{}} or {""}, then matches {} or "".
     m = Re.item2.match(line)
     if not m:
       m = Re.item.match(line)
@@ -415,6 +433,7 @@ def capitalize(s, spliter=' '):
   for filename in Paras.inputFileList:
     with open(filename, 'r') as f:
       lines = f.readlines()
+      print(filename)
       for line in lines:
         p.parse_line(line)
 

diff --git a/config.ini b/config.ini
@@ -1,23 +1,16 @@
 [DuBibtex]
 header				=	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36
 searchDOI           =   True
-keepComments        =   True
+keepComments        =   False
 useOfflineDOI       =   True
 printSelfInfo       =   True
-;inputFileList       =   GeolleryTech.bib
-;outputFile          =   GeolleryTech_new.bib
-;inputFileList       =   ORCLayout.bib
-;;outputFile          =   ORCLayout_new.bib
-;inputFileList       =   Geollery.bib
-;outputFile          =   Geollery_new.bib
-inputFileList       =   lucss.bib
-outputFile          =   lucss_new.bib
-#inputFileList       =   Montage4D.bib
-#outputFile          =   Montage4D_new.bib
+inputFileList       =   input.bib,yuan.bib
+outputFile          =   yuan_v2.bib
 doiJsonFile         =   doi_dict.json
 debugBibCrawler     =   True
 debugStatistics     =   True
 defaultAddress      =   True
+optimizeBibId       =   True
 minYear             =   1946
 timeOut             =   3
 DOI2URL             =   False