# /usr/share/bibus/Import/BibTeX.py

# Copyright 2005 Nigel Sim <nigel.sim@jcu.edu.au>
# This file is part of Bibus, a bibliographic database that can
# work together with OpenOffice.org to generate bibliographic indexes.
#
# Bibus is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Bibus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bibus; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA.
#
# BibTeX format
# from http://en.wikipedia.org/wiki/BibTeX
#
from __future__ import generators		# to be removed in python 2.3
import BIB
import re

# Name of the default character encoding for this import format ('latin_1' is
# Python's ISO-8859-1 codec).  It is not referenced by the code visible in this
# module -- presumably the import framework reads it when decoding the input
# file; verify against the callers.
DEFAULT_ENCODING = 'latin_1'

class importRef(object):
	"""Iterable BibTeX importer.

	Iterating over an instance reads `infile` line by line, scans it one
	character at a time and yields one record per recognised BibTeX entry.
	A record is a list with one slot per entry of BIB.BIB_FIELDS; the first
	slot (the id) is left as None.  @string definitions are collected while
	scanning and substituted wherever their alias is used as an undelimited
	field value.
	"""

	# BibTeX entry type (lower case) -> key into BIB.BIBLIOGRAPHIC_TYPE.
	Type={
	'article':'ARTICLE',
	'book':'BOOK',
	'booklet':'BOOKLET',
	'conference':'CONFERENCE',
	'inbook':'INBOOK',
	'incollection':'INCOLLECTION',
	'inproceedings':'INPROCEEDINGS',
	'manual':'MANUAL',
	'mastersthesis':'MASTERTHESIS',
	'misc':'MISC',
	'phdthesis':'PHDTHESIS',
	'proceedings':'PROCEEDINGS',
	'techreport':'TECHREPORT',
	'unpublished':'UNPUBLISHED'}

	# BibTeX specific commands that are not bibliographic entry types.
	TypeBibTeXExt={'string':'STRING'}

	# BibTeX field name (lower case) -> index of the slot in the record list.
	# The record layout is:
	# (<id>, 'Identifier', 'Bibliographic_Type', 'Address', 'Annote', 'Author',
	#  'Booktitle', 'Chapter', 'Edition', 'Editor', 'HowPublished',
	#  'Institution', 'Journal', 'Month', 'Note', 'Number', 'Organizations',
	#  'Pages', 'Publisher', 'School', 'Series', 'Title', 'Report_Type',
	#  'Volume', 'Year', 'URL', 'Custom1', 'Custom2', 'Custom3', 'Custom4',
	#  'Custom5', 'ISBN', 'Abstract')
	# 'doi' and 'location' are stored in Custom1/Custom2; 'issn' shares the
	# ISBN slot.
	Fields={
	'address':3,
	'annote':4,
	'author':5,
	'booktitle':6,
	'chapter':7,
	'edition':8,
	'editor':9,
	'howpublished':10,
	'institution':11,
	'journal':12,
	'month':13,
	'note':14,
	'number':15,
	'organization':16,
	'pages':17,
	'publisher':18,
	'school':19,
	'series':20,
	'title':21,
	'type':22,
	'volume':23,
	'year':24,
	'url':25,
	'doi':26,
	'location':27,
	'isbn':31,
	'issn':31,
	'abstract':32}

	# Correspondence between delimiters (opening <-> closing).
	Delimiter={
	'{':'}',
	'}':'{',
	'(':')',
	')':'(',
	'"':'"'}

	def __init__(self,infile):
		"""Remember the input.

		infile -- file-like object; only its readline() method is used.
		"""
		self.infile = infile

	def __iter__(self):
		"""Generator of records: for record in <instance>: ...

		self.level is the stack of currently open delimiters; its length is
		the nesting depth: 0 = outside any entry, 1 = inside an entry body,
		2 or more = inside a delimited field value.
		"""
		ImData = BibTeXImportData()
		ImData.newRecord()
		# Start with an empty name so that a closing delimiter met before any
		# text does not fail on a None name.
		ImData.resetName()

		self.BibTeXInt = False	# True while scanning a BibTeX command such as @string
		self.level = []

		for line in iter(self.infile.readline, ''):
			for l in line:
				# Update the delimiter stack first; every test below relies on
				# the depth *after* this character has been accounted for.
				self.__manageDelimiterLevel(l)
				depth = len(self.level)

				if l == '@' and depth == 0:
					# A new entry starts: submit the previous one, if any.
					if ImData.pendingRecord():
						yield ImData.cleanupRecord()
					self.__newReference(ImData)
				elif l in ('\r','\n') and depth < 2:
					continue	# line ends outside field values are ignored
				elif l == ',' and depth == 1:
					# A comma outside any field delimiter separates fields.
					self.__endFieldAction(ImData)
				elif l in ('{','(') and depth == 1 and ImData.index is not None:
					# Opening delimiter following the @type tag.
					self.__beginReferencAction(ImData)
				elif l == '"' and depth == 2 and self.level[-1] != '"':
					# A quote inside a {...} or (...) value is plain text, not
					# a delimiter: keep it in the value being collected.
					if ImData.index and ImData.getCurrentRecItem() is not None:
						ImData.concatCurrentRecItem( l )
					elif ImData.key and ImData.getCurrentKeyItem() is not None:
						ImData.concatCurrentKeyItem( l )
				elif l in ('{','(','"') and depth == 2:
					# Opening delimiter of a value: activate the target item.
					if ImData.index and ImData.getCurrentRecItem() is None:
						ImData.setCurrentRecItem("")
					elif ImData.key and ImData.getCurrentKeyItem() is None:
						ImData.setCurrentKeyItem("")
				elif l in ('}',')','"') and depth == 1:
					# Closing delimiter of a value: forget whatever was
					# collected as a name before the value started.
					if (ImData.index and ImData.getCurrentRecItem()) \
					   or (ImData.key and ImData.getCurrentKeyItem()):
						ImData.resetName()
				elif l in ('}',')') and depth == 0:
					self.__endReferencAction(ImData)
				elif l == '=' and depth == 1:
					# The text collected so far is a field name or a @string key.
					fieldName = ImData.name.strip().lower()
					if self.BibTeXInt:
						ImData.concatKey( fieldName )
						ImData.setCurrentKeyItem( None )
						ImData.resetName()
					else:
						ImData.index = self.Fields.get(fieldName)
						if ImData.index is None:
							print("Unrecognized field: %s" % fieldName)
						ImData.resetName()
				elif l == '#' and depth == 1:
					self.__concatAction(ImData)
				elif depth != 1 and ImData.index is not None and ImData.getCurrentRecItem() is not None:
					ImData.concatCurrentRecItem( l )
				elif depth != 1 and ImData.key is not None and ImData.getCurrentKeyItem() is not None:
					ImData.concatCurrentKeyItem( l )
				elif ImData.index is None \
				     or (depth == 1 \
				         and (ImData.index is not None or ImData.key is not None)):
					# Undelimited text: field name, entry key, alias or number.
					if ImData.name is None: ImData.resetName()
					ImData.concatName( l )
				else:
					print("uh oh")

		# End of input: submit the last entry, but only if there is one (an
		# empty file or a trailing @string must not produce a blank record).
		if ImData.pendingRecord():
			yield ImData.cleanupRecord()
		if ImData.strings:
			print("Used BibTex-STRINGS:  %s" % (ImData.strings,))

	def __newReference(self, ImData):
		"""Do all initialisation for a new bibliographic record."""
		ImData.newRecord()
		ImData.setRecItem('BibType')

	def __beginReferencAction(self, ImData):
		"""Handle the opening delimiter that follows an @type tag.

		The text collected since '@' is the entry type.  A known type is
		translated to the Bibus type and collection of the identifier starts;
		a BibTeX command (@string) switches to key/value mode; anything else
		discards the record.
		"""
		print(ImData.getCurrentRecItem())
		typeName = ImData.getCurrentRecItem().strip().lower()
		bibTypeIdx = self.Type.get(typeName)
		if bibTypeIdx is not None:
			# Change the BibTeX type string to a Bibus type
			ImData.setCurrentRecItem( BIB.BIBLIOGRAPHIC_TYPE[ bibTypeIdx ] )
			ImData.index = BIB.BIBLIOGRAPHIC_FIELDS['Identifier']
			ImData.setCurrentRecItem( "" )
			return
		# Check for BibTeX internal commands
		bibTypeIdx = self.TypeBibTeXExt.get(typeName)
		if bibTypeIdx is not None:
			print("BibTeX internal command found: @" + bibTypeIdx)
			ImData.setCurrentRecItem( bibTypeIdx )
			self.BibTeXInt = True
			ImData.index = None
		else:
			print("Unrecognized Biblio type: " + typeName)
			ImData.newRecord()
			ImData.index = None

	def __endFieldAction(self, ImData):
		"""Do all analysis after a field-end token (comma) was detected."""
		if ImData.key is not None:
			print("Error: multiple string definitions in one @STRING command")
			ImData.resetKey( None )
		if ImData.index == BIB.BIBLIOGRAPHIC_FIELDS['Identifier']:
			# The collected name is the citation key of the entry.
			ImData.concatCurrentRecItem( ImData.name.strip() )
		elif ImData.index is not None:
			# Check for a pending number or alias (no delimiter is necessary)
			self.__concatAction(ImData)
		ImData.index = None
		ImData.resetName()

	def __endReferencAction(self, ImData):
		"""Handle the delimiter that closes an entry or a @string command."""
		self.__concatAction(ImData)
		if ImData.key is not None:
			# End of a @string command: nothing to submit as a record.
			ImData.resetKey( None )
			ImData.newRecord()
			self.BibTeXInt = False
		ImData.index = None

	def __manageDelimiterLevel(self, l):
		"""Track the stack of open delimiters (self.level) for character l.

		A double quote opens a value at depth 1 and closes it at depth 2 only
		when the value was opened by a quote; everywhere else it is left
		alone so that it can be treated as literal text.  A closing brace or
		parenthesis that does not match the innermost open delimiter
		(including when nothing is open) is reported and ignored.
		"""
		if l == '"':
			depth = len(self.level)
			if depth == 1:
				self.level.append(l)
			elif depth == 2 and self.level[-1] == '"':
				self.level.pop()
		elif l in ('{','('):
			self.level.append(l)
		elif l in ('}',')'):
			if self.level and self.level[-1] == self.Delimiter[l]:
				self.level.pop()
			else:
				print('Unhandled case or situation with delimiter "')

	def __concatAction(self, ImData):
		"""Append a pending undelimited value (number or @string alias).

		In BibTeX such values are joined with "#" outside the field
		delimiters (depth 1).  The value goes to the @string being defined
		if there is one, otherwise to the current record field.
		"""
		tmp = ImData.getCurrentAlias()
		if ImData.key and tmp:
			if ImData.getCurrentKeyItem() is None:
				ImData.setCurrentKeyItem(tmp)
			else:
				ImData.concatCurrentKeyItem(tmp)
		elif ImData.index and tmp:
			if ImData.getCurrentRecItem() is None:
				ImData.setCurrentRecItem(tmp)
			else:
				ImData.concatCurrentRecItem(tmp)
		elif tmp is None:
			pass	# nothing pending
		else:
			print("Concatenation: Unhandled situation")
		ImData.resetName()

class BibTeXImportData(object):
	"""Mutable state used while scanning a BibTeX file.

	Attributes:
	record  -- list with one slot per BIB.BIB_FIELDS entry for the entry
	           being read (None = slot not filled yet)
	index   -- index into record of the field being filled, or None
	name    -- undelimited text collected so far (field name, entry key,
	           alias or number)
	key     -- name of the @string being defined, or None
	strings -- @string definitions seen so far (alias -> text)
	"""
	# Matches one innermost pair of curly braces (no brace inside);
	# group 2 is the text between the braces.
	pat = re.compile(r"(\{)([^{}]*)(\})")

	def __init__(self):
		self.newRecord()
		self.name = None
		self.index = None
		self.key = None
		self.strings = dict()
		self.state = None

	def newRecord(self):
		"""Start an empty record (every slot None)."""
		self.record = [None]*len(BIB.BIB_FIELDS)

	@staticmethod
	def _stripBraces(text):
		"""Return text with every balanced pair of { } removed.

		Pairs are removed from the innermost outwards so that nested groups
		such as '{a {b} c}' become 'a b c'.  Unbalanced braces are kept.
		"""
		previous = None
		while previous != text:
			previous = text
			text = BibTeXImportData.pat.sub("\\2", text)
		return text

	def cleanupRecord(self):
		"""Clean up the record prior to submission and return it.

		Slots 1.. that are still None become ""; string slots get their
		whitespace runs (tabs and line breaks included) collapsed to single
		spaces and their brace pairs removed.  Authors and pages are then
		normalised.  Slot 0 is left untouched.
		"""
		for i in range(1,len(BIB.BIB_FIELDS)):
			value = self.record[i]
			if value is None:
				self.record[i] = ""
			elif isinstance(value, (str,unicode)):
				self.record[i] = self._stripBraces( " ".join( value.split() ) )
		self.__formatAuthors()
		self.__formatPages()
		return self.record

	def __formatAuthors(self):
		"""Split the ' and ' separated author list and join it with BIB.SEP.

		An author written without a comma ('First Last') is rewritten as
		'Last, First' with the dots removed; authors that already contain a
		comma are kept as they are.
		"""
		i = BIB.BIBLIOGRAPHIC_FIELDS['Author']
		authors = self.record[i].split(' and ')
		for j, original in enumerate(authors):
			author = original.replace('.','')
			if ',' in author:
				continue
			parts = author.rsplit(None,1)
			if len(parts) > 1:
				authors[j] = "%s, %s"%(parts[1].strip(),parts[0].strip())
		self.record[i] = BIB.SEP.join( authors )

	def __formatPages(self):
		"""Replace the BibTeX page range dash '--' by a single '-'."""
		i = BIB.BIBLIOGRAPHIC_FIELDS['Pages']
		self.record[i] = self.record[i].replace('--','-')

	def setRecItem(self, item):
		"""Select a record slot by symbolic name; only 'BibType' is known."""
		if item == 'BibType':
			#Set the index and activate the Item.
			self.index = BIB.BIBLIOGRAPHIC_FIELDS['BibliographicType']
			self.setCurrentRecItem( "" )
		else:
			print("Unhandled item in setRecItem(): " + item)

	def getKeyItem(self,item):
		"""Return the text of @string alias `item`, or None if undefined."""
		return self.strings.get(item)

	def getCurrentRecItem(self):
		"""Return the content of the record slot selected by self.index."""
		return self.record[self.index]

	def getCurrentKeyItem(self):
		"""Return the text of the @string being defined, or None."""
		return self.strings.get(self.key)

	def getCurrentAlias(self):
		"""Resolve the collected name as an undelimited field value.

		Returns the @string text when the name is a known alias, the name
		itself when it consists of digits only, and None otherwise (an
		unknown, non-empty name is reported).  A name that was never set is
		treated as empty.
		"""
		a = (self.name or "").strip()
		tmp = self.getKeyItem(a)
		if tmp is None and a.isdigit():
			tmp = a
		if tmp is None and len(a) > 0:
			print("Unknown Alias:  %s" % (a,))
		return tmp

	def setCurrentRecItem(self, item):
		"""Store item in the selected slot; "" also clears the name."""
		self.record[self.index] = item
		if item == "": self.resetName()

	def setCurrentKeyItem(self, item):
		"""Store item as text of the current @string; "" also clears the name."""
		self.strings[self.key] = item
		if item == "": self.resetName()

	def concatCurrentRecItem(self, item):
		"""Append item to the selected record slot (which must hold a string)."""
		self.record[self.index] = self.record[self.index] + item

	def concatCurrentKeyItem(self, item):
		"""Append item to the text of the current @string (must hold a string)."""
		self.strings[self.key] = self.strings[self.key] + item

	def pendingRecord(self):
		"""Return True when the record has a bibliographic type set."""
		return self.record[BIB.BIBLIOGRAPHIC_FIELDS['BibliographicType']] is not None

	def resetName(self, to=""):
		self.name = to

	def concatName(self, item):
		"""Append item to the collected name (a None name counts as empty)."""
		self.name = (self.name or "") + item

	def resetKey(self, to=""):
		self.key = to

	def concatKey(self, item):
		"""Append item to the current @string key (a None key counts as empty)."""
		if self.key is None: self.resetKey()
		self.key = self.key + item
# bibus 1.5.2-4 / usr / share / bibus / Import / BibTeX.py