#!/usr/local/bin/python
#
# -*- coding: iso-8859-15 -*-
#
# pyBibParser.py
#
# Parse an input .aux file to determine appropriate set of bibtex citation
# keys. Use this set to select a subset of entries from the provided .bib
# files. Also, try to handle crossrefs and string definitions.
#
# Jouni Paulus, jouni.paulus(a)tut.fi, 26.1.2010
#

import sys
import re # regexps
from sets import Set
import os.path

# file names used when nothing is given on the command line
defAuxFile='final.aux'
defBibFileList=['allRefs.bib',
                'allConfs.bib',
                'allStatesShort.bib']

MAX_LOOPS=10 # just a safety measure: upper bound for the resolving passes

# month abbreviations defined within BibTeX or LaTeX; these bare words must
# not be mistaken for user-defined @String names. The built-in frozenset is
# used instead of the deprecated sets.Set, as only membership tests are needed.
monthSet=frozenset(['jan','feb','mar','apr','may','jun',
                    'jul','aug','sep','oct','nov','dec'])

# command line: [auxFile [bibFile ...]]; every argument that is left out
# falls back to its default
cmdArgs=sys.argv[1:]
auxFile=cmdArgs[0] if cmdArgs else defAuxFile
bibFileList=cmdArgs[1:] if len(cmdArgs)>1 else defBibFileList

### operations start

# determine set of unique citation keys from the .aux file: every line that
# starts with \citation{ contributes the comma separated keys found between
# the opening brace and the first closing brace
citeKeys=set()
citationLine=re.compile(r'^\\citation{(.*)$')
with open(auxFile,'rb') as aFile:
    for aLine in aFile:
        lMatch=citationLine.search(aLine)
        if lMatch is None:
            continue
        # keep only the text before the first closing brace
        tmpKeys=lMatch.group(1).split('}',1)[0]
        for citeKey in tmpKeys.split(','):
            citeKey=citeKey.strip()
            # an empty key (from '\citation{}' or a stray comma) can never be
            # matched against a .bib entry, so do not register it
            if citeKey:
                citeKeys.add(citeKey)

loopCounter=0
stringSet=set()     # @String names referenced by selected entries, not yet found
outEntrySet=[]      # selected bibtex entries (raw text)
outStringSet=[]     # selected @String definitions (raw text)

foundKeys=set()     # entry keys already stored in outEntrySet
foundStrings=set()  # @String names already stored in outStringSet
bibCache={}         # bibFile -> (stringEntries, bibEntries), filled on first use


def _parseBibFile(bibFile):
    """Read one .bib file and split it into @String lines and other entries.

    Returns a tuple (stringEntries, bibEntries) of lists of raw text. The
    patterns are simple: a @String definition has to fit on one line, and
    every other entry has to end with a closing brace at the start of a line.
    """
    with open(bibFile,'rb') as bFile:
        fileContent=bFile.read()
    # these patterns may only fit the .bib syntax used by the original author
    stringEntries=re.findall(r'@String.*{.*}',fileContent)
    bibEntries=re.findall(r'@(?!String).*?\n}',fileContent,re.DOTALL)
    return stringEntries,bibEntries


def _usedMacroNames(entryLine):
    """Return the bare words on one entry line that look like @String names.

    The text after the last '=' is split at the BibTeX concatenation operator
    '#'. From each part the entry header ('@...'), quoted text and everything
    up to a closing brace are removed; what remains is taken as a candidate
    name unless it is empty, a month abbreviation or a plain number.
    """
    lineContents=re.sub(r',\s*\Z','',re.sub(r'.*=\s*','',entryLine))
    macroNames=[]
    for sEntry in lineContents.split('#'):
        tmpStr=re.sub('@.*','',sEntry.strip())
        tmpStr=re.sub('".*"','',tmpStr)
        tmpStr=re.sub('.*}','',tmpStr).strip()
        if not tmpStr:
            continue
        # undelimited numbers such as 'year = 2010' are literal values,
        # not references to a @String definition
        if tmpStr.isdigit():
            continue
        if tmpStr.lower() in monthSet:
            continue
        macroNames.append(tmpStr)
    return macroNames


# iterate until all used keys have been found; crossrefs and @String names
# discovered during one pass are resolved during the following passes
while (citeKeys or stringSet) and (loopCounter<MAX_LOOPS):
    tmpEntryList=[]
    madeProgress=False
    for bibFile in bibFileList:
        # each file is read and parsed only once, and only when it is needed
        if bibFile not in bibCache:
            bibCache[bibFile]=_parseBibFile(bibFile)
        stringEntries,bibEntries=bibCache[bibFile]

        # loop bibtex entries and try to parse the contents
        for bEntry in bibEntries:
            keyMatch=re.search('@.*?{.*,',bEntry)
            if keyMatch is None:
                continue
            thisKey=re.sub(',.*','',re.sub('^.*{','',keyMatch.group())).strip()
            if thisKey not in citeKeys:
                continue

            # this entry was used, so store it and parse further
            citeKeys.remove(thisKey)
            foundKeys.add(thisKey)
            tmpEntryList.append(bEntry)
            madeProgress=True

            for entryLine in bEntry.split('\n'):
                crefMatch=re.search('.*crossref.*=.*{.*?}',entryLine)
                if crefMatch is not None:
                    crefKey=re.sub('}','',re.sub('^.*{','',crefMatch.group())).strip()
                    # do not queue a target that has already been stored,
                    # otherwise it would be written to the output twice
                    if crefKey not in foundKeys:
                        citeKeys.add(crefKey)

                # try to detect if there are any strings in use
                for macroName in _usedMacroNames(entryLine):
                    if macroName not in foundStrings:
                        stringSet.add(macroName)

        # parse the string entries and store the ones used
        for sEntry in stringEntries:
            sKey=re.sub('=.*','',re.sub('@String{','',sEntry)).strip()
            if sKey in stringSet:
                outStringSet.append(sEntry)
                stringSet.remove(sKey)
                foundStrings.add(sKey)
                madeProgress=True

    tmpEntryList.sort()
    outEntrySet.extend(tmpEntryList)
    loopCounter=loopCounter+1

    # a pass that found nothing leaves both sets unchanged, so any further
    # pass would give exactly the same result
    if not madeProgress:
        break

# anything still left in either set could not be resolved from the .bib
# files: list the citation keys first, then the string names
if citeKeys or stringSet:
    print('Warning! Some of the keys or strings could not be located during iterations. These include the following:')
    for missingGroup in (citeKeys,stringSet):
        for missingName in missingGroup:
            print("'"+missingName+"'")

# create the output: the file is named after the .aux file with '.bib'
# appended and holds the sorted @String definitions, an empty line, and then
# the sorted entries
outStringSet.sort()
outEntrySet.sort()
# the with block closes (and flushes) the file even if a write fails
with open(auxFile+'.bib','wb') as outFile:
    for outString in outStringSet:
        outFile.write(outString+'\n')

    outFile.write('\n')
    # NOTE(review): the two prints below look like leftover debugging
    # output - confirm before removing them
    print(outEntrySet)
    for outEntry in outEntrySet:
        print('Entry: ' + outEntry)
        outFile.write(outEntry+'\n')
