#!/usr/bin/env python # -*- coding: utf-8 -*- date = '$Date$' rev = '$Rev$' id = '$Id$' USFMversion = '2.35' # http://ubs-icap.org/chm/usfm/2.35/index.html # usfmtags.py version 1.0 # Copyright 2012 by the CrossWire Bible Society # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # The full text of the GNU General Public License is available at: # . import re, sys, codecs date = date.replace('$', '').strip()[6:16] rev = rev.replace('$', '').strip()[5:] simpleTags = (['\\id', '\\ide', '\\sts', '\\rem', '\\h', '\\toc1', '\\toc2', '\\toc3', '\\ip', '\\ipi', '\\im', '\\imi', '\\ipq', '\\imq', '\\ipr', '\\ib', '\\ili', '\\iot', '\\ior', '\\ior*', '\\iex', '\\iqt', '\\iqt*', '\\imte', '\\ie', '\\mr', '\\sr', '\\r', '\\rq', '\\rq*', '\\d', '\\sp', '\\c', '\\ca', '\\ca*', '\\cl', '\\cp', '\\cd', '\\v', '\\va', '\\va*', '\\vp', '\\vp*', '\\p', '\\m', '\\pmo', '\\pm', '\\pmc', '\\pmr', '\\mi', '\\nb', '\\cls', '\\pc', '\\pr', '\\b', '\\qr', '\\qc', '\\qs', '\\qs*', '\\qa', '\\qac', '\\qac*', '\\tr', '\\f', '\\f*', '\\fe', '\\fe*', '\\fr', '\\fk', '\\fq', '\\fqa', '\\fl', '\\fp', '\\fv', '\\ft', '\\fdc', '\\fdc*', '\\fm', '\\fm*', '\\x', '\\x*', '\\xo', '\\xk', '\\xq', '\\xt', '\\xot', '\\xot*', '\\xnt', '\\xnt*', '\\xdc', '\\xdc*', '\\add', '\\add*', '\\bk', '\\bk*', '\\dc', '\\dc*', '\\k', '\\k*', '\\lit', '\\nd', '\\nd*', '\\ord', '\\ord*', '\\pn', '\\pn*', '\\qt', '\\qt*', '\\sig', '\\sig*', '\\sls', '\\sls*', '\\tl', '\\tl*', '\\wj', '\\wj*', '\\em', '\\em*', '\\bd', '\\bd*', '\\it', '\\it*', '\\bdit', '\\bdit*', '\\no', '\\no*', '\\sc', '\\sc*', '\\pb', '\\fig', '\\fig*', '\\ndx', '\\ndx*', '\\pro', '\\pro*', '\\w', '\\w*', '\\wg', '\\wg*', '\\wh', '\\wh*', '\\periph', '\\ef', '\\ef*', '\\ex', '\\ex*', '\\esb', '\\esbe', '\\cat', '\\z']) digitTags = set(['\\imt', '\\is', '\\iq', '\\io', '\\mt', '\\mte', '\\ms', '\\s', '\\pi', '\\li', '\\ph', '\\q', '\\qm', '\\th', '\\thr', '\\tc', '\\tcr']) def main(argv): tagSet = set() knownSet = set() unknownSet = set() if '-h' in argv or '--help' in argv or len(argv) < 2: printUsage() else: for doc in argv[1:]: text = codecs.open(doc, 'r', 'utf-8').read() tagSet.update(set(re.findall(r'(\\[a-zA-Z0-9]+\b\*?)', text))) for tag in tagSet: if tag in simpleTags: knownSet.add(tag) elif tag.rstrip('1234567890') in digitTags: knownSet.add(tag) else: unknownSet.add(tag) print 'Known USFM Tags: ' + ', '.join(sorted(knownSet)) print 'Unrecognized USFM Tags: ' + ', '.join(sorted(unknownSet)) def printUsage(): print 'usfmtags.py ' print ' Revision: ' + rev + ' (' + date + ')' print '' print ' This utility will scan USFM files and print two lists of all unique tags in them.' print ' The first list identifies all valid tags, identified in the USFM ' + USFMversion + ' spec.' print ' The second list identifies tags unknown to that spec.' exit() if __name__ == "__main__": main(sys.argv)