1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
18   *     The copyright to this program is held by it's authors.
19   *
20   * ID: $Id: OSISUtil.java 1877 2008-06-18 20:22:45Z dmsmith $
21   */
22  package org.crosswire.jsword.book;
23  
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.Collection;
27  import java.util.HashSet;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Set;
31  import java.util.Stack;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.crosswire.common.diff.Difference;
36  import org.crosswire.common.diff.EditType;
37  import org.crosswire.common.util.Logger;
38  import org.crosswire.jsword.passage.Key;
39  import org.crosswire.jsword.passage.KeyFactory;
40  import org.crosswire.jsword.passage.NoSuchKeyException;
41  import org.crosswire.jsword.passage.NoSuchVerseException;
42  import org.crosswire.jsword.passage.PassageKeyFactory;
43  import org.crosswire.jsword.passage.Verse;
44  import org.crosswire.jsword.passage.VerseFactory;
45  import org.jdom.Content;
46  import org.jdom.Element;
47  import org.jdom.Parent;
48  import org.jdom.Text;
49  
/**
 * Some simple utilities to help work with OSIS classes.
 *
 * @see gnu.lgpl.License for license details.
 *      The copyright to this program is held by its authors.
 * @author Joe Walker [joe at eireneh dot com]
 */
57  public final class OSISUtil
58  {
59      /**
60       * The following are values for the type attribute on the hi element.
61       */
62      /**
63       * Constant for acrostic highlighting
64       */
65      public static final String HI_ACROSTIC = "acrostic"; //$NON-NLS-1$
66  
67      /**
68       * Constant for rendering bold text
69       */
70      public static final String HI_BOLD = "bold"; //$NON-NLS-1$
71  
72      /**
73       * Constant for rendering emphatic text
74       */
75      public static final String HI_EMPHASIS = "emphasis"; //$NON-NLS-1$
76  
77      /**
78       * Constant for rendering illuminated text.
79       */
80      public static final String HI_ILLUMINATED = "illuminated"; //$NON-NLS-1$
81  
82      /**
83       * Constant for rendering italic text.
84       */
85      public static final String HI_ITALIC = "italic"; //$NON-NLS-1$
86  
87      /**
88       * Constant for rendering strike-through text
89       */
90      public static final String HI_LINETHROUGH = "line-through"; //$NON-NLS-1$
91  
92      /**
93       * Constant for rendering normal text.
94       */
95      public static final String HI_NORMAL = "normal"; //$NON-NLS-1$
96  
97      /**
98       * Constant for rendering small caps
99       */
100     public static final String HI_SMALL_CAPS = "small-caps"; //$NON-NLS-1$
101 
102     /**
103      * Constant for rendering subscripts
104      */
105     public static final String HI_SUB = "sub"; //$NON-NLS-1$
106 
107     /**
108      * Constant for rendering superscripts
109      */
110     public static final String HI_SUPER = "super"; //$NON-NLS-1$
111 
112     /**
113      * Constant for rendering underlined text
114      */
115     public static final String HI_UNDERLINE = "underline"; //$NON-NLS-1$
116 
117     /**
118      * Constant for rendering upper case text
119      */
120     public static final String HI_X_CAPS = "x-caps"; //$NON-NLS-1$
121 
122     /**
123      * Constant for rendering big text
124      */
125     public static final String HI_X_BIG = "x-big"; //$NON-NLS-1$
126 
127     /**
128      * Constant for rendering small text
129      */
130     public static final String HI_X_SMALL = "x-small"; //$NON-NLS-1$
131 
132     /**
133      * Constant for rendering tt text
134      */
135     public static final String HI_X_TT = "x-tt"; //$NON-NLS-1$
136 
137     /**
138      * Constant to help narrow down what we use seg for. In this case the justify right tag
139      */
140     public static final String SEG_JUSTIFYRIGHT = "text-align: right;"; //$NON-NLS-1$
141 
142     /**
143      * Constant to help narrow down what we use seg for. In this case the thml center tag
144      */
145     public static final String SEG_CENTER = "text-align: center;"; //$NON-NLS-1$
146 
147     /**
148      * Constant to help narrow down what we use div for. In this case the thml pre tag
149      */
150     public static final String DIV_PRE = "x-pre"; //$NON-NLS-1$
151 
152     /**
153      * Constant to help narrow down what we use seg for. In this case the color tag
154      */
155     public static final String SEG_COLORPREFIX = "color: "; //$NON-NLS-1$
156 
157     /**
158      * Constant to help narrow down what we use seg for. In this case the font-size tag
159      */
160     public static final String SEG_SIZEPREFIX = "font-size: "; //$NON-NLS-1$
161 
162     /**
163      * Constant for x- types
164      */
165     public static final String TYPE_X_PREFIX = "x-"; //$NON-NLS-1$
166 
167     /**
168      * Constant for the study note type
169      */
170     public static final String NOTETYPE_STUDY = "x-StudyNote"; //$NON-NLS-1$
171 
172     /**
173      * Constant for the cross reference note type
174      */
175     public static final String NOTETYPE_REFERENCE = "crossReference"; //$NON-NLS-1$
176 
177     /**
178      * Constant for the variant type segment
179      */
180     public static final String VARIANT_TYPE = "x-variant"; //$NON-NLS-1$
181     public static final String VARIANT_CLASS = "x-class"; //$NON-NLS-1$
182 
183     /**
184      * Constant for JSword generated content. Used for type or subType.
185      */
186     public static final String GENERATED_CONTENT = "x-gen"; //$NON-NLS-1$
187 
188     /**
189      * Constant for the pos (part of speech) type.
190      */
191     public static final String POS_TYPE = "x-pos"; //$NON-NLS-1$
192 
193     /**
194      * Constant for the def (dictionary definition) type
195      */
196     public static final String DEF_TYPE = "x-def"; //$NON-NLS-1$
197 
198     /**
199      * Constant for a Strong's numbering lemma
200      */
201     public static final String LEMMA_STRONGS = "strong:"; //$NON-NLS-1$
202     public static final String MORPH_ROBINSONS = "robinson:"; //$NON-NLS-1$
203 
204     /**
205      * Constant for Strong's numbering morphology
206      */
207     public static final String MORPH_STRONGS = "x-StrongsMorph:T"; //$NON-NLS-1$
208 
209     /**
210      * Constant to help narrow down what we use "q" for. In this case: blockquote
211      */
212     public static final String Q_BLOCK = "blockquote"; //$NON-NLS-1$
213 
214     /**
215      * Constant to help narrow down what we use "q" for. In this case: citation
216      */
217     public static final String Q_CITATION = "citation"; //$NON-NLS-1$
218 
219     /**
220      * Constant to help narrow down what we use "q" for. In this case: embedded
221      */
222     public static final String Q_EMBEDDED = "embedded"; //$NON-NLS-1$
223 
224     /**
225      * Constant to help narrow down what we use "list" for.
226      */
227     public static final String LIST_ORDERED = "x-ordered"; //$NON-NLS-1$
228     public static final String LIST_UNORDERED = "x-unordered"; //$NON-NLS-1$
229 
230     /**
231      * Table roles (on table, row and cell elements) can be "data", the default, or label.
232      */
233     public static final String TABLE_ROLE_LABEL = "label"; //$NON-NLS-1$
234 
235     /**
236      * Possible cell alignments
237      */
238     public static final String CELL_ALIGN_LEFT = "left"; //$NON-NLS-1$
239     public static final String CELL_ALIGN_RIGHT = "right"; //$NON-NLS-1$
240     public static final String CELL_ALIGN_CENTER = "center"; //$NON-NLS-1$
241     public static final String CELL_ALIGN_JUSTIFY = "justify"; //$NON-NLS-1$
242     public static final String CELL_ALIGN_START = "start"; //$NON-NLS-1$
243     public static final String CELL_ALIGN_END = "end"; //$NON-NLS-1$
244 
245     public static final String OSIS_ELEMENT_TITLE = "title"; //$NON-NLS-1$
246     public static final String OSIS_ELEMENT_TABLE = "table"; //$NON-NLS-1$
247     public static final String OSIS_ELEMENT_SPEECH = "speech"; //$NON-NLS-1$
248     public static final String OSIS_ELEMENT_SPEAKER = "speaker"; //$NON-NLS-1$
249     public static final String OSIS_ELEMENT_ROW = "row"; //$NON-NLS-1$
250     public static final String OSIS_ELEMENT_REFERENCE = "reference"; //$NON-NLS-1$
251     public static final String OSIS_ELEMENT_NOTE = "note"; //$NON-NLS-1$
252     public static final String OSIS_ELEMENT_NAME = "name"; //$NON-NLS-1$
253     public static final String OSIS_ELEMENT_Q = "q"; //$NON-NLS-1$
254     public static final String OSIS_ELEMENT_LIST = "list"; //$NON-NLS-1$
255     public static final String OSIS_ELEMENT_P = "p"; //$NON-NLS-1$
256     public static final String OSIS_ELEMENT_ITEM = "item"; //$NON-NLS-1$
257     public static final String OSIS_ELEMENT_FIGURE = "figure"; //$NON-NLS-1$
258     public static final String OSIS_ELEMENT_FOREIGN = "foreign"; //$NON-NLS-1$
259     public static final String OSIS_ELEMENT_W = "w"; //$NON-NLS-1$
260     public static final String OSIS_ELEMENT_CHAPTER = "chapter"; //$NON-NLS-1$
261     public static final String OSIS_ELEMENT_VERSE = "verse"; //$NON-NLS-1$
262     public static final String OSIS_ELEMENT_CELL = "cell"; //$NON-NLS-1$
263     public static final String OSIS_ELEMENT_DIV = "div"; //$NON-NLS-1$
264     public static final String OSIS_ELEMENT_OSIS = "osis"; //$NON-NLS-1$
265     public static final String OSIS_ELEMENT_WORK = "work"; //$NON-NLS-1$
266     public static final String OSIS_ELEMENT_HEADER = "header"; //$NON-NLS-1$
267     public static final String OSIS_ELEMENT_OSISTEXT = "osisText"; //$NON-NLS-1$
268     public static final String OSIS_ELEMENT_SEG = "seg"; //$NON-NLS-1$
269     public static final String OSIS_ELEMENT_LG = "lg"; //$NON-NLS-1$
270     public static final String OSIS_ELEMENT_L = "l"; //$NON-NLS-1$
271     public static final String OSIS_ELEMENT_LB = "lb"; //$NON-NLS-1$
272     public static final String OSIS_ELEMENT_HI = "hi"; //$NON-NLS-1$
273 
274     public static final String ATTRIBUTE_TEXT_OSISIDWORK = "osisIDWork"; //$NON-NLS-1$
275     public static final String ATTRIBUTE_WORK_OSISWORK = "osisWork"; //$NON-NLS-1$
276     public static final String OSIS_ATTR_OSISID = "osisID"; //$NON-NLS-1$
277     public static final String OSIS_ATTR_SID = "sID"; //$NON-NLS-1$
278     public static final String OSIS_ATTR_EID = "eID"; //$NON-NLS-1$
279     public static final String ATTRIBUTE_W_LEMMA = "lemma"; //$NON-NLS-1$
280     public static final String ATTRIBUTE_FIGURE_SRC = "src"; //$NON-NLS-1$
281     public static final String ATTRIBUTE_TABLE_ROLE = "role"; //$NON-NLS-1$
282     public static final String ATTRIBUTE_CELL_ALIGN = "align"; //$NON-NLS-1$
283     public static final String OSIS_ATTR_TYPE = "type"; //$NON-NLS-1$
284     public static final String OSIS_ATTR_CANONICAL = "canonical"; //$NON-NLS-1$
285     public static final String OSIS_ATTR_SUBTYPE = "subType"; //$NON-NLS-1$
286     public static final String OSIS_ATTR_REF = "osisRef"; //$NON-NLS-1$
287     public static final String OSIS_ATTR_LEVEL = "level"; //$NON-NLS-1$
288     public static final String ATTRIBUTE_SPEAKER_WHO = "who"; //$NON-NLS-1$
289     public static final String ATTRIBUTE_W_MORPH = "morph"; //$NON-NLS-1$
290     public static final String ATTRIBUTE_OSISTEXT_OSISIDWORK = "osisIDWork"; //$NON-NLS-1$
291     // OSIS defines the lang attribute as the one from the xml namespace
292     // Typical usage element.setAttribute(OSISUtil.OSIS_ATTR_LANG, lang, Namespace.XML_NAMESPACE);
293     public static final String OSIS_ATTR_LANG = "lang"; //$NON-NLS-1$
294     public static final String ATTRIBUTE_DIV_BOOK = "book"; //$NON-NLS-1$
295 
296     /**
297      * Prefix for OSIS IDs that refer to Bibles
298      */
299     private static final String OSISID_PREFIX_BIBLE = "Bible."; //$NON-NLS-1$
300 
301     private static final Set EXTRA_BIBLICAL_ELEMENTS = new HashSet(Arrays.asList(new String[]
302     {
303         OSIS_ELEMENT_NOTE, OSIS_ELEMENT_TITLE, OSIS_ELEMENT_REFERENCE
304     }));
305 
306     /**
307      * The log stream
308      */
309     private static final Logger log = Logger.getLogger(OSISUtil.class);
310 
/**
 * Prevent instantiation: the constructor is private and does nothing.
 */
private OSISUtil()
{
    // intentionally empty
}
317 
318     private static OSISFactory factory = new OSISFactory();
319 
320     /**
321      * An accessor for the OSISFactory that creates OSIS objects
322      */
323     public static OSISFactory factory()
324     {
325         return factory;
326     }
327 
328     /**
329      * A generic way of creating empty Elements of various types
330      */
331     public static class OSISFactory
332     {
333         /**
334          *
335          */
336         public Element createSeg()
337         {
338             return new Element(OSIS_ELEMENT_SEG);
339         }
340 
341         /**
342          *
343          */
344         public Element createOsisText()
345         {
346             return new Element(OSIS_ELEMENT_OSISTEXT);
347         }
348 
349         /**
350          *
351          */
352         public Element createHeader()
353         {
354             return new Element(OSIS_ELEMENT_HEADER);
355         }
356 
357         /**
358          *
359          */
360         public Element createWork()
361         {
362             return new Element(OSIS_ELEMENT_WORK);
363         }
364 
365         /**
366          *
367          */
368         public Element createOsis()
369         {
370             return new Element(OSIS_ELEMENT_OSIS);
371         }
372 
373         /**
374          *
375          */
376         public Element createDiv()
377         {
378             return new Element(OSIS_ELEMENT_DIV);
379         }
380 
381         /**
382          *
383          */
384         public Element createCell()
385         {
386             return new Element(OSIS_ELEMENT_CELL);
387         }
388 
389         /**
390          *
391          */
392         public Element createHeaderCell()
393         {
394             Element ele = new Element(OSIS_ELEMENT_CELL);
395             ele.setAttribute(ATTRIBUTE_TABLE_ROLE, TABLE_ROLE_LABEL);
396             ele.setAttribute(ATTRIBUTE_CELL_ALIGN, CELL_ALIGN_CENTER);
397             return ele;
398         }
399 
400         /**
401          *
402          */
403         public Element createVerse()
404         {
405             return new Element(OSIS_ELEMENT_VERSE);
406         }
407 
408         /**
409          *
410          */
411         public Element createW()
412         {
413             return new Element(OSIS_ELEMENT_W);
414         }
415 
416         /**
417          *
418          */
419         public Element createFigure()
420         {
421             return new Element(OSIS_ELEMENT_FIGURE);
422         }
423 
424         /**
425          *
426          */
427         public Element createForeign()
428         {
429             return new Element(OSIS_ELEMENT_FOREIGN);
430         }
431 
432         /**
433          *
434          */
435         public Element createItem()
436         {
437             return new Element(OSIS_ELEMENT_ITEM);
438         }
439 
440         /**
441          *
442          */
443         public Element createP()
444         {
445             return new Element(OSIS_ELEMENT_P);
446         }
447 
448         /**
449          *
450          */
451         public Element createList()
452         {
453             return new Element(OSIS_ELEMENT_LIST);
454         }
455 
456         /**
457          *
458          */
459         public Element createQ()
460         {
461             return new Element(OSIS_ELEMENT_Q);
462         }
463 
464         /**
465          *
466          */
467         public Element createName()
468         {
469             return new Element(OSIS_ELEMENT_NAME);
470         }
471 
472         /**
473          *
474          */
475         public Element createNote()
476         {
477             return new Element(OSIS_ELEMENT_NOTE);
478         }
479 
480         /**
481          *
482          */
483         public Element createReference()
484         {
485             return new Element(OSIS_ELEMENT_REFERENCE);
486         }
487 
488         /**
489          *
490          */
491         public Element createRow()
492         {
493             return new Element(OSIS_ELEMENT_ROW);
494         }
495 
496         /**
497          *
498          */
499         public Element createSpeaker()
500         {
501             return new Element(OSIS_ELEMENT_SPEAKER);
502         }
503 
504         /**
505          *
506          */
507         public Element createSpeech()
508         {
509             return new Element(OSIS_ELEMENT_SPEECH);
510         }
511 
512         /**
513          *
514          */
515         public Element createTable()
516         {
517             return new Element(OSIS_ELEMENT_TABLE);
518         }
519 
520         /**
521          *
522          */
523         public Element createTitle()
524         {
525             return new Element(OSIS_ELEMENT_TITLE);
526         }
527         /**
528          * Line Group
529          */
530         public Element createLG()
531         {
532             return new Element(OSIS_ELEMENT_LG);
533         }
534         /**
535          * Line
536          */
537         public Element createL()
538         {
539             return new Element(OSIS_ELEMENT_L);
540         }
541         /**
542          * Line Break
543          */
544         public Element createLB()
545         {
546             return new Element(OSIS_ELEMENT_LB);
547         }
548         /**
549          * Highlight
550          */
551         public Element createHI()
552         {
553             return new Element(OSIS_ELEMENT_HI);
554         }
555 
556         /**
557          * Text
558          */
559         public Text createText(String text)
560         {
561             return new Text(text);
562         }
563     }
564 
565     /**
566      * Dig past the osis and osisText element, if present, to get the meaningful content of the document.
567      *
568      * @return a fragment
569      */
570     public static List getFragment(Element root)
571     {
572         Element content = root;
573         if (OSISUtil.OSIS_ELEMENT_OSIS.equals(root.getName()))
574         {
575             content = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
576         }
577 
578         if (OSISUtil.OSIS_ELEMENT_OSISTEXT.equals(root.getName()))
579         {
580             content = root.getChild(OSISUtil.OSIS_ELEMENT_DIV);
581         }
582 
583         // At this point we are at something interesting, possibly null.
584         // If this was a semantically valid OSIS document then it is a div.
585         // As long as this node has one child dig deeper.
586         while (content != null && content.getContentSize() == 1)
587         {
588             Content firstChild = content.getContent(0);
589             if (firstChild instanceof Element && OSISUtil.OSIS_ELEMENT_DIV.equals(((Element) firstChild).getName()))
590             {
591                 content = (Element) firstChild;
592             }
593             break;
594         }
595 
596         assert content != null;
597         return content.getContent();
598     }
599 
600     /**
601      * Get the canonical text from an osis document consisting of a single fragment.
602      * The document is assumed to be valid OSIS2.0 XML. While xml valid
603      * is rigidly defined as meaning that an xml parser can validate the document,
604      * it does not mean that the document is valid OSIS. This is a semantic
605      * problem that is not validated. This method assumes that the
606      * root element is also semantically valid.
607      *
608      * <p>This means that the top level element's tagname is osis.
609      * This can contain either a osisText or an osisCorpus.
610      * If it is an osisCorpus, then it contains an osisText.
611      * However, as a simplification, since JSword constructs
612      * the whole doc for the fragment, osisCorpus can be ignored.
613      * <p>The osisText element contains a div element that is either
614      * a container or a milestone. Again, JSword is providing the
615      * div element and it will be provided as a container. It is this div
616      * that "contains" the actual fragment.</p>
617      * <p>A verse element may either be
618      * a container or a milestone. Sword OSIS books differ in whether
619      * they provide the verse element. Most do not. The few that do are
620      * using the container model, but it has been proposed that milestones
621      * are the best practice.</p>
622      *
623      * <p>The fragment may contain elements that are not a part of the
624      * original text. These are things such as notes.</p>
625      *
626      * <p>Milestones require special handling. Beginning milestones
627      * elements have an sID attribute, while ending milestones have
628      * an eID with the same value as the opening. So everything between
629      * the start and the corresponding end is the content of the element.
630      * Also, for a given element, say div, they have to be properly nested
631      * as if they were container elements.</p>
632      *
633      * @param root the whole osis document.
634      * @return The canonical text without markup
635      */
636     public static String getCanonicalText(Element root)
637     {
638         StringBuffer buffer = new StringBuffer();
639 
640         // Dig past osis, osisText, if present, to get to the real content.
641         List frag = OSISUtil.getFragment(root);
642 
643         Iterator dit = frag.iterator();
644         String sID = null;
645         Object data = null;
646         Element ele = null;
647         while (dit.hasNext())
648         {
649             data = dit.next();
650             if (data instanceof Element)
651             {
652                 ele = (Element) data;
653                 if (!isCanonical(ele))
654                 {
655                     continue;
656                 }
657 
658                 if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE))
659                 {
660                     sID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
661                 }
662 
663                 if (sID != null)
664                 {
665                     getCanonicalContent(ele, sID, dit, buffer);
666                 }
667                 else
668                 {
669                     getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
670                 }
671             }
672             else if (data instanceof Text)
673             {
674                 // make sure that adjacent text elements are separated by whitespace
675                 // TODO(dms): verify that the xml parser does not split words containing entities.
676                 int lastIndex = buffer.length() - 1;
677                 String text = ((Text) data).getText();
678                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0)))
679                 {
680                     buffer.append(' ');
681                 }
682                 buffer.append(text);
683             }
684         }
685 
686         return buffer.toString().trim();
687     }
688 
689     /**
690      * A simplified plain text version of the data in this Element with all
691      * the markup stripped out.
692      * @return The Bible text without markup
693      */
694     public static String getPlainText(Element root)
695     {
696         // Dig past osis, osisText, if present, to get to the real content.
697         return getTextContent(OSISUtil.getFragment(root));
698     }
699 
700     /**
701      * A space separate string containing Strong's numbers.
702      * @return The Strong's numbers in the text
703      */
704     public static String getStrongsNumbers(Element root)
705     {
706         StringBuffer buffer = new StringBuffer();
707 
708         Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_W).iterator();
709         while (contentIter.hasNext())
710         {
711             Element ele = (Element) contentIter.next();
712             String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
713             if (attr != null)
714             {
715                 Matcher matcher = strongsNumberPattern.matcher(attr);
716                 while (matcher.find())
717                 {
718                     String strongsNum = matcher.group(1);
719                     if (buffer.length() > 0)
720                     {
721                         buffer.append(' ');
722                     }
723                     buffer.append(strongsNum);
724                 }
725             }
726         }
727 
728         return buffer.toString().trim();
729     }
730 
731     /**
732      * A space separate string containing osisID from the reference element.
733      * @return The references in the text
734      */
735     public static String getReferences(Element root)
736     {
737         KeyFactory keyf = PassageKeyFactory.instance();
738         Key collector = keyf.createEmptyKeyList();
739 
740         Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE).iterator();
741         while (contentIter.hasNext())
742         {
743             Element ele = (Element) contentIter.next();
744             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
745             if (attr != null)
746             {
747                 try
748                 {
749                     Key key = keyf.getKey(attr);
750                     collector.addAll(key);
751                 }
752                 catch (NoSuchKeyException e)
753                 {
754                     log.warn("Unable to parse: " + attr, e); //$NON-NLS-1$
755                 }
756             }
757         }
758 
759         return collector.getOsisID();
760     }
761 
762     /**
763      * The text of non-reference notes.
764      *
765      * @return The references in the text
766      */
767     public static String getNotes(Element root)
768     {
769         StringBuffer buffer = new StringBuffer();
770 
771         Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_NOTE).iterator();
772         while (contentIter.hasNext())
773         {
774             Element ele = (Element) contentIter.next();
775             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_TYPE);
776             if (attr == null || !attr.equals(NOTETYPE_REFERENCE))
777             {
778                 if (buffer.length() > 0)
779                 {
780                     buffer.append(' ');
781                 }
782                 buffer.append(OSISUtil.getTextContent(ele.getContent()));
783             }
784         }
785 
786         return buffer.toString();
787     }
788 
789     /**
790      * The text of non-reference notes.
791      *
792      * @return The references in the text
793      */
794     public static String getHeadings(Element root)
795     {
796         StringBuffer buffer = new StringBuffer();
797 
798         Iterator contentIter = getDeepContent(root, OSISUtil.OSIS_ELEMENT_TITLE).iterator();
799         while (contentIter.hasNext())
800         {
801             Element ele = (Element) contentIter.next();
802             getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
803         }
804 
805         return buffer.toString();
806     }
807 
808     private static void getCanonicalContent(Element parent, String sID, Iterator iter, StringBuffer buffer)
809     {
810         if (!isCanonical(parent))
811         {
812             return;
813         }
814 
815         Object data = null;
816         Element ele = null;
817         String eleName = null;
818         String eID = null;
819         while (iter.hasNext())
820         {
821             data = iter.next();
822             if (data instanceof Element)
823             {
824                 ele = (Element) data;
825                 // If the milestoned element is done then quit.
826                 // This should be a eID=, that matches sID, from the same element.
827                 eleName = ele.getName();
828                 eID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
829                 if (eID != null && eID.equals(sID) && eleName.equals(parent.getName()))
830                 {
831                     break;
832                 }
833                 OSISUtil.getCanonicalContent(ele, sID, ele.getContent().iterator(), buffer);
834             }
835             else if (data instanceof Text)
836             {
837                 // make sure that adjacent text elements are separated by whitespace
838                 // TODO(dms): verify that the xml parser does not split words containing entities.
839                 int lastIndex = buffer.length() - 1;
840                 String text = ((Text) data).getText();
841                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0)))
842                 {
843                     buffer.append(' ');
844                 }
845                 buffer.append(text);
846             }
847         }
848     }
849 
850     private static boolean isCanonical(Content content)
851     {
852         boolean result = true;
853         if (content instanceof Element)
854         {
855             Element element = (Element) content;
856 
857             // Ignore extra-biblical text
858             if (EXTRA_BIBLICAL_ELEMENTS.contains(element.getName()))
859             {
860                 String canonical = element.getAttributeValue(OSISUtil.OSIS_ATTR_CANONICAL);
861                 result = Boolean.valueOf(canonical).booleanValue();
862             }
863         }
864 
865         return result;
866     }
867 
868     private static String getTextContent(List fragment)
869     {
870         StringBuffer buffer = new StringBuffer();
871 
872         Iterator contentIter = fragment.iterator();
873         while (contentIter.hasNext())
874         {
875             Content next = (Content) contentIter.next();
876             recurseElement(next, buffer);
877         }
878 
879         return buffer.toString();
880     }
881 
882     /**
883      * Find all the instances of elements of type <code>find</code> under
884      * the element <code>div</code>.
885      */
886     public static Collection getDeepContent(Element div, String name)
887     {
888         List reply = new ArrayList();
889         recurseDeepContent(div, name, reply);
890         return reply;
891     }
892 
893     /**
894      * Walk up the tree from the W to find out what verse we are in.
895      * @param ele The start point for our verse hunt.
896      * @return The verse we are in
897      */
898     public static Verse getVerse(Element ele) throws BookException
899     {
900         if (ele.getName().equals(OSIS_ELEMENT_VERSE))
901         {
902             // If the element is an OSIS Verse then this is fairly easy
903             String osisid = ele.getAttributeValue(OSIS_ATTR_OSISID);
904 
905             try
906             {
907                 return VerseFactory.fromString(osisid);
908             }
909             catch (NoSuchVerseException ex)
910             {
911                 throw new BookException(Msg.OSIS_BADID, ex, new Object[] { osisid });
912             }
913         }
914 
915         // So we just walk up the tree trying to find a verse
916         Parent parent = ele.getParent();
917         if (parent instanceof Element)
918         {
919             return getVerse((Element) parent);
920         }
921 
922         throw new BookException(Msg.MISSING_VERSE);
923     }
924 
925     /**
926      * Helper method to create the boilerplate headers in an OSIS document from
927      * the current metadata object
928      */
929     public static Element createOsisFramework(BookMetaData bmd)
930     {
931         Element osis = factory().createOsis();
932         String osisid = bmd.getInitials();
933 
934         Element work = factory().createWork();
935         work.setAttribute(ATTRIBUTE_WORK_OSISWORK, osisid);
936 
937         Element header = factory().createHeader();
938         header.addContent(work);
939 
940         Element text = factory().createOsisText();
941         text.setAttribute(ATTRIBUTE_TEXT_OSISIDWORK, OSISID_PREFIX_BIBLE + osisid);
942         text.addContent(header);
943 
944         osis.addContent(text);
945 
946         return osis;
947     }
948 
949     /**
950      * Convert a Difference list into a pretty HTML report.
951      * @param diffs List of Difference objects
952      * @return HTML representation
953      */
954     public static List diffToOsis(List diffs)
955     {
956         Element div = factory().createDiv();
957 
958         for (int x = 0; x < diffs.size(); x++)
959         {
960             Difference diff = (Difference) diffs.get(x);
961             EditType editType = diff.getEditType(); // Mode (delete, equal, insert)
962             Text text = factory.createText(diff.getText()); // Text of change.
963 
964             if (EditType.DELETE.equals(editType))
965             {
966                 Element hi = factory().createHI();
967                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_LINETHROUGH);
968                 hi.addContent(text);
969                 div.addContent(hi);
970             }
971             else if (EditType.INSERT.equals(editType))
972             {
973                 Element hi = factory().createHI();
974                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_UNDERLINE);
975                 hi.addContent(text);
976                 div.addContent(hi);
977             }
978             else
979             {
980                 div.addContent(text);
981             }
982         }
983         return div.cloneContent();
984     }
985 
986     public static List rtfToOsis(String rtf)
987     {
988         Element div = factory().createDiv();
989         Stack stack = new Stack();
990         stack.push(div);
991 
992         int strlen = rtf.length();
993 
994         StringBuffer text = new StringBuffer(strlen);
995 
996         int i = 0;
997         for (i = 0; i < strlen; i++)
998         {
999             char curChar = rtf.charAt(i);
1000            if (curChar != '\\')
1001            {
1002                text.append(curChar);
1003                continue;
1004            }
1005
1006            // The following are ordered from most to least common
1007            // and when one is a prefix of another, it follows.
1008
1009            // Used to end all open attributes. Only \qc in our implementation.
1010            if (rtf.startsWith("\\pard", i)) //$NON-NLS-1$
1011            {
1012                Element currentElement = (Element) stack.pop();
1013                currentElement.addContent(text.toString());
1014                text.delete(0, text.length());
1015                stack.clear();
1016                stack.push(div);
1017                i += (i + 5 < strlen && rtf.charAt(i + 5) == ' ') ? 5 : 4;
1018                continue;
1019            }
1020
1021            // Simulate a paragraph break.
1022            if (rtf.startsWith("\\par", i)) //$NON-NLS-1$
1023            {
1024
1025                Element currentElement = (Element) stack.peek();
1026                currentElement.addContent(text.toString());
1027                text.delete(0, text.length());
1028                currentElement.addContent(OSISUtil.factory.createLB());
1029                i += (i + 4 < strlen && rtf.charAt(i + 4) == ' ') ? 4 : 3;
1030                continue;
1031            }
1032
1033            // OSIS does not have the notion of centered text.
1034            // So we define our own
1035            if (rtf.startsWith("\\qc", i)) //$NON-NLS-1$
1036            {
1037                Element centerDiv = OSISUtil.factory.createDiv();
1038                centerDiv.setAttribute(OSIS_ATTR_TYPE, "x-center"); //$NON-NLS-1$
1039                Element currentElement = (Element) stack.peek();
1040                currentElement.addContent(text.toString());
1041                text.delete(0, text.length());
1042                currentElement.addContent(centerDiv);
1043                stack.push(centerDiv);
1044                // skip following space, if any
1045                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1046                continue;
1047            }
1048
1049            // convert Unicode representations to Unicode
1050            if (rtf.startsWith("\\u", i)) //$NON-NLS-1$
1051            {
1052                StringBuffer buf = new StringBuffer();
1053                i += 2;
1054                while (i < strlen)
1055                {
1056                    char curDigit = rtf.charAt(i);
1057                    if (curDigit != '-' && !Character.isDigit(curDigit))
1058                    {
1059                        break;
1060                    }
1061                    buf.append(curDigit);
1062                    i++;
1063                }
1064                // At this point:
1065                // buf contains the numeric representation of the number, 16-bit signed
1066                // charAt(i) is the substitution character if Unicode is not supported
1067                int value = Integer.parseInt(buf.toString());
1068                if (value < 0)
1069                {
1070                    value += 65536;
1071                }
1072                text.append((char) value);
1073                // don't advance since i is on the substitute character.
1074                continue;
1075            }
1076
1077            // close italic and bold
1078            if (rtf.startsWith("\\i0", i) || rtf.startsWith("\\b0", i)) //$NON-NLS-1$ //$NON-NLS-2$
1079            {
1080                Element currentElement = (Element) stack.pop();
1081                currentElement.addContent(text.toString());
1082                text.delete(0, text.length());
1083                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1084                continue;
1085            }
1086
1087            // Skip escaped whitespace
1088            if (rtf.startsWith(" ", i) || rtf.startsWith("\n", i)) //$NON-NLS-1$ //$NON-NLS-2$
1089            {
1090                i += 1;
1091                continue;
1092            }
1093
1094            // start italic
1095            if (rtf.startsWith("\\i", i)) //$NON-NLS-1$
1096            {
1097                Element hiElement = OSISUtil.factory.createHI();
1098                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_ITALIC);
1099                Element currentElement = (Element) stack.peek();
1100                currentElement.addContent(text.toString());
1101                text.delete(0, text.length());
1102                currentElement.addContent(hiElement);
1103                stack.push(hiElement);
1104                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1105                continue;
1106            }
1107
1108            // start bold
1109            if (rtf.startsWith("\\b", i)) //$NON-NLS-1$
1110            {
1111                Element hiElement = OSISUtil.factory.createHI();
1112                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_BOLD);
1113                Element currentElement = (Element) stack.peek();
1114                currentElement.addContent(text.toString());
1115                text.delete(0, text.length());
1116                currentElement.addContent(hiElement);
1117                stack.push(hiElement);
1118                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1119                continue;
1120            }
1121
1122        }
1123
1124        // If there is any text that has not been consumed
1125        if (text.length() > 0)
1126        {
1127            div.addContent(text.toString());
1128        }
1129//        div.addContent(text.toString());
1130//        // If the fragment is already in a document, then use that.
1131//        Document doc = div.getDocument();
1132//        if (doc == null)
1133//        {
1134//            doc = new Document(div);
1135//        }
1136//        SAXEventProvider ep = new JDOMSAXEventProvider(doc);
1137//        ContentHandler osis = new PrettySerializingContentHandler(FormatType.CLASSIC_INDENT);
1138//        try
1139//        {
1140//            ep.provideSAXEvents(osis);
1141//        }
1142//        catch (SAXException e)
1143//        {
1144//            e.printStackTrace();
1145//        }
1146//        System.err.println(osis.toString());
1147        return div.cloneContent();
1148    }
1149
1150    /**
1151     * Find all the instances of elements of type <code>find</code> under
1152     * the element <code>div</code>. For internal use only.
1153     */
1154    private static void recurseDeepContent(Element start, String name, List reply)
1155    {
1156        if (start.getName().equals(name))
1157        {
1158            reply.add(start);
1159        }
1160
1161        Object data = null;
1162        Element ele = null;
1163        Iterator contentIter = start.getContent().iterator();
1164        while (contentIter.hasNext())
1165        {
1166            data = contentIter.next();
1167            if (data instanceof Element)
1168            {
1169                ele = (Element) data;
1170                recurseDeepContent(ele, name, reply);
1171            }
1172        }
1173    }
1174
1175    /**
1176     * If we have a String just add it to the buffer, but if we have an Element
1177     * then try to dig the strings out of it.
1178     */
1179    private static void recurseElement(Object sub, StringBuffer buffer)
1180    {
1181        if (sub instanceof Text)
1182        {
1183            buffer.append(((Text) sub).getText());
1184        }
1185        else if (sub instanceof Element)
1186        {
1187            recurseChildren((Element) sub, buffer);
1188        }
1189        else
1190        {
1191            log.error("unknown type: " + sub.getClass().getName()); //$NON-NLS-1$
1192        }
1193    }
1194
1195    /**
1196     * Helper to extract the Strings from a nest of JDOM elements
1197     * @param ele The JDOM Element to dig into
1198     * @param buffer The place we accumulate strings.
1199     */
1200    private static void recurseChildren(Element ele, StringBuffer buffer)
1201    {
1202        // ele is a JDOM Element that might have a getContent() method
1203        Iterator contentIter = ele.getContent().iterator();
1204        while (contentIter.hasNext())
1205        {
1206            Object sub = contentIter.next();
1207            recurseElement(sub, buffer);
1208        }
1209    }
1210
        // Regular expression text for a reference of the form "strong:" followed
        // by one of the letters G, g, H or h, one or more digits, an optional '!'
        // and any number of ASCII letters. The single capture group holds
        // everything after the "strong:" prefix.
        // NOTE(review): presumably G and H select the Greek and Hebrew Strong's
        // numbers; confirm against the code that uses the pattern.
1211    private static String strongsNumber = "strong:([GgHh][0-9]+!?[A-Za-z]*)"; //$NON-NLS-1$
        // The expression above, compiled once when the class is initialised.
1212    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
1213}
1214