1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 as published by
5    * the Free Software Foundation. This program is distributed in the hope
6    * that it will be useful, but WITHOUT ANY WARRANTY; without even the
7    * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005
 *     The copyright to this program is held by its authors.
19   *
20   * ID: $Id: OSISUtil.java 2221 2012-01-25 21:32:57Z dmsmith $
21   */
22  package org.crosswire.jsword.book;
23  
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.Collection;
27  import java.util.HashSet;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Set;
31  import java.util.Stack;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  import org.crosswire.common.diff.Difference;
36  import org.crosswire.common.diff.EditType;
37  import org.crosswire.common.util.Logger;
38  import org.crosswire.jsword.JSOtherMsg;
39  import org.crosswire.jsword.passage.Key;
40  import org.crosswire.jsword.passage.NoSuchKeyException;
41  import org.crosswire.jsword.passage.NoSuchVerseException;
42  import org.crosswire.jsword.passage.PassageKeyFactory;
43  import org.crosswire.jsword.passage.Verse;
44  import org.crosswire.jsword.passage.VerseFactory;
45  import org.crosswire.jsword.versification.Versification;
46  import org.jdom.Content;
47  import org.jdom.Element;
48  import org.jdom.Parent;
49  import org.jdom.Text;
50  
51  /**
52   * Some simple utilities to help working with OSIS classes.
53   * 
54   * @see gnu.lgpl.License for license details.<br>
 *      The copyright to this program is held by its authors.
56   * @author Joe Walker [joe at eireneh dot com]
57   */
58  public final class OSISUtil {
    /*
     * The following are values for the type attribute on the hi element.
     */

    /**
     * Value of the hi type attribute for acrostic highlighting.
     */
    public static final String HI_ACROSTIC = "acrostic";

    /**
     * Value of the hi type attribute for rendering bold text.
     */
    public static final String HI_BOLD = "bold";

    /**
     * Value of the hi type attribute for rendering emphatic text.
     */
    public static final String HI_EMPHASIS = "emphasis";

    /**
     * Value of the hi type attribute for rendering illuminated text.
     */
    public static final String HI_ILLUMINATED = "illuminated";

    /**
     * Value of the hi type attribute for rendering italic text.
     */
    public static final String HI_ITALIC = "italic";

    /**
     * Value of the hi type attribute for rendering strike-through text.
     */
    public static final String HI_LINETHROUGH = "line-through";

    /**
     * Value of the hi type attribute for rendering normal text.
     */
    public static final String HI_NORMAL = "normal";

    /**
     * Value of the hi type attribute for rendering small caps.
     */
    public static final String HI_SMALL_CAPS = "small-caps";

    /**
     * Value of the hi type attribute for rendering subscripts.
     */
    public static final String HI_SUB = "sub";

    /**
     * Value of the hi type attribute for rendering superscripts.
     */
    public static final String HI_SUPER = "super";

    /**
     * Value of the hi type attribute for rendering underlined text.
     */
    public static final String HI_UNDERLINE = "underline";

    /**
     * Value of the hi type attribute for rendering upper case text.
     */
    public static final String HI_X_CAPS = "x-caps";

    /**
     * Value of the hi type attribute for rendering big text.
     */
    public static final String HI_X_BIG = "x-big";

    /**
     * Value of the hi type attribute for rendering small text.
     */
    public static final String HI_X_SMALL = "x-small";

    /**
     * Value of the hi type attribute for rendering tt text.
     */
    public static final String HI_X_TT = "x-tt";
136 
    /**
     * Constant to help narrow down what we use seg for. In this case the
     * justify right tag.
     */
    public static final String SEG_JUSTIFYRIGHT = "text-align: right;";

    /**
     * Constant to help narrow down what we use seg for. In this case the
     * justify left tag.
     */
    public static final String SEG_JUSTIFYLEFT = "text-align: left;";

    /**
     * Constant to help narrow down what we use seg for. In this case the thml
     * center tag.
     */
    public static final String SEG_CENTER = "text-align: center;";

    /**
     * Constant to help narrow down what we use div for. In this case the thml
     * pre tag.
     */
    public static final String DIV_PRE = "x-pre";

    /**
     * Constant to help narrow down what we use seg for. In this case the color
     * tag. This is only the prefix; the color value is expected to follow it.
     */
    public static final String SEG_COLORPREFIX = "color: ";

    /**
     * Constant to help narrow down what we use seg for. In this case the
     * font-size tag. This is only the prefix; the size value is expected to
     * follow it.
     */
    public static final String SEG_SIZEPREFIX = "font-size: ";
172 
    /**
     * Constant for the prefix shared by x- types.
     */
    public static final String TYPE_X_PREFIX = "x-";

    /**
     * Constant for the study note type.
     */
    public static final String NOTETYPE_STUDY = "x-StudyNote";

    /**
     * Constant for the cross reference note type. Notes of this type are the
     * ones left out by {@link #getNotes(Element)}.
     */
    public static final String NOTETYPE_REFERENCE = "crossReference";

    /**
     * Constant for the variant type segment.
     */
    public static final String VARIANT_TYPE = "x-variant";

    /**
     * Companion of {@link #VARIANT_TYPE}.
     * NOTE(review): presumably the prefix identifying the variant class on a
     * variant segment; the usage is not visible here, confirm against callers.
     */
    public static final String VARIANT_CLASS = "x-class";

    /**
     * Constant for JSword generated content. Used for type or subType.
     */
    public static final String GENERATED_CONTENT = "x-gen";

    /**
     * Constant for the pos (part of speech) type.
     */
    public static final String POS_TYPE = "x-pos";

    /**
     * Constant for the def (dictionary definition) type.
     */
    public static final String DEF_TYPE = "x-def";

    /**
     * Constant for a Strong's numbering lemma.
     */
    public static final String LEMMA_STRONGS = "strong:";

    /**
     * NOTE(review): presumably the prefix of a Robinson's morphology code,
     * by analogy with {@link #LEMMA_STRONGS}; confirm against callers.
     */
    public static final String MORPH_ROBINSONS = "robinson:";

    /**
     * Constant for Strong's numbering morphology.
     */
    public static final String MORPH_STRONGS = "x-StrongsMorph:T";

    /**
     * Constant to help narrow down what we use "q" for. In this case:
     * blockquote.
     */
    public static final String Q_BLOCK = "blockquote";

    /**
     * Constant to help narrow down what we use "q" for. In this case: citation.
     */
    public static final String Q_CITATION = "citation";

    /**
     * Constant to help narrow down what we use "q" for. In this case: embedded.
     */
    public static final String Q_EMBEDDED = "embedded";

    /**
     * Constant to help narrow down what we use "list" for. In this case: an
     * ordered list.
     */
    public static final String LIST_ORDERED = "x-ordered";

    /**
     * Constant to help narrow down what we use "list" for. In this case: an
     * unordered list.
     */
    public static final String LIST_UNORDERED = "x-unordered";

    /**
     * Table roles (on table, row and cell elements) can be "data", the default,
     * or label.
     */
    public static final String TABLE_ROLE_LABEL = "label";

    /*
     * Possible cell alignments (values for the align attribute, see
     * ATTRIBUTE_CELL_ALIGN).
     */
    public static final String CELL_ALIGN_LEFT = "left";
    public static final String CELL_ALIGN_RIGHT = "right";
    public static final String CELL_ALIGN_CENTER = "center";
    public static final String CELL_ALIGN_JUSTIFY = "justify";
    public static final String CELL_ALIGN_START = "start";
    public static final String CELL_ALIGN_END = "end";
257 
    /*
     * Tag names of the OSIS elements used by this class. The factory methods
     * of OSISFactory create elements with these names.
     */
    public static final String OSIS_ELEMENT_ABBR = "abbr";
    public static final String OSIS_ELEMENT_TITLE = "title";
    public static final String OSIS_ELEMENT_TABLE = "table";
    public static final String OSIS_ELEMENT_SPEECH = "speech";
    public static final String OSIS_ELEMENT_SPEAKER = "speaker";
    public static final String OSIS_ELEMENT_ROW = "row";
    public static final String OSIS_ELEMENT_REFERENCE = "reference";
    public static final String OSIS_ELEMENT_NOTE = "note";
    public static final String OSIS_ELEMENT_NAME = "name";
    public static final String OSIS_ELEMENT_Q = "q";
    public static final String OSIS_ELEMENT_LIST = "list";
    public static final String OSIS_ELEMENT_P = "p";
    public static final String OSIS_ELEMENT_ITEM = "item";
    public static final String OSIS_ELEMENT_FIGURE = "figure";
    public static final String OSIS_ELEMENT_FOREIGN = "foreign";
    public static final String OSIS_ELEMENT_W = "w";
    public static final String OSIS_ELEMENT_CHAPTER = "chapter";
    public static final String OSIS_ELEMENT_VERSE = "verse";
    public static final String OSIS_ELEMENT_CELL = "cell";
    public static final String OSIS_ELEMENT_DIV = "div";
    public static final String OSIS_ELEMENT_OSIS = "osis";
    public static final String OSIS_ELEMENT_WORK = "work";
    public static final String OSIS_ELEMENT_HEADER = "header";
    public static final String OSIS_ELEMENT_OSISTEXT = "osisText";
    public static final String OSIS_ELEMENT_SEG = "seg";
    public static final String OSIS_ELEMENT_LG = "lg";
    public static final String OSIS_ELEMENT_L = "l";
    public static final String OSIS_ELEMENT_LB = "lb";
    public static final String OSIS_ELEMENT_HI = "hi";

    /*
     * Attribute names. Note that some constants share a value:
     * ATTRIBUTE_TEXT_OSISIDWORK and ATTRIBUTE_OSISTEXT_OSISIDWORK are both
     * "osisIDWork", and ATTRIBUTE_SPEAKER_WHO and ATTRIBUTE_Q_WHO are both
     * "who".
     */
    public static final String ATTRIBUTE_TEXT_OSISIDWORK = "osisIDWork";
    public static final String ATTRIBUTE_WORK_OSISWORK = "osisWork";
    public static final String OSIS_ATTR_OSISID = "osisID";
    // sID marks the opening milestone and eID the closing one; both carry
    // the same value for a given milestoned element.
    public static final String OSIS_ATTR_SID = "sID";
    public static final String OSIS_ATTR_EID = "eID";
    public static final String ATTRIBUTE_W_LEMMA = "lemma";
    public static final String ATTRIBUTE_FIGURE_SRC = "src";
    public static final String ATTRIBUTE_TABLE_ROLE = "role";
    public static final String ATTRIBUTE_CELL_ALIGN = "align";
    public static final String OSIS_ATTR_TYPE = "type";
    public static final String OSIS_ATTR_CANONICAL = "canonical";
    public static final String OSIS_ATTR_SUBTYPE = "subType";
    public static final String OSIS_ATTR_REF = "osisRef";
    public static final String OSIS_ATTR_LEVEL = "level";
    public static final String ATTRIBUTE_SPEAKER_WHO = "who";
    public static final String ATTRIBUTE_Q_WHO = "who";
    public static final String ATTRIBUTE_W_MORPH = "morph";
    public static final String ATTRIBUTE_OSISTEXT_OSISIDWORK = "osisIDWork";
    // OSIS defines the lang attribute as the one from the xml namespace
    // Typical usage element.setAttribute(OSISUtil.OSIS_ATTR_LANG, lang,
    // Namespace.XML_NAMESPACE);
    public static final String OSIS_ATTR_LANG = "lang";
    public static final String ATTRIBUTE_DIV_BOOK = "book";
    /**
     * Prefix for OSIS IDs that refer to Bibles. It is prepended to the book's
     * initials when the osisText element is built in
     * {@link #createOsisFramework(BookMetaData)}.
     */
    private static final String OSISID_PREFIX_BIBLE = "Bible.";

    /**
     * Names of the elements (note, title and reference) that are treated as
     * extra-biblical: isCanonical only accepts one of these when its canonical
     * attribute is "true".
     */
    private static final Set<String> EXTRA_BIBLICAL_ELEMENTS = new HashSet<String>(Arrays.asList(new String[] {
            OSIS_ELEMENT_NOTE, OSIS_ELEMENT_TITLE, OSIS_ELEMENT_REFERENCE
    }));

    /**
     * The log stream
     */
    private static final Logger log = Logger.getLogger(OSISUtil.class);
325 
    /**
     * Prevent instantiation: this class only offers constants and static
     * helper methods.
     */
    private OSISUtil() {
    }
331 
    /**
     * The single OSISFactory instance handed out by {@link #factory()}.
     */
    private static OSISFactory factory = new OSISFactory();

    /**
     * An accessor for the OSISFactory that creates OSIS objects.
     *
     * @return the shared factory instance; the same object on every call
     */
    public static OSISFactory factory() {
        return factory;
    }
340 
341     /**
342      * A generic way of creating empty Elements of various types
343      */
344     public static class OSISFactory {
345         /**
346         *
347         */
348         public Element createAbbr() {
349             return new Element(OSIS_ELEMENT_ABBR);
350         }
351 
352         /**
353        *
354        */
355         public Element createSeg() {
356             return new Element(OSIS_ELEMENT_SEG);
357         }
358 
359         /**
360          *
361          */
362         public Element createOsisText() {
363             return new Element(OSIS_ELEMENT_OSISTEXT);
364         }
365 
366         /**
367          *
368          */
369         public Element createHeader() {
370             return new Element(OSIS_ELEMENT_HEADER);
371         }
372 
373         /**
374          *
375          */
376         public Element createWork() {
377             return new Element(OSIS_ELEMENT_WORK);
378         }
379 
380         /**
381          *
382          */
383         public Element createOsis() {
384             return new Element(OSIS_ELEMENT_OSIS);
385         }
386 
387         /**
388          *
389          */
390         public Element createDiv() {
391             return new Element(OSIS_ELEMENT_DIV);
392         }
393 
394         /**
395          *
396          */
397         public Element createCell() {
398             return new Element(OSIS_ELEMENT_CELL);
399         }
400 
401         /**
402          *
403          */
404         public Element createHeaderCell() {
405             Element ele = new Element(OSIS_ELEMENT_CELL);
406             ele.setAttribute(ATTRIBUTE_TABLE_ROLE, TABLE_ROLE_LABEL);
407             ele.setAttribute(ATTRIBUTE_CELL_ALIGN, CELL_ALIGN_CENTER);
408             return ele;
409         }
410 
411         /**
412          *
413          */
414         public Element createVerse() {
415             return new Element(OSIS_ELEMENT_VERSE);
416         }
417 
418         /**
419          *
420          */
421         public Element createW() {
422             return new Element(OSIS_ELEMENT_W);
423         }
424 
425         /**
426          *
427          */
428         public Element createFigure() {
429             return new Element(OSIS_ELEMENT_FIGURE);
430         }
431 
432         /**
433          *
434          */
435         public Element createForeign() {
436             return new Element(OSIS_ELEMENT_FOREIGN);
437         }
438 
439         /**
440          *
441          */
442         public Element createItem() {
443             return new Element(OSIS_ELEMENT_ITEM);
444         }
445 
446         /**
447          *
448          */
449         public Element createP() {
450             return new Element(OSIS_ELEMENT_P);
451         }
452 
453         /**
454          *
455          */
456         public Element createList() {
457             return new Element(OSIS_ELEMENT_LIST);
458         }
459 
460         /**
461          *
462          */
463         public Element createQ() {
464             return new Element(OSIS_ELEMENT_Q);
465         }
466 
467         /**
468          *
469          */
470         public Element createName() {
471             return new Element(OSIS_ELEMENT_NAME);
472         }
473 
474         /**
475          *
476          */
477         public Element createNote() {
478             return new Element(OSIS_ELEMENT_NOTE);
479         }
480 
481         /**
482          *
483          */
484         public Element createReference() {
485             return new Element(OSIS_ELEMENT_REFERENCE);
486         }
487 
488         /**
489          *
490          */
491         public Element createRow() {
492             return new Element(OSIS_ELEMENT_ROW);
493         }
494 
495         /**
496          *
497          */
498         public Element createSpeaker() {
499             return new Element(OSIS_ELEMENT_SPEAKER);
500         }
501 
502         /**
503          *
504          */
505         public Element createSpeech() {
506             return new Element(OSIS_ELEMENT_SPEECH);
507         }
508 
509         /**
510          *
511          */
512         public Element createTable() {
513             return new Element(OSIS_ELEMENT_TABLE);
514         }
515 
516         /**
517          *
518          */
519         public Element createTitle() {
520             return new Element(OSIS_ELEMENT_TITLE);
521         }
522 
523         /**
524          * Line Group
525          */
526         public Element createLG() {
527             return new Element(OSIS_ELEMENT_LG);
528         }
529 
530         /**
531          * Line
532          */
533         public Element createL() {
534             return new Element(OSIS_ELEMENT_L);
535         }
536 
537         /**
538          * Line Break
539          */
540         public Element createLB() {
541             return new Element(OSIS_ELEMENT_LB);
542         }
543 
544         /**
545          * Highlight
546          */
547         public Element createHI() {
548             return new Element(OSIS_ELEMENT_HI);
549         }
550 
551         /**
552          * Text
553          */
554         public Text createText(String text) {
555             return new Text(text);
556         }
557     }
558 
559     /**
560      * Dig past the osis and osisText element, if present, to get the meaningful
561      * content of the document.
562      * 
563      * @return a fragment
564      */
565     public static List<Content> getFragment(Element root) {
566         if (root != null) {
567             Element content = root;
568             if (OSISUtil.OSIS_ELEMENT_OSIS.equals(root.getName())) {
569                 content = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
570             }
571 
572             if (OSISUtil.OSIS_ELEMENT_OSISTEXT.equals(root.getName())) {
573                 content = root.getChild(OSISUtil.OSIS_ELEMENT_DIV);
574             }
575 
576             // At this point we are at something interesting, possibly null.
577             // If this was a semantically valid OSIS document then it is a div.
578             // As long as this node has one child dig deeper.
579             if (content != null && content.getContentSize() == 1) {
580                 Content firstChild = content.getContent(0);
581                 if (firstChild instanceof Element && OSISUtil.OSIS_ELEMENT_DIV.equals(((Element) firstChild).getName())) {
582                     content = (Element) firstChild;
583                 }
584             }
585 
586             if (content != null) {
587                 return content.getContent();
588             }
589         }
590         return new ArrayList<Content>();
591     }
592 
593     /**
594      * Get the canonical text from an osis document consisting of a single
595      * fragment. The document is assumed to be valid OSIS2.0 XML. While xml
596      * valid is rigidly defined as meaning that an xml parser can validate the
597      * document, it does not mean that the document is valid OSIS. This is a
598      * semantic problem that is not validated. This method assumes that the root
599      * element is also semantically valid.
600      * 
601      * <p>
602      * This means that the top level element's tagname is osis. This can contain
603      * either a osisText or an osisCorpus. If it is an osisCorpus, then it
604      * contains an osisText. However, as a simplification, since JSword
605      * constructs the whole doc for the fragment, osisCorpus can be ignored.
606      * <p>
607      * The osisText element contains a div element that is either a container or
608      * a milestone. Again, JSword is providing the div element and it will be
609      * provided as a container. It is this div that "contains" the actual
610      * fragment.
611      * </p>
612      * <p>
613      * A verse element may either be a container or a milestone. Sword OSIS
614      * books differ in whether they provide the verse element. Most do not. The
615      * few that do are using the container model, but it has been proposed that
616      * milestones are the best practice.
617      * </p>
618      * 
619      * <p>
620      * The fragment may contain elements that are not a part of the original
621      * text. These are things such as notes.
622      * </p>
623      * 
624      * <p>
625      * Milestones require special handling. Beginning milestones elements have
626      * an sID attribute, while ending milestones have an eID with the same value
627      * as the opening. So everything between the start and the corresponding end
628      * is the content of the element. Also, for a given element, say div, they
629      * have to be properly nested as if they were container elements.
630      * </p>
631      * 
632      * @param root
633      *            the whole osis document.
634      * @return The canonical text without markup
635      */
636     public static String getCanonicalText(Element root) {
637         StringBuilder buffer = new StringBuilder();
638 
639         // Dig past osis, osisText, if present, to get to the real content.
640         List<Content> frag = OSISUtil.getFragment(root);
641 
642         Iterator<Content> dit = frag.iterator();
643         String sID = null;
644         Content data = null;
645         Element ele = null;
646         while (dit.hasNext()) {
647             data = dit.next();
648             if (data instanceof Element) {
649                 ele = (Element) data;
650                 if (!isCanonical(ele)) {
651                     continue;
652                 }
653 
654                 if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
655                     sID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
656                 }
657 
658                 if (sID != null) {
659                     getCanonicalContent(ele, sID, dit, buffer);
660                 } else {
661                     getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
662                 }
663             } else if (data instanceof Text) {
664                 // make sure that adjacent text elements are separated by
665                 // whitespace
666                 // TODO(dms): verify that the xml parser does not split words
667                 // containing entities.
668                 int lastIndex = buffer.length() - 1;
669                 String text = ((Text) data).getText();
670                 // Ignore empty text nodes.
671                 if (text.length() != 0) {
672                     if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0))) {
673                         buffer.append(' ');
674                     }
675                     buffer.append(text);
676                 }
677             }
678         }
679 
680         return buffer.toString().trim();
681     }
682 
683     /**
684      * A simplified plain text version of the data in this Element with all the
685      * markup stripped out.
686      * 
687      * @return The Bible text without markup
688      */
689     public static String getPlainText(Element root) {
690         // Dig past osis, osisText, if present, to get to the real content.
691         return getTextContent(OSISUtil.getFragment(root));
692     }
693 
694     /**
695      * A space separate string containing Strong's numbers.
696      * 
697      * @return The Strong's numbers in the text
698      */
699     public static String getStrongsNumbers(Element root) {
700         StringBuilder buffer = new StringBuilder();
701 
702         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_W)) {
703             Element ele = (Element) content;
704             String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
705             if (attr != null) {
706                 Matcher matcher = strongsNumberPattern.matcher(attr);
707                 while (matcher.find()) {
708                     String strongsNum = matcher.group(1);
709                     if (buffer.length() > 0) {
710                         buffer.append(' ');
711                     }
712                     buffer.append(strongsNum);
713                 }
714             }
715         }
716 
717         return buffer.toString().trim();
718     }
719 
720     /**
721      * A space separate string containing osisID from the reference element.
722      * 
723      * @return The references in the text
724      */
725     public static String getReferences(Versification v11n, Element root) {
726         PassageKeyFactory keyf = PassageKeyFactory.instance();
727         Key collector = keyf.createEmptyKeyList(v11n);
728 
729         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE)) {
730             Element ele = (Element) content;
731             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
732             if (attr != null) {
733                 try {
734                     Key key = keyf.getKey(v11n, attr);
735                     collector.addAll(key);
736                 } catch (NoSuchKeyException e) {
737                     log.warn("Unable to parse: " + attr, e);
738                 }
739             }
740         }
741 
742         return collector.getOsisID();
743     }
744 
745     /**
746      * The text of non-reference notes.
747      * 
748      * @return The references in the text
749      */
750     public static String getNotes(Element root) {
751         StringBuilder buffer = new StringBuilder();
752 
753         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_NOTE)) {
754             Element ele = (Element) content;
755             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_TYPE);
756             if (attr == null || !attr.equals(NOTETYPE_REFERENCE)) {
757                 if (buffer.length() > 0) {
758                     buffer.append(' ');
759                 }
760                 buffer.append(OSISUtil.getTextContent(ele.getContent()));
761             }
762         }
763 
764         return buffer.toString();
765     }
766 
767     /**
768      * The text of non-reference notes.
769      * 
770      * @return The references in the text
771      */
772     public static String getHeadings(Element root) {
773         StringBuilder buffer = new StringBuilder();
774 
775         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_TITLE)) {
776             Element ele = (Element) content;
777             getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
778         }
779 
780         return buffer.toString();
781     }
782 
783     private static void getCanonicalContent(Element parent, String sID, Iterator<Content> iter, StringBuilder buffer) {
784         if (!isCanonical(parent)) {
785             return;
786         }
787 
788         Content data = null;
789         Element ele = null;
790         String eleName = null;
791         String eID = null;
792         while (iter.hasNext()) {
793             data = iter.next();
794             if (data instanceof Element) {
795                 ele = (Element) data;
796                 // If the milestoned element is done then quit.
797                 // This should be a eID=, that matches sID, from the same
798                 // element.
799                 eleName = ele.getName();
800                 eID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
801                 if (eID != null && eID.equals(sID) && eleName.equals(parent.getName())) {
802                     break;
803                 }
804                 OSISUtil.getCanonicalContent(ele, sID, ele.getContent().iterator(), buffer);
805             } else if (data instanceof Text) {
806                 // make sure that adjacent text elements are separated by
807                 // whitespace
808                 // Empty elements also produce whitespace.
809                 // TODO(dms): verify that the xml parser does not split words
810                 // containing entities.
811                 int lastIndex = buffer.length() - 1;
812                 String text = ((Text) data).getText();
813                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))) {
814                     buffer.append(' ');
815                 }
816                 buffer.append(text);
817             }
818         }
819     }
820 
821     private static boolean isCanonical(Content content) {
822         boolean result = true;
823         if (content instanceof Element) {
824             Element element = (Element) content;
825 
826             // Ignore extra-biblical text
827             if (EXTRA_BIBLICAL_ELEMENTS.contains(element.getName())) {
828                 String canonical = element.getAttributeValue(OSISUtil.OSIS_ATTR_CANONICAL);
829                 result = Boolean.valueOf(canonical).booleanValue();
830             }
831         }
832 
833         return result;
834     }
835 
836     private static String getTextContent(List<Content> fragment) {
837         StringBuilder buffer = new StringBuilder();
838 
839         for (Content next : fragment) {
840             recurseElement(next, buffer);
841         }
842 
843         return buffer.toString();
844     }
845 
846     /**
847      * Find all the instances of elements of type <code>find</code> under the
848      * element <code>div</code>.
849      */
850     public static Collection<Content> getDeepContent(Element div, String name) {
851         List<Content> reply = new ArrayList<Content>();
852         recurseDeepContent(div, name, reply);
853         return reply;
854     }
855 
856     /**
857      * Walk up the tree from the W to find out what verse we are in.
858      * 
859      * @param ele
860      *            The start point for our verse hunt.
861      * @return The verse we are in
862      */
863     public static Verse getVerse(Versification v11n, Element ele) throws BookException {
864         if (ele.getName().equals(OSIS_ELEMENT_VERSE)) {
865             // If the element is an OSIS Verse then this is fairly easy
866             String osisid = ele.getAttributeValue(OSIS_ATTR_OSISID);
867 
868             try {
869                 return VerseFactory.fromString(v11n, osisid);
870             } catch (NoSuchVerseException ex) {
871                 throw new BookException(JSOtherMsg.lookupText("OsisID not valid: {0}", osisid), ex);
872             }
873         }
874 
875         // So we just walk up the tree trying to find a verse
876         Parent parent = ele.getParent();
877         if (parent instanceof Element) {
878             return getVerse(v11n, (Element) parent);
879         }
880 
881         throw new BookException(JSOtherMsg.lookupText("Verse element could not be found"));
882     }
883 
884     /**
885      * Helper method to create the boilerplate headers in an OSIS document from
886      * the current metadata object
887      */
888     public static Element createOsisFramework(BookMetaData bmd) {
889         Element osis = factory().createOsis();
890         String osisid = bmd.getInitials();
891 
892         Element work = factory().createWork();
893         work.setAttribute(ATTRIBUTE_WORK_OSISWORK, osisid);
894 
895         Element header = factory().createHeader();
896         header.addContent(work);
897 
898         Element text = factory().createOsisText();
899         text.setAttribute(ATTRIBUTE_TEXT_OSISIDWORK, OSISID_PREFIX_BIBLE + osisid);
900         text.addContent(header);
901 
902         osis.addContent(text);
903 
904         return osis;
905     }
906 
907     /**
908      * Convert a Difference list into a pretty HTML report.
909      * 
910      * @param diffs
911      *            List of Difference objects
912      * @return HTML representation
913      */
914     public static List<Content> diffToOsis(List<Difference> diffs) {
915         Element div = factory().createDiv();
916 
917         for (int x = 0; x < diffs.size(); x++) {
918             Difference diff = diffs.get(x);
919             EditType editType = diff.getEditType(); // Mode (delete, equal,
920                                                     // insert)
921             Text text = factory.createText(diff.getText()); // Text of change.
922 
923             if (EditType.DELETE.equals(editType)) {
924                 Element hi = factory().createHI();
925                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_LINETHROUGH);
926                 hi.addContent(text);
927                 div.addContent(hi);
928             } else if (EditType.INSERT.equals(editType)) {
929                 Element hi = factory().createHI();
930                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_UNDERLINE);
931                 hi.addContent(text);
932                 div.addContent(hi);
933             } else {
934                 div.addContent(text);
935             }
936         }
937         return div.cloneContent();
938     }
939 
940     public static List<Content> rtfToOsis(String rtf) {
941         Element div = factory().createDiv();
942         Stack<Content> stack = new Stack<Content>();
943         stack.push(div);
944 
945         int strlen = rtf.length();
946 
947         StringBuilder text = new StringBuilder(strlen);
948 
949         int i = 0;
950         for (i = 0; i < strlen; i++) {
951             char curChar = rtf.charAt(i);
952             if (curChar != '\\') {
953                 text.append(curChar);
954                 continue;
955             }
956 
957             // The following are ordered from most to least common
958             // and when one is a prefix of another, it follows.
959 
960             // Used to end all open attributes. Only \qc in our implementation.
961             if (rtf.startsWith("\\pard", i)) {
962                 Element currentElement = (Element) stack.pop();
963                 currentElement.addContent(text.toString());
964                 text.delete(0, text.length());
965                 stack.clear();
966                 stack.push(div);
967                 i += (i + 5 < strlen && rtf.charAt(i + 5) == ' ') ? 5 : 4;
968                 continue;
969             }
970 
971             // Simulate a paragraph break.
972             if (rtf.startsWith("\\par", i)) {
973                 Element currentElement = (Element) stack.peek();
974                 currentElement.addContent(text.toString());
975                 text.delete(0, text.length());
976                 currentElement.addContent(OSISUtil.factory.createLB());
977                 i += (i + 4 < strlen && rtf.charAt(i + 4) == ' ') ? 4 : 3;
978                 continue;
979             }
980 
981             // OSIS does not have the notion of centered text.
982             // So we define our own
983             if (rtf.startsWith("\\qc", i)) {
984                 Element centerDiv = OSISUtil.factory.createDiv();
985                 centerDiv.setAttribute(OSIS_ATTR_TYPE, "x-center");
986                 Element currentElement = (Element) stack.peek();
987                 currentElement.addContent(text.toString());
988                 text.delete(0, text.length());
989                 currentElement.addContent(centerDiv);
990                 stack.push(centerDiv);
991                 // skip following space, if any
992                 i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
993                 continue;
994             }
995 
996             // convert Unicode representations to Unicode
997             if (rtf.startsWith("\\u", i)) {
998                 StringBuilder buf = new StringBuilder();
999                 i += 2;
1000                while (i < strlen) {
1001                    char curDigit = rtf.charAt(i);
1002                    if (curDigit != '-' && !Character.isDigit(curDigit)) {
1003                        break;
1004                    }
1005                    buf.append(curDigit);
1006                    i++;
1007                }
1008                // At this point:
1009                // buf contains the numeric representation of the number, 16-bit
1010                // signed
1011                // charAt(i) is the substitution character if Unicode is not
1012                // supported
1013                int value = Integer.parseInt(buf.toString());
1014                if (value < 0) {
1015                    value += 65536;
1016                }
1017                text.append((char) value);
1018                // don't advance since i is on the substitute character.
1019                continue;
1020            }
1021
1022            // close italic and bold
1023            if (rtf.startsWith("\\i0", i) || rtf.startsWith("\\b0", i)) {
1024                Element currentElement = (Element) stack.pop();
1025                currentElement.addContent(text.toString());
1026                text.delete(0, text.length());
1027                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1028                continue;
1029            }
1030
1031            // Skip escaped whitespace
1032            if (rtf.startsWith(" ", i) || rtf.startsWith("\n", i)) {
1033                i += 1;
1034                continue;
1035            }
1036
1037            // start italic
1038            if (rtf.startsWith("\\i", i)) {
1039                Element hiElement = OSISUtil.factory.createHI();
1040                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_ITALIC);
1041                Element currentElement = (Element) stack.peek();
1042                currentElement.addContent(text.toString());
1043                text.delete(0, text.length());
1044                currentElement.addContent(hiElement);
1045                stack.push(hiElement);
1046                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1047                continue;
1048            }
1049
1050            // start bold
1051            if (rtf.startsWith("\\b", i)) {
1052                Element hiElement = OSISUtil.factory.createHI();
1053                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_BOLD);
1054                Element currentElement = (Element) stack.peek();
1055                currentElement.addContent(text.toString());
1056                text.delete(0, text.length());
1057                currentElement.addContent(hiElement);
1058                stack.push(hiElement);
1059                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1060                continue;
1061            }
1062
1063        }
1064
1065        // If there is any text that has not been consumed
1066        if (text.length() > 0) {
1067            div.addContent(text.toString());
1068        }
1069        // div.addContent(text.toString());
1070        // // If the fragment is already in a document, then use that.
1071        // Document doc = div.getDocument();
1072        // if (doc == null)
1073        // {
1074        // doc = new Document(div);
1075        // }
1076        // SAXEventProvider ep = new JDOMSAXEventProvider(doc);
1077        // ContentHandler osis = new
1078        // PrettySerializingContentHandler(FormatType.CLASSIC_INDENT);
1079        // try
1080        // {
1081        // ep.provideSAXEvents(osis);
1082        // }
1083        // catch (SAXException e)
1084        // {
1085        // e.printStackTrace();
1086        // }
1087        // System.err.println(osis.toString());
1088        return div.cloneContent();
1089    }
1090
1091    /**
1092     * Find all the instances of elements of type <code>find</code> under the
1093     * element <code>div</code>. For internal use only.
1094     */
1095    private static void recurseDeepContent(Element start, String name, List<Content> reply) {
1096        if (start.getName().equals(name)) {
1097            reply.add(start);
1098        }
1099
1100//        Content data = null;
1101        Element ele = null;
1102        for (Content data : (List<Content>) start.getContent()) {
1103            if (data instanceof Element) {
1104                ele = (Element) data;
1105                recurseDeepContent(ele, name, reply);
1106            }
1107        }
1108    }
1109
1110    /**
1111     * If we have a String just add it to the buffer, but if we have an Element
1112     * then try to dig the strings out of it.
1113     */
1114    private static void recurseElement(Object sub, StringBuilder buffer) {
1115        if (sub instanceof Text) {
1116            buffer.append(((Text) sub).getText());
1117        } else if (sub instanceof Element) {
1118            recurseChildren((Element) sub, buffer);
1119        } else {
1120            log.error("unknown type: " + sub.getClass().getName());
1121        }
1122    }
1123
1124    /**
1125     * Helper to extract the Strings from a nest of JDOM elements
1126     * 
1127     * @param ele
1128     *            The JDOM Element to dig into
1129     * @param buffer
1130     *            The place we accumulate strings.
1131     */
1132    private static void recurseChildren(Element ele, StringBuilder buffer) {
1133        // ele is a JDOM Element that might have a getContent() method
1134        for (Content sub : (List<Content>) ele.getContent()) {
1135            recurseElement(sub, buffer);
1136        }
1137    }
1138
1139    private static String strongsNumber = "strong:([GgHh][0-9]+!?[A-Za-z]*)";
1140    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
1141}
1142