1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005-2013
18   *     The copyright to this program is held by its authors.
19   *
20   */
21  package org.crosswire.jsword.book;
22  
23  import java.util.ArrayList;
24  import java.util.Arrays;
25  import java.util.Collection;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Set;
30  import java.util.Stack;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  
34  import org.crosswire.common.diff.Difference;
35  import org.crosswire.common.diff.EditType;
36  import org.crosswire.jsword.JSOtherMsg;
37  import org.crosswire.jsword.passage.Key;
38  import org.crosswire.jsword.passage.NoSuchKeyException;
39  import org.crosswire.jsword.passage.NoSuchVerseException;
40  import org.crosswire.jsword.passage.PassageKeyFactory;
41  import org.crosswire.jsword.passage.Verse;
42  import org.crosswire.jsword.passage.VerseFactory;
43  import org.crosswire.jsword.versification.Versification;
44  import org.jdom2.Content;
45  import org.jdom2.Element;
46  import org.jdom2.Parent;
47  import org.jdom2.Text;
48  import org.slf4j.Logger;
49  import org.slf4j.LoggerFactory;
50  
51  /**
52   * Some simple utilities to help working with OSIS classes.
53   * 
54   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
55   * @author Joe Walker
56   */
57  public final class OSISUtil {
    /**
     * The character that getLexicalInformation replaces inside lemma and
     * morph values when morphology is included.
     */
    private static final char SPACE_SEPARATOR = ' ';

    /**
     * The replacement for SPACE_SEPARATOR; it is also placed between a
     * Strong's number and the morph value that follows it.
     */
    private static final char MORPH_INFO_SEPARATOR = '@';
60  
    /*
     * The HI_* constants that follow are values for the type attribute on the
     * hi element.
     */
64      /**
65       * Constant for acrostic highlighting
66       */
67      public static final String HI_ACROSTIC = "acrostic";
68  
69      /**
70       * Constant for rendering bold text
71       */
72      public static final String HI_BOLD = "bold";
73  
74      /**
75       * Constant for rendering emphatic text
76       */
77      public static final String HI_EMPHASIS = "emphasis";
78  
79      /**
80       * Constant for rendering illuminated text.
81       */
82      public static final String HI_ILLUMINATED = "illuminated";
83  
84      /**
85       * Constant for rendering italic text.
86       */
87      public static final String HI_ITALIC = "italic";
88  
89      /**
90       * Constant for rendering strike-through text
91       */
92      public static final String HI_LINETHROUGH = "line-through";
93  
94      /**
95       * Constant for rendering normal text.
96       */
97      public static final String HI_NORMAL = "normal";
98  
99      /**
100      * Constant for rendering small caps
101      */
102     public static final String HI_SMALL_CAPS = "small-caps";
103 
104     /**
105      * Constant for rendering subscripts
106      */
107     public static final String HI_SUB = "sub";
108 
109     /**
110      * Constant for rendering superscripts
111      */
112     public static final String HI_SUPER = "super";
113 
114     /**
115      * Constant for rendering underlined text
116      */
117     public static final String HI_UNDERLINE = "underline";
118 
119     /**
120      * Constant for rendering upper case text
121      */
122     public static final String HI_X_CAPS = "x-caps";
123 
124     /**
125      * Constant for rendering big text
126      */
127     public static final String HI_X_BIG = "x-big";
128 
129     /**
130      * Constant for rendering small text
131      */
132     public static final String HI_X_SMALL = "x-small";
133 
134     /**
135      * Constant for rendering tt text
136      */
137     public static final String HI_X_TT = "x-tt";
138 
139     /**
140      * Constant to help narrow down what we use seg for. In this case the
141      * justify right tag
142      */
143     public static final String SEG_JUSTIFYRIGHT = "text-align: right;";
144 
    /**
     * Constant to help narrow down what we use seg for. In this case the
     * justify left tag
     */
149     public static final String SEG_JUSTIFYLEFT = "text-align: left;";
150 
151     /**
152      * Constant to help narrow down what we use seg for. In this case the thml
153      * center tag
154      */
155     public static final String SEG_CENTER = "text-align: center;";
156 
157     /**
158      * Constant to help narrow down what we use div for. In this case the thml
159      * pre tag
160      */
161     public static final String DIV_PRE = "x-pre";
162 
163     /**
164      * Constant to help narrow down what we use seg for. In this case the color
165      * tag
166      */
167     public static final String SEG_COLORPREFIX = "color: ";
168 
169     /**
170      * Constant to help narrow down what we use seg for. In this case the
171      * font-size tag
172      */
173     public static final String SEG_SIZEPREFIX = "font-size: ";
174 
175     /**
176      * Constant for x- types
177      */
178     public static final String TYPE_X_PREFIX = "x-";
179 
180     /**
181      * Constant for the study note type
182      */
183     public static final String NOTETYPE_STUDY = "x-StudyNote";
184 
185     /**
186      * Constant for the cross reference note type
187      */
188     public static final String NOTETYPE_REFERENCE = "crossReference";
189 
190     /**
191      * Constant for the variant type segment
192      */
193     public static final String VARIANT_TYPE = "x-variant";
194     public static final String VARIANT_CLASS = "x-";
195 
196     /**
197      * Constant for JSword generated content. Used for type or subType.
198      */
199     public static final String GENERATED_CONTENT = "x-gen";
200 
201     /**
202      * Constant for the pos (part of speech) type.
203      */
204     public static final String POS_TYPE = "x-pos";
205 
206     /**
207      * Constant for the def (dictionary definition) type
208      */
209     public static final String DEF_TYPE = "x-def";
210 
211     /**
212      * Constant for a Strong's numbering lemma
213      */
214     public static final String LEMMA_STRONGS = "strong:";
215     public static final String LEMMA_MISC = "lemma:";
216     public static final String MORPH_ROBINSONS = "robinson:";
217 
218     /**
219      * Constant for Strong's numbering morphology
220      */
221     public static final String MORPH_STRONGS = "x-StrongsMorph:T";
222 
223     /**
224      * Constant to help narrow down what we use "q" for. In this case:
225      * blockquote
226      */
227     public static final String Q_BLOCK = "blockquote";
228 
229     /**
230      * Constant to help narrow down what we use "q" for. In this case: citation
231      */
232     public static final String Q_CITATION = "citation";
233 
234     /**
235      * Constant to help narrow down what we use "q" for. In this case: embedded
236      */
237     public static final String Q_EMBEDDED = "embedded";
238 
239     /**
240      * Constant to help narrow down what we use "list" for.
241      */
242     public static final String LIST_ORDERED = "x-ordered";
243     public static final String LIST_UNORDERED = "x-unordered";
244 
245     /**
246      * Table roles (on table, row and cell elements) can be "data", the default,
247      * or label.
248      */
249     public static final String TABLE_ROLE_LABEL = "label";
250 
251     /**
252      * Possible cell alignments
253      */
254     public static final String CELL_ALIGN_LEFT = "left";
255     public static final String CELL_ALIGN_RIGHT = "right";
256     public static final String CELL_ALIGN_CENTER = "center";
257     public static final String CELL_ALIGN_JUSTIFY = "justify";
258     public static final String CELL_ALIGN_START = "start";
259     public static final String CELL_ALIGN_END = "end";
260 
    /*
     * Names of OSIS elements. Most of them are used by OSISFactory to create
     * empty elements; note, title, reference, w and verse are also used when
     * text is extracted from a document.
     */
    public static final String OSIS_ELEMENT_ABBR = "abbr";
    public static final String OSIS_ELEMENT_TITLE = "title";
    public static final String OSIS_ELEMENT_TABLE = "table";
    public static final String OSIS_ELEMENT_SPEECH = "speech";
    public static final String OSIS_ELEMENT_SPEAKER = "speaker";
    public static final String OSIS_ELEMENT_ROW = "row";
    public static final String OSIS_ELEMENT_REFERENCE = "reference";
    public static final String OSIS_ELEMENT_NOTE = "note";
    public static final String OSIS_ELEMENT_NAME = "name";
    public static final String OSIS_ELEMENT_Q = "q";
    public static final String OSIS_ELEMENT_LIST = "list";
    public static final String OSIS_ELEMENT_P = "p";
    public static final String OSIS_ELEMENT_ITEM = "item";
    public static final String OSIS_ELEMENT_FIGURE = "figure";
    public static final String OSIS_ELEMENT_FOREIGN = "foreign";
    public static final String OSIS_ELEMENT_W = "w";
    public static final String OSIS_ELEMENT_CHAPTER = "chapter";
    public static final String OSIS_ELEMENT_VERSE = "verse";
    public static final String OSIS_ELEMENT_CELL = "cell";
    public static final String OSIS_ELEMENT_DIV = "div";
    public static final String OSIS_ELEMENT_OSIS = "osis";
    public static final String OSIS_ELEMENT_WORK = "work";
    public static final String OSIS_ELEMENT_HEADER = "header";
    public static final String OSIS_ELEMENT_OSISTEXT = "osisText";
    public static final String OSIS_ELEMENT_SEG = "seg";
    public static final String OSIS_ELEMENT_LG = "lg";
    public static final String OSIS_ELEMENT_L = "l";
    public static final String OSIS_ELEMENT_LB = "lb";
    public static final String OSIS_ELEMENT_HI = "hi";
290 
    /*
     * Names of attributes found on OSIS elements.
     * ATTRIBUTE_TEXT_OSISIDWORK and ATTRIBUTE_OSISTEXT_OSISIDWORK hold the
     * same value, as do ATTRIBUTE_SPEAKER_WHO and ATTRIBUTE_Q_WHO.
     */
    public static final String ATTRIBUTE_TEXT_OSISIDWORK = "osisIDWork";
    public static final String ATTRIBUTE_WORK_OSISWORK = "osisWork";
    public static final String OSIS_ATTR_OSISID = "osisID";
    public static final String OSIS_ATTR_SID = "sID";
    public static final String OSIS_ATTR_EID = "eID";
    public static final String ATTRIBUTE_W_LEMMA = "lemma";
    public static final String ATTRIBUTE_FIGURE_SRC = "src";
    public static final String ATTRIBUTE_TABLE_ROLE = "role";
    public static final String ATTRIBUTE_CELL_ALIGN = "align";
    public static final String OSIS_ATTR_TYPE = "type";
    public static final String OSIS_ATTR_CANONICAL = "canonical";
    public static final String OSIS_ATTR_SUBTYPE = "subType";
    public static final String OSIS_ATTR_REF = "osisRef";
    public static final String OSIS_ATTR_LEVEL = "level";
    public static final String ATTRIBUTE_SPEAKER_WHO = "who";
    public static final String ATTRIBUTE_Q_WHO = "who";
    public static final String ATTRIBUTE_W_MORPH = "morph";
    public static final String ATTRIBUTE_OSISTEXT_OSISIDWORK = "osisIDWork";
    // OSIS defines the lang attribute as the one from the xml namespace
    // Typical usage element.setAttribute(OSISUtil.OSIS_ATTR_LANG, lang,
    // Namespace.XML_NAMESPACE);
    public static final String OSIS_ATTR_LANG = "lang";
    public static final String ATTRIBUTE_DIV_BOOK = "book";
314 
315     /**
316      * Prefix for OSIS IDs that refer to Bibles
317      */
318     private static final String OSISID_PREFIX_BIBLE = "Bible.";
319 
320     private static final Set<String> EXTRA_BIBLICAL_ELEMENTS = new HashSet<String>(Arrays.asList(new String[] {
321             OSIS_ELEMENT_NOTE, OSIS_ELEMENT_TITLE, OSIS_ELEMENT_REFERENCE
322     }));
323 
324     /**
325      * The log stream
326      */
327     private static final Logger log = LoggerFactory.getLogger(OSISUtil.class);
328 
329 
330     /**
331      * Prevent instantiation
332      */
333     private OSISUtil() {
334     }
335 
336     private static OSISFactory factory = new OSISFactory();
337 
338     /**
339      * An accessor for the OSISFactory that creates OSIS objects
340      * 
341      * @return the singleton OSISFactory
342      */
343     public static OSISFactory factory() {
344         return factory;
345     }
346 
347     /**
348      * A generic way of creating empty Elements of various types
349      */
350     public static class OSISFactory {
351         /**
352         * @return an abbr element
353         */
354         public Element createAbbr() {
355             return new Element(OSIS_ELEMENT_ABBR);
356         }
357 
358         /**
359          * @return a seg element
360          */
361         public Element createSeg() {
362             return new Element(OSIS_ELEMENT_SEG);
363         }
364 
365         /**
366          * @return an osisText element
367          */
368         public Element createOsisText() {
369             return new Element(OSIS_ELEMENT_OSISTEXT);
370         }
371 
372         /**
373          * @return a header element
374          */
375         public Element createHeader() {
376             return new Element(OSIS_ELEMENT_HEADER);
377         }
378 
379         /**
380          * @return a work element
381          */
382         public Element createWork() {
383             return new Element(OSIS_ELEMENT_WORK);
384         }
385 
386         /**
387          * @return an osis element
388          */
389         public Element createOsis() {
390             return new Element(OSIS_ELEMENT_OSIS);
391         }
392 
393         /**
394          * @return a div element
395          */
396         public Element createDiv() {
397             return new Element(OSIS_ELEMENT_DIV);
398         }
399 
400         /**
401          * @return a cell element
402          */
403         public Element createCell() {
404             return new Element(OSIS_ELEMENT_CELL);
405         }
406 
407         /**
408          * @return a header cell element (akin to HTML's TH)
409          */
410         public Element createHeaderCell() {
411             Element ele = new Element(OSIS_ELEMENT_CELL);
412             ele.setAttribute(ATTRIBUTE_TABLE_ROLE, TABLE_ROLE_LABEL);
413             ele.setAttribute(ATTRIBUTE_CELL_ALIGN, CELL_ALIGN_CENTER);
414             return ele;
415         }
416 
417         /**
418          * @return a verse element
419          */
420         public Element createVerse() {
421             return new Element(OSIS_ELEMENT_VERSE);
422         }
423 
424         /**
425          * @return a w element
426          */
427         public Element createW() {
428             return new Element(OSIS_ELEMENT_W);
429         }
430 
431         /**
432          * @return a figure element
433          */
434         public Element createFigure() {
435             return new Element(OSIS_ELEMENT_FIGURE);
436         }
437 
438         /**
439          * @return a foreign element
440          */
441         public Element createForeign() {
442             return new Element(OSIS_ELEMENT_FOREIGN);
443         }
444 
445         /**
446          * @return an item element
447          */
448         public Element createItem() {
449             return new Element(OSIS_ELEMENT_ITEM);
450         }
451 
452         /**
453          * @return a p element
454          */
455         public Element createP() {
456             return new Element(OSIS_ELEMENT_P);
457         }
458 
459         /**
460          * @return a list element
461          */
462         public Element createList() {
463             return new Element(OSIS_ELEMENT_LIST);
464         }
465 
466         /**
467          * @return a q element
468          */
469         public Element createQ() {
470             return new Element(OSIS_ELEMENT_Q);
471         }
472 
473         /**
474          * @return a name element
475          */
476         public Element createName() {
477             return new Element(OSIS_ELEMENT_NAME);
478         }
479 
480         /**
481          * @return a note element
482          */
483         public Element createNote() {
484             return new Element(OSIS_ELEMENT_NOTE);
485         }
486 
487         /**
488          * @return a reference element
489          */
490         public Element createReference() {
491             return new Element(OSIS_ELEMENT_REFERENCE);
492         }
493 
494         /**
495          * @return a row element
496          */
497         public Element createRow() {
498             return new Element(OSIS_ELEMENT_ROW);
499         }
500 
501         /**
502          * @return a speaker element
503          */
504         public Element createSpeaker() {
505             return new Element(OSIS_ELEMENT_SPEAKER);
506         }
507 
508         /**
509          * @return a speech element
510          */
511         public Element createSpeech() {
512             return new Element(OSIS_ELEMENT_SPEECH);
513         }
514 
515         /**
516          * @return a table element
517          */
518         public Element createTable() {
519             return new Element(OSIS_ELEMENT_TABLE);
520         }
521 
522        /**
523         * @return a title element
524         */
525        public Element createTitle() {
526            return new Element(OSIS_ELEMENT_TITLE);
527        }
528 
529         /**
530          * Create a title marked as generated.
531          * 
532          * @return a generated title element
533          */
534         public Element createGeneratedTitle() {
535             Element title = new Element(OSIS_ELEMENT_TITLE);
536             title.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.GENERATED_CONTENT);
537             return title;
538         }
539 
540         /**
541          * Line Group
542          * 
543          * @return a lg element
544          */
545         public Element createLG() {
546             return new Element(OSIS_ELEMENT_LG);
547         }
548 
549         /**
550          * Line
551          * 
552          * @return a l element
553          */
554         public Element createL() {
555             return new Element(OSIS_ELEMENT_L);
556         }
557 
558         /**
559          * Line Break
560          * 
561          * @return a lb element
562          */
563         public Element createLB() {
564             return new Element(OSIS_ELEMENT_LB);
565         }
566 
567         /**
568          * Highlight
569          * 
570          * @return a hi element
571          */
572         public Element createHI() {
573             return new Element(OSIS_ELEMENT_HI);
574         }
575 
576         /**
577          * Text
578          * 
579          * @param text the text for this element
580          * @return a text element
581          */
582         public Text createText(String text) {
583             return new Text(text);
584         }
585     }
586 
587     /**
588      * Dig past the osis and osisText element, if present, to get the meaningful
589      * content of the document.
590      * 
591      * @param root the element from which to get a fragment
592      * @return a fragment
593      */
594     public static List<Content> getFragment(Element root) {
595         if (root != null) {
596             Element content = root;
597             if (OSISUtil.OSIS_ELEMENT_OSIS.equals(root.getName())) {
598                 content = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
599             }
600 
601             if (OSISUtil.OSIS_ELEMENT_OSISTEXT.equals(root.getName())) {
602                 content = root.getChild(OSISUtil.OSIS_ELEMENT_DIV);
603             }
604 
605             // At this point we are at something interesting, possibly null.
606             // If this was a semantically valid OSIS document then it is a div.
607             // As long as this node has one child dig deeper.
608             if (content != null && content.getContentSize() == 1) {
609                 Content firstChild = content.getContent(0);
610                 if (firstChild instanceof Element && OSISUtil.OSIS_ELEMENT_DIV.equals(((Element) firstChild).getName())) {
611                     content = (Element) firstChild;
612                 }
613             }
614 
615             if (content != null) {
616                 return content.getContent();
617             }
618         }
619         return new ArrayList<Content>();
620     }
621 
622     /**
623      * Get the canonical text from an osis document consisting of a single
624      * fragment. The document is assumed to be valid OSIS2.0 XML. While xml
625      * valid is rigidly defined as meaning that an xml parser can validate the
626      * document, it does not mean that the document is valid OSIS. This is a
627      * semantic problem that is not validated. This method assumes that the root
628      * element is also semantically valid.
629      * 
630      * <p>
631      * This means that the top level element's tagname is osis. This can contain
632      * either a osisText or an osisCorpus. If it is an osisCorpus, then it
633      * contains an osisText. However, as a simplification, since JSword
634      * constructs the whole doc for the fragment, osisCorpus can be ignored.
635      * <p>
636      * The osisText element contains a div element that is either a container or
637      * a milestone. Again, JSword is providing the div element and it will be
638      * provided as a container. It is this div that "contains" the actual
639      * fragment.
640      * </p>
641      * <p>
642      * A verse element may either be a container or a milestone. Sword OSIS
643      * books differ in whether they provide the verse element. Most do not. The
644      * few that do are using the container model, but it has been proposed that
645      * milestones are the best practice.
646      * </p>
647      * 
648      * <p>
649      * The fragment may contain elements that are not a part of the original
650      * text. These are things such as notes.
651      * </p>
652      * 
653      * <p>
654      * Milestones require special handling. Beginning milestones elements have
655      * an sID attribute, while ending milestones have an eID with the same value
656      * as the opening. So everything between the start and the corresponding end
657      * is the content of the element. Also, for a given element, say div, they
658      * have to be properly nested as if they were container elements.
659      * </p>
660      * 
661      * @param root
662      *            the whole osis document.
663      * @return The canonical text without markup
664      */
665     public static String getCanonicalText(Element root) {
666         // if someone passes a root element which has text in, we need to check whether it's worth processing.
667         // For example. where you have a non-canonical title being passed in, we deal with this here.
668         if (!isCanonical(root)) {
669             //no point in continuing...
670             return "";
671         }
672 
673         StringBuilder buffer = new StringBuilder();
674 
675         // Dig past osis, osisText, if present, to get to the real content.
676         List<Content> frag = OSISUtil.getFragment(root);
677 
678         Iterator<Content> dit = frag.iterator();
679         String sID = null;
680         Content data = null;
681         Element ele = null;
682         while (dit.hasNext()) {
683             data = dit.next();
684             if (data instanceof Element) {
685                 ele = (Element) data;
686                 if (!isCanonical(ele)) {
687                     continue;
688                 }
689 
690                 if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
691                     sID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
692                 }
693 
694                 if (sID != null) {
695                     getCanonicalContent(ele, sID, dit, buffer);
696                 } else {
697                     getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
698                 }
699             } else if (data instanceof Text) {
700                 // make sure that adjacent text elements are separated by
701                 // whitespace
702                 // TODO(dms): verify that the xml parser does not split words
703                 // containing entities.
704                 int lastIndex = buffer.length() - 1;
705                 String text = ((Text) data).getText();
706                 // Ignore empty text nodes and do not add 
707                 if (text.length() != 0) {
708                     //do not add spaces when within a OSIS seg
709                     if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0))) {
710                         buffer.append(' ');
711                     }
712                     buffer.append(text);
713                 }
714             }
715         }
716 
717         return buffer.toString().trim();
718     }
719 
720     /**
721      * A simplified plain text version of the data in this Element with all the
722      * markup stripped out.
723      * 
724      * @param root
725      *            the whole osis document.
726      * @return The Bible text without markup
727      */
728     public static String getPlainText(Element root) {
729         // Dig past osis, osisText, if present, to get to the real content.
730         return getTextContent(OSISUtil.getFragment(root));
731     }
732 
733     /**
734      * A space separate string containing Strong's numbers.
735      * 
736      * @param root
737      *            the whole osis document.
738      * @return The Strong's numbers in the text
739      */
740     public static String getStrongsNumbers(Element root) {
741         return getLexicalInformation(root, false);
742     }
743 
744     /**
745      * A '@' separated list of morphologies and strong numbers
746      * 
747      * @param root the osis element in question
748      * @return the string
749      */
750     public static String getMorphologiesWithStrong(Element root) {
751         return getLexicalInformation(root, true);
752     }
753 
754     /**
755      * concatenates strong and morphology information together
756      * 
757      * @param root the osis element in question
758      * @param includeMorphology whether to include morphology
759      * @return root of the element
760      */
761     public static String getLexicalInformation(Element root, boolean includeMorphology) {
762         StringBuilder buffer = new StringBuilder();
763 
764         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_W)) {
765             Element ele = (Element) content;
766             String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
767             if (attr != null) {
768                 Matcher matcher = strongsNumberPattern.matcher(attr);
769                 while (matcher.find()) {
770                     String strongsNum = matcher.group(1);
771                     if (buffer.length() > 0) {
772                         buffer.append(' ');
773                     }
774 
775                     if (includeMorphology) {
776                         //if including morphology, we want 1 big field, separated with '@'
777                         strongsNum = strongsNum.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR);
778                     }
779                     buffer.append(strongsNum);
780 
781                     if (includeMorphology) {
782                         //also include morphology if available
783                         String morph = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_MORPH);
784                         if (morph != null && morph.length() != 0) {
785                             buffer.append(MORPH_INFO_SEPARATOR);
786                             buffer.append(morph.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR));
787                         }
788                     }
789                 }
790             }
791         }
792 
793         return buffer.toString().trim();
794     }
795 
796     /**
797      * A space separate string containing osisID from the reference element.
798      * We pass book and key because the xref may not be valid and it needs to be reported.
799      *
800      * @param book the book to which the references refer
801      * @param key the verse containing the cross references
802      * @param v11n the versification
803      * @param root the osis element in question
804      * @return The references in the text
805      */
806     public static String getReferences(Book book, Key key, Versification v11n, Element root) {
807         PassageKeyFactory keyf = PassageKeyFactory.instance();
808         Key collector = keyf.createEmptyKeyList(v11n);
809 
810         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE)) {
811             Element ele = (Element) content;
812             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
813             if (attr != null) {
814                 try {
815                     collector.addAll(keyf.getKey(v11n, attr));
816                 } catch (NoSuchKeyException e) {
817                     DataPolice.report(book, key, "Unable to parse: " + attr + " - No such reference:" + e.getMessage());
818                 }
819             }
820         }
821 
822         return collector.getOsisID();
823     }
824 
825     /**
826      * The text of non-reference notes.
827      * 
828      * @param root the whole OSIS document
829      * @return The references in the text
830      */
831     public static String getNotes(Element root) {
832         StringBuilder buffer = new StringBuilder();
833 
834         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_NOTE)) {
835             Element ele = (Element) content;
836             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_TYPE);
837             if (attr == null || !attr.equals(NOTETYPE_REFERENCE)) {
838                 if (buffer.length() > 0) {
839                     buffer.append(' ');
840                 }
841                 buffer.append(OSISUtil.getTextContent(ele.getContent()));
842             }
843         }
844 
845         return buffer.toString();
846     }
847 
848     /**
849      * The text of non-reference notes.
850      * 
851      * @param root the whole OSIS document
852      * @return The references in the text
853      */
854     public static String getHeadings(Element root) {
855         StringBuilder buffer = new StringBuilder();
856 
857         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_TITLE)) {
858             Element ele = (Element) content;
859 
860             if (buffer.length() > 0) {
861                 buffer.append(' ');
862             }
863             buffer.append(OSISUtil.getTextContent(ele.getContent()));
864         }
865 
866         return buffer.toString();
867     }
868 
869     private static void getCanonicalContent(Element parent, String sID, Iterator<Content> iter, StringBuilder buffer) {
870         if (!isCanonical(parent)) {
871             return;
872         }
873 
874         Content data = null;
875         Element ele = null;
876         String eleName = null;
877         String eID = null;
878         while (iter.hasNext()) {
879             data = iter.next();
880             if (data instanceof Element) {
881                 ele = (Element) data;
882                 // If the milestoned element is done then quit.
883                 // This should be a eID=, that matches sID, from the same
884                 // element.
885                 eleName = ele.getName();
886                 eID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
887                 if (eID != null && eID.equals(sID) && eleName.equals(parent.getName())) {
888                     break;
889                 }
890                 OSISUtil.getCanonicalContent(ele, sID, ele.getContent().iterator(), buffer);
891             } else if (data instanceof Text) {
892                 // make sure that adjacent text elements are separated by
893                 // whitespace
894                 // Empty elements also produce whitespace.
895                 // TODO(dms): verify that the xml parser does not split words
896                 // containing entities.
897                 int lastIndex = buffer.length() - 1;
898                 String text = ((Text) data).getText();
899                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))  && !OSIS_ELEMENT_SEG.equals(parent.getName())) {
900                     buffer.append(' ');
901                 }
902                 buffer.append(text);
903             }
904         }
905     }
906 
907     private static boolean isCanonical(Content content) {
908         boolean result = true;
909         if (content instanceof Element) {
910             Element element = (Element) content;
911 
912             // Ignore extra-biblical text
913             if (EXTRA_BIBLICAL_ELEMENTS.contains(element.getName())) {
914                 String canonical = element.getAttributeValue(OSISUtil.OSIS_ATTR_CANONICAL);
915                 result = Boolean.valueOf(canonical).booleanValue();
916             }
917         }
918 
919         return result;
920     }
921 
922     private static String getTextContent(List<Content> fragment) {
923         StringBuilder buffer = new StringBuilder();
924 
925         for (Content next : fragment) {
926             recurseElement(next, buffer);
927         }
928 
929         return buffer.toString();
930     }
931 
932     /**
933      * Find all the instances of elements of type <code>find</code> under the
934      * element <code>div</code>.
935      * 
936      * @param div the element to trawl
937      * @param name the element name to search
938      * @return the collection of matching content
939      */
940     public static Collection<Content> getDeepContent(Element div, String name) {
941         List<Content> reply = new ArrayList<Content>();
942         recurseDeepContent(div, name, reply);
943         return reply;
944     }
945 
946     /**
947      * Walk up the tree from the W to find out what verse we are in.
948      * 
949      * @param v11n the versification
950      * @param ele
951      *            The start point for our verse hunt.
952      * @return The verse we are in
953      * @throws BookException 
954      */
955     public static Verse getVerse(Versification v11n, Element ele) throws BookException {
956         if (ele.getName().equals(OSIS_ELEMENT_VERSE)) {
957             // If the element is an OSIS Verse then this is fairly easy
958             String osisid = ele.getAttributeValue(OSIS_ATTR_OSISID);
959 
960             try {
961                 return VerseFactory.fromString(v11n, osisid);
962             } catch (NoSuchVerseException ex) {
963                 throw new BookException(JSOtherMsg.lookupText("OsisID not valid: {0}", osisid), ex);
964             }
965         }
966 
967         // So we just walk up the tree trying to find a verse
968         Parent parent = ele.getParent();
969         if (parent instanceof Element) {
970             return getVerse(v11n, (Element) parent);
971         }
972 
973         throw new BookException(JSOtherMsg.lookupText("Verse element could not be found"));
974     }
975 
976     /**
977      * Helper method to create the boilerplate headers in an OSIS document from
978      * the current metadata object
979      * 
980      * @param bmd the book's meta data
981      * @return the root of an OSIS document
982      */
983     public static Element createOsisFramework(BookMetaData bmd) {
984         Element osis = factory().createOsis();
985         String osisid = bmd.getInitials();
986 
987         Element work = factory().createWork();
988         work.setAttribute(ATTRIBUTE_WORK_OSISWORK, osisid);
989 
990         Element header = factory().createHeader();
991         header.addContent(work);
992 
993         Element text = factory().createOsisText();
994         text.setAttribute(ATTRIBUTE_TEXT_OSISIDWORK, OSISID_PREFIX_BIBLE + osisid);
995         text.addContent(header);
996 
997         osis.addContent(text);
998 
999         return osis;
1000    }
1001
1002    /**
1003     * Convert a Difference list into a pretty HTML report.
1004     * 
1005     * @param diffs
1006     *            List of Difference objects
1007     * @return HTML representation
1008     */
1009    public static List<Content> diffToOsis(List<Difference> diffs) {
1010        Element div = factory().createDiv();
1011
1012        for (int x = 0; x < diffs.size(); x++) {
1013            Difference diff = diffs.get(x);
1014            EditType editType = diff.getEditType(); // Mode (delete, equal,
1015                                                    // insert)
1016            Text text = factory.createText(diff.getText()); // Text of change.
1017
1018            if (EditType.DELETE.equals(editType)) {
1019                Element hi = factory().createHI();
1020                hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_LINETHROUGH);
1021                hi.addContent(text);
1022                div.addContent(hi);
1023            } else if (EditType.INSERT.equals(editType)) {
1024                Element hi = factory().createHI();
1025                hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_UNDERLINE);
1026                hi.addContent(text);
1027                div.addContent(hi);
1028            } else {
1029                div.addContent(text);
1030            }
1031        }
1032        return div.cloneContent();
1033    }
1034
1035    public static List<Content> rtfToOsis(String rtf) {
1036        Element div = factory().createDiv();
1037        Stack<Content> stack = new Stack<Content>();
1038        stack.push(div);
1039
1040        int strlen = rtf.length();
1041
1042        StringBuilder text = new StringBuilder(strlen);
1043
1044        int i = 0;
1045        for (i = 0; i < strlen; i++) {
1046            char curChar = rtf.charAt(i);
1047            if (curChar != '\\') {
1048                text.append(curChar);
1049                continue;
1050            }
1051
1052            // The following are ordered from most to least common
1053            // and when one is a prefix of another, it follows.
1054
1055            // Used to end all open attributes. Only \qc in our implementation.
1056            if (rtf.startsWith("\\pard", i)) {
1057                Element currentElement = (Element) stack.pop();
1058                currentElement.addContent(text.toString());
1059                text.delete(0, text.length());
1060                stack.clear();
1061                stack.push(div);
1062                i += (i + 5 < strlen && rtf.charAt(i + 5) == ' ') ? 5 : 4;
1063                continue;
1064            }
1065
1066            // Simulate a paragraph break.
1067            if (rtf.startsWith("\\par", i)) {
1068                Element currentElement = (Element) stack.peek();
1069                currentElement.addContent(text.toString());
1070                text.delete(0, text.length());
1071                currentElement.addContent(OSISUtil.factory.createLB());
1072                i += (i + 4 < strlen && rtf.charAt(i + 4) == ' ') ? 4 : 3;
1073                continue;
1074            }
1075
1076            // OSIS does not have the notion of centered text.
1077            // So we define our own
1078            if (rtf.startsWith("\\qc", i)) {
1079                Element centerDiv = OSISUtil.factory.createDiv();
1080                centerDiv.setAttribute(OSIS_ATTR_TYPE, "x-center");
1081                Element currentElement = (Element) stack.peek();
1082                currentElement.addContent(text.toString());
1083                text.delete(0, text.length());
1084                currentElement.addContent(centerDiv);
1085                stack.push(centerDiv);
1086                // skip following space, if any
1087                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1088                continue;
1089            }
1090
1091            // convert Unicode representations to Unicode
1092            if (rtf.startsWith("\\u", i)) {
1093                StringBuilder buf = new StringBuilder();
1094                i += 2;
1095                while (i < strlen) {
1096                    char curDigit = rtf.charAt(i);
1097                    if (curDigit != '-' && !Character.isDigit(curDigit)) {
1098                        break;
1099                    }
1100                    buf.append(curDigit);
1101                    i++;
1102                }
1103                // At this point:
1104                // buf contains the numeric representation of the number, 16-bit
1105                // signed
1106                // charAt(i) is the substitution character if Unicode is not
1107                // supported
1108                int value = Integer.parseInt(buf.toString());
1109                if (value < 0) {
1110                    value += 65536;
1111                }
1112                text.append((char) value);
1113                // don't advance since i is on the substitute character.
1114                continue;
1115            }
1116
1117            // close italic and bold
1118            if (rtf.startsWith("\\i0", i) || rtf.startsWith("\\b0", i)) {
1119                Element currentElement = (Element) stack.pop();
1120                currentElement.addContent(text.toString());
1121                text.delete(0, text.length());
1122                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1123                continue;
1124            }
1125
1126            // Skip escaped whitespace
1127            if (rtf.startsWith(" ", i) || rtf.startsWith("\n", i)) {
1128                i += 1;
1129                continue;
1130            }
1131
1132            // start italic
1133            if (rtf.startsWith("\\i", i)) {
1134                Element hiElement = OSISUtil.factory.createHI();
1135                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_ITALIC);
1136                Element currentElement = (Element) stack.peek();
1137                currentElement.addContent(text.toString());
1138                text.delete(0, text.length());
1139                currentElement.addContent(hiElement);
1140                stack.push(hiElement);
1141                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1142                continue;
1143            }
1144
1145            // start bold
1146            if (rtf.startsWith("\\b", i)) {
1147                Element hiElement = OSISUtil.factory.createHI();
1148                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_BOLD);
1149                Element currentElement = (Element) stack.peek();
1150                currentElement.addContent(text.toString());
1151                text.delete(0, text.length());
1152                currentElement.addContent(hiElement);
1153                stack.push(hiElement);
1154                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1155                continue;
1156            }
1157
1158        }
1159
1160        // If there is any text that has not been consumed
1161        if (text.length() > 0) {
1162            div.addContent(text.toString());
1163        }
1164        // div.addContent(text.toString());
1165        // // If the fragment is already in a document, then use that.
1166        // Document doc = div.getDocument();
1167        // if (doc == null)
1168        // {
1169        // doc = new Document(div);
1170        // }
1171        // SAXEventProvider ep = new JDOMSAXEventProvider(doc);
1172        // ContentHandler osis = new
1173        // PrettySerializingContentHandler(FormatType.CLASSIC_INDENT);
1174        // try
1175        // {
1176        // ep.provideSAXEvents(osis);
1177        // }
1178        // catch (SAXException e)
1179        // {
1180        // e.printStackTrace();
1181        // }
1182        // System.err.println(osis.toString());
1183        return div.cloneContent();
1184    }
1185
1186    /**
1187     * Find all the instances of elements of type <code>find</code> under the
1188     * element <code>div</code>. For internal use only.
1189     * 
1190     * @param start the node under which searches occur
1191     * @param name element name to search
1192     * @param reply the list to modify with matching content
1193     */
1194    private static void recurseDeepContent(Element start, String name, List<Content> reply) {
1195        if (start.getName().equals(name)) {
1196            reply.add(start);
1197        }
1198
1199//        Content data = null;
1200        Element ele = null;
1201        for (Content data : start.getContent()) {
1202            if (data instanceof Element) {
1203                ele = (Element) data;
1204                recurseDeepContent(ele, name, reply);
1205            }
1206        }
1207    }
1208
1209    /**
1210     * If we have a String just add it to the buffer, but if we have an Element
1211     * then try to dig the strings out of it.
1212     * 
1213     * @param sub a sub element or text node
1214     * @param buffer the buffer to build on match
1215     */
1216    private static void recurseElement(Object sub, StringBuilder buffer) {
1217        if (sub instanceof Text) {
1218            buffer.append(((Text) sub).getText());
1219        } else if (sub instanceof Element) {
1220            recurseChildren((Element) sub, buffer);
1221        } else {
1222            log.error("unknown type: {}", sub.getClass().getName());
1223        }
1224    }
1225
1226    /**
1227     * Helper to extract the Strings from a nest of JDOM elements
1228     * 
1229     * @param ele
1230     *            The JDOM Element to dig into
1231     * @param buffer
1232     *            The place we accumulate strings.
1233     */
1234    private static void recurseChildren(Element ele, StringBuilder buffer) {
1235        // ele is a JDOM Element that might have a getContent() method
1236        for (Content sub : ele.getContent()) {
1237            recurseElement(sub, buffer);
1238        }
1239    }
1240
1241    private static String strongsNumber = "strong:([GgHh][0-9]+!?[A-Za-z]*)";
1242    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
1243}
1244