1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2005 - 2016
18   *
19   */
20  package org.crosswire.jsword.book;
21  
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collection;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Set;
29  import java.util.Stack;
30  import java.util.regex.Matcher;
31  import java.util.regex.Pattern;
32  
33  import org.crosswire.common.diff.Difference;
34  import org.crosswire.common.diff.EditType;
35  import org.crosswire.jsword.JSOtherMsg;
36  import org.crosswire.jsword.passage.Key;
37  import org.crosswire.jsword.passage.NoSuchKeyException;
38  import org.crosswire.jsword.passage.NoSuchVerseException;
39  import org.crosswire.jsword.passage.PassageKeyFactory;
40  import org.crosswire.jsword.passage.Verse;
41  import org.crosswire.jsword.passage.VerseFactory;
42  import org.crosswire.jsword.versification.Versification;
43  import org.jdom2.Content;
44  import org.jdom2.Element;
45  import org.jdom2.Parent;
46  import org.jdom2.Text;
47  import org.slf4j.Logger;
48  import org.slf4j.LoggerFactory;
49  
50  /**
51   * Some simple utilities to help working with OSIS classes.
52   * 
53   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
54   * @author Joe Walker
55   */
56  public final class OSISUtil {
57      private static final char SPACE_SEPARATOR = ' ';
58      private static final char MORPH_INFO_SEPARATOR = '@';
59  
60      /*
61       * The following are values for the type attribute on the hi element.
62       */
63      /**
64       * Constant for acrostic highlighting
65       */
66      public static final String HI_ACROSTIC = "acrostic";
67  
68      /**
69       * Constant for rendering bold text
70       */
71      public static final String HI_BOLD = "bold";
72  
73      /**
74       * Constant for rendering emphatic text
75       */
76      public static final String HI_EMPHASIS = "emphasis";
77  
78      /**
79       * Constant for rendering illuminated text.
80       */
81      public static final String HI_ILLUMINATED = "illuminated";
82  
83      /**
84       * Constant for rendering italic text.
85       */
86      public static final String HI_ITALIC = "italic";
87  
88      /**
89       * Constant for rendering strike-through text
90       */
91      public static final String HI_LINETHROUGH = "line-through";
92  
93      /**
94       * Constant for rendering normal text.
95       */
96      public static final String HI_NORMAL = "normal";
97  
98      /**
99       * Constant for rendering small caps
100      */
101     public static final String HI_SMALL_CAPS = "small-caps";
102 
103     /**
104      * Constant for rendering subscripts
105      */
106     public static final String HI_SUB = "sub";
107 
108     /**
109      * Constant for rendering superscripts
110      */
111     public static final String HI_SUPER = "super";
112 
113     /**
114      * Constant for rendering underlined text
115      */
116     public static final String HI_UNDERLINE = "underline";
117 
118     /**
119      * Constant for rendering upper case text
120      */
121     public static final String HI_X_CAPS = "x-caps";
122 
123     /**
124      * Constant for rendering big text
125      */
126     public static final String HI_X_BIG = "x-big";
127 
128     /**
129      * Constant for rendering small text
130      */
131     public static final String HI_X_SMALL = "x-small";
132 
133     /**
134      * Constant for rendering tt text
135      */
136     public static final String HI_X_TT = "x-tt";
137 
138     /**
139      * Constant to help narrow down what we use seg for. In this case the
140      * justify right tag
141      */
142     public static final String SEG_JUSTIFYRIGHT = "text-align: right;";
143 
144     /**
145      * Constant to help narrow down what we use seg for. In this case the
146      * justify left tag
147      */
148     public static final String SEG_JUSTIFYLEFT = "text-align: left;";
149 
150     /**
151      * Constant to help narrow down what we use seg for. In this case the thml
152      * center tag
153      */
154     public static final String SEG_CENTER = "text-align: center;";
155 
156     /**
157      * Constant to help narrow down what we use div for. In this case the thml
158      * pre tag
159      */
160     public static final String DIV_PRE = "x-pre";
161 
162     /**
163      * Constant to help narrow down what we use seg for. In this case the color
164      * tag
165      */
166     public static final String SEG_COLORPREFIX = "color: ";
167 
168     /**
169      * Constant to help narrow down what we use seg for. In this case the
170      * font-size tag
171      */
172     public static final String SEG_SIZEPREFIX = "font-size: ";
173 
174     /**
175      * Constant for x- types
176      */
177     public static final String TYPE_X_PREFIX = "x-";
178 
179     /**
180      * Constant for the study note type
181      */
182     public static final String NOTETYPE_STUDY = "x-StudyNote";
183 
184     /**
185      * Constant for the cross reference note type
186      */
187     public static final String NOTETYPE_REFERENCE = "crossReference";
188 
189     /**
190      * Constant for the variant type segment
191      */
192     public static final String VARIANT_TYPE = "x-variant";
193     public static final String VARIANT_CLASS = "x-";
194 
195     /**
196      * Constant for JSword generated content. Used for type or subType.
197      */
198     public static final String GENERATED_CONTENT = "x-gen";
199 
200     /**
201      * Constant for the pos (part of speech) type.
202      */
203     public static final String POS_TYPE = "x-pos";
204 
205     /**
206      * Constant for the def (dictionary definition) type
207      */
208     public static final String DEF_TYPE = "x-def";
209 
210     /**
211      * Constant for a Strong's numbering lemma
212      */
213     public static final String LEMMA_STRONGS = "strong:";
214     public static final String LEMMA_MISC = "lemma:";
215     public static final String MORPH_ROBINSONS = "robinson:";
216 
217     /**
218      * Constant for Strong's numbering morphology
219      */
220     public static final String MORPH_STRONGS = "x-StrongsMorph:T";
221 
222     /**
223      * Constant to help narrow down what we use "q" for. In this case:
224      * blockquote
225      */
226     public static final String Q_BLOCK = "blockquote";
227 
228     /**
229      * Constant to help narrow down what we use "q" for. In this case: citation
230      */
231     public static final String Q_CITATION = "citation";
232 
233     /**
234      * Constant to help narrow down what we use "q" for. In this case: embedded
235      */
236     public static final String Q_EMBEDDED = "embedded";
237 
238     /**
239      * Constant to help narrow down what we use "list" for.
240      */
241     public static final String LIST_ORDERED = "x-ordered";
242     public static final String LIST_UNORDERED = "x-unordered";
243 
244     /**
245      * Table roles (on table, row and cell elements) can be "data", the default,
246      * or label.
247      */
248     public static final String TABLE_ROLE_LABEL = "label";
249 
250     /**
251      * Possible cell alignments
252      */
253     public static final String CELL_ALIGN_LEFT = "left";
254     public static final String CELL_ALIGN_RIGHT = "right";
255     public static final String CELL_ALIGN_CENTER = "center";
256     public static final String CELL_ALIGN_JUSTIFY = "justify";
257     public static final String CELL_ALIGN_START = "start";
258     public static final String CELL_ALIGN_END = "end";
259 
260     public static final String OSIS_ELEMENT_ABBR = "abbr";
261     public static final String OSIS_ELEMENT_TITLE = "title";
262     public static final String OSIS_ELEMENT_TABLE = "table";
263     public static final String OSIS_ELEMENT_SPEECH = "speech";
264     public static final String OSIS_ELEMENT_SPEAKER = "speaker";
265     public static final String OSIS_ELEMENT_ROW = "row";
266     public static final String OSIS_ELEMENT_REFERENCE = "reference";
267     public static final String OSIS_ELEMENT_NOTE = "note";
268     public static final String OSIS_ELEMENT_NAME = "name";
269     public static final String OSIS_ELEMENT_Q = "q";
270     public static final String OSIS_ELEMENT_LIST = "list";
271     public static final String OSIS_ELEMENT_P = "p";
272     public static final String OSIS_ELEMENT_ITEM = "item";
273     public static final String OSIS_ELEMENT_FIGURE = "figure";
274     public static final String OSIS_ELEMENT_FOREIGN = "foreign";
275     public static final String OSIS_ELEMENT_W = "w";
276     public static final String OSIS_ELEMENT_CHAPTER = "chapter";
277     public static final String OSIS_ELEMENT_VERSE = "verse";
278     public static final String OSIS_ELEMENT_CELL = "cell";
279     public static final String OSIS_ELEMENT_DIV = "div";
280     public static final String OSIS_ELEMENT_OSIS = "osis";
281     public static final String OSIS_ELEMENT_WORK = "work";
282     public static final String OSIS_ELEMENT_HEADER = "header";
283     public static final String OSIS_ELEMENT_OSISTEXT = "osisText";
284     public static final String OSIS_ELEMENT_SEG = "seg";
285     public static final String OSIS_ELEMENT_LG = "lg";
286     public static final String OSIS_ELEMENT_L = "l";
287     public static final String OSIS_ELEMENT_LB = "lb";
288     public static final String OSIS_ELEMENT_HI = "hi";
289 
290     public static final String ATTRIBUTE_TEXT_OSISIDWORK = "osisIDWork";
291     public static final String ATTRIBUTE_WORK_OSISWORK = "osisWork";
292     public static final String OSIS_ATTR_OSISID = "osisID";
293     public static final String OSIS_ATTR_SID = "sID";
294     public static final String OSIS_ATTR_EID = "eID";
295     public static final String ATTRIBUTE_W_LEMMA = "lemma";
296     public static final String ATTRIBUTE_FIGURE_SRC = "src";
297     public static final String ATTRIBUTE_TABLE_BORDER = "border";
298     public static final String ATTRIBUTE_TABLE_ROLE = "role";
299     public static final String ATTRIBUTE_CELL_ALIGN = "align";
300     public static final String ATTRIBUTE_CELL_ROWS = "rows";
301     public static final String ATTRIBUTE_CELL_COLS = "cols";
302     public static final String OSIS_ATTR_TYPE = "type";
303     public static final String OSIS_ATTR_CANONICAL = "canonical";
304     public static final String OSIS_ATTR_SUBTYPE = "subType";
305     public static final String OSIS_ATTR_REF = "osisRef";
306     public static final String OSIS_ATTR_LEVEL = "level";
307     public static final String ATTRIBUTE_SPEAKER_WHO = "who";
308     public static final String ATTRIBUTE_Q_WHO = "who";
309     public static final String ATTRIBUTE_W_MORPH = "morph";
310     public static final String ATTRIBUTE_OSISTEXT_OSISIDWORK = "osisIDWork";
311     // OSIS defines the lang attribute as the one from the xml namespace
312     // Typical usage element.setAttribute(OSISUtil.OSIS_ATTR_LANG, lang,
313     // Namespace.XML_NAMESPACE);
314     public static final String OSIS_ATTR_LANG = "lang";
315     public static final String ATTRIBUTE_DIV_BOOK = "book";
316 
317     /**
318      * Prefix for OSIS IDs that refer to Bibles
319      */
320     private static final String OSISID_PREFIX_BIBLE = "Bible.";
321 
322     private static final Set<String> EXTRA_BIBLICAL_ELEMENTS = new HashSet<String>(Arrays.asList(new String[] {
323             OSIS_ELEMENT_NOTE, OSIS_ELEMENT_TITLE, OSIS_ELEMENT_REFERENCE
324     }));
325 
326     /**
327      * The log stream
328      */
329     private static final Logger log = LoggerFactory.getLogger(OSISUtil.class);
330 
331 
332     /**
333      * Prevent instantiation
334      */
335     private OSISUtil() {
336     }
337 
338     private static OSISFactory factory = new OSISFactory();
339 
340     /**
341      * An accessor for the OSISFactory that creates OSIS objects
342      * 
343      * @return the singleton OSISFactory
344      */
345     public static OSISFactory factory() {
346         return factory;
347     }
348 
349     /**
350      * A generic way of creating empty Elements of various types
351      */
352     public static class OSISFactory {
353         /**
354         * @return an abbr element
355         */
356         public Element createAbbr() {
357             return new Element(OSIS_ELEMENT_ABBR);
358         }
359 
360         /**
361          * @return a seg element
362          */
363         public Element createSeg() {
364             return new Element(OSIS_ELEMENT_SEG);
365         }
366 
367         /**
368          * @return an osisText element
369          */
370         public Element createOsisText() {
371             return new Element(OSIS_ELEMENT_OSISTEXT);
372         }
373 
374         /**
375          * @return a header element
376          */
377         public Element createHeader() {
378             return new Element(OSIS_ELEMENT_HEADER);
379         }
380 
381         /**
382          * @return a work element
383          */
384         public Element createWork() {
385             return new Element(OSIS_ELEMENT_WORK);
386         }
387 
388         /**
389          * @return an osis element
390          */
391         public Element createOsis() {
392             return new Element(OSIS_ELEMENT_OSIS);
393         }
394 
395         /**
396          * @return a div element
397          */
398         public Element createDiv() {
399             return new Element(OSIS_ELEMENT_DIV);
400         }
401 
402         /**
403          * @return a cell element
404          */
405         public Element createCell() {
406             return new Element(OSIS_ELEMENT_CELL);
407         }
408 
409         /**
410          * @return a header cell element (akin to HTML's TH)
411          */
412         public Element createHeaderCell() {
413             Element ele = new Element(OSIS_ELEMENT_CELL);
414             ele.setAttribute(ATTRIBUTE_TABLE_ROLE, TABLE_ROLE_LABEL);
415             ele.setAttribute(ATTRIBUTE_CELL_ALIGN, CELL_ALIGN_CENTER);
416             return ele;
417         }
418 
419         /**
420          * @return a verse element
421          */
422         public Element createVerse() {
423             return new Element(OSIS_ELEMENT_VERSE);
424         }
425 
426         /**
427          * @return a w element
428          */
429         public Element createW() {
430             return new Element(OSIS_ELEMENT_W);
431         }
432 
433         /**
434          * @return a figure element
435          */
436         public Element createFigure() {
437             return new Element(OSIS_ELEMENT_FIGURE);
438         }
439 
440         /**
441          * @return a foreign element
442          */
443         public Element createForeign() {
444             return new Element(OSIS_ELEMENT_FOREIGN);
445         }
446 
447         /**
448          * @return an item element
449          */
450         public Element createItem() {
451             return new Element(OSIS_ELEMENT_ITEM);
452         }
453 
454         /**
455          * @return a p element
456          */
457         public Element createP() {
458             return new Element(OSIS_ELEMENT_P);
459         }
460 
461         /**
462          * @return a list element
463          */
464         public Element createList() {
465             return new Element(OSIS_ELEMENT_LIST);
466         }
467 
468         /**
469          * @return a q element
470          */
471         public Element createQ() {
472             return new Element(OSIS_ELEMENT_Q);
473         }
474 
475         /**
476          * @return a name element
477          */
478         public Element createName() {
479             return new Element(OSIS_ELEMENT_NAME);
480         }
481 
482         /**
483          * @return a note element
484          */
485         public Element createNote() {
486             return new Element(OSIS_ELEMENT_NOTE);
487         }
488 
489         /**
490          * @return a reference element
491          */
492         public Element createReference() {
493             return new Element(OSIS_ELEMENT_REFERENCE);
494         }
495 
496         /**
497          * @return a row element
498          */
499         public Element createRow() {
500             return new Element(OSIS_ELEMENT_ROW);
501         }
502 
503         /**
504          * @return a speaker element
505          */
506         public Element createSpeaker() {
507             return new Element(OSIS_ELEMENT_SPEAKER);
508         }
509 
510         /**
511          * @return a speech element
512          */
513         public Element createSpeech() {
514             return new Element(OSIS_ELEMENT_SPEECH);
515         }
516 
517         /**
518          * @return a table element
519          */
520         public Element createTable() {
521             return new Element(OSIS_ELEMENT_TABLE);
522         }
523 
524        /**
525         * @return a title element
526         */
527        public Element createTitle() {
528            return new Element(OSIS_ELEMENT_TITLE);
529        }
530 
531         /**
532          * Create a title marked as generated.
533          * 
534          * @return a generated title element
535          */
536         public Element createGeneratedTitle() {
537             Element title = new Element(OSIS_ELEMENT_TITLE);
538             title.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.GENERATED_CONTENT);
539             return title;
540         }
541 
542         /**
543          * Line Group
544          * 
545          * @return a lg element
546          */
547         public Element createLG() {
548             return new Element(OSIS_ELEMENT_LG);
549         }
550 
551         /**
552          * Line
553          * 
554          * @return a l element
555          */
556         public Element createL() {
557             return new Element(OSIS_ELEMENT_L);
558         }
559 
560         /**
561          * Line Break
562          * 
563          * @return a lb element
564          */
565         public Element createLB() {
566             return new Element(OSIS_ELEMENT_LB);
567         }
568 
569         /**
570          * Highlight
571          * 
572          * @return a hi element
573          */
574         public Element createHI() {
575             return new Element(OSIS_ELEMENT_HI);
576         }
577 
578         /**
579          * Text
580          * 
581          * @param text the text for this element
582          * @return a text element
583          */
584         public Text createText(String text) {
585             return new Text(text);
586         }
587     }
588 
589     /**
590      * Dig past the osis and osisText element, if present, to get the meaningful
591      * content of the document.
592      * 
593      * @param root the element from which to get a fragment
594      * @return a fragment
595      */
596     public static List<Content> getFragment(Element root) {
597         if (root != null) {
598             Element content = root;
599             if (OSISUtil.OSIS_ELEMENT_OSIS.equals(root.getName())) {
600                 content = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
601             }
602 
603             if (OSISUtil.OSIS_ELEMENT_OSISTEXT.equals(root.getName())) {
604                 content = root.getChild(OSISUtil.OSIS_ELEMENT_DIV);
605             }
606 
607             // At this point we are at something interesting, possibly null.
608             // If this was a semantically valid OSIS document then it is a div.
609             // As long as this node has one child dig deeper.
610             if (content != null && content.getContentSize() == 1) {
611                 Content firstChild = content.getContent(0);
612                 if (firstChild instanceof Element && OSISUtil.OSIS_ELEMENT_DIV.equals(((Element) firstChild).getName())) {
613                     content = (Element) firstChild;
614                 }
615             }
616 
617             if (content != null) {
618                 return content.getContent();
619             }
620         }
621         return new ArrayList<Content>();
622     }
623 
624     /**
625      * Get the canonical text from an osis document consisting of a single
626      * fragment. The document is assumed to be valid OSIS2.0 XML. While xml
627      * valid is rigidly defined as meaning that an xml parser can validate the
628      * document, it does not mean that the document is valid OSIS. This is a
629      * semantic problem that is not validated. This method assumes that the root
630      * element is also semantically valid.
631      * 
632      * <p>
633      * This means that the top level element's tagname is osis. This can contain
634      * either a osisText or an osisCorpus. If it is an osisCorpus, then it
635      * contains an osisText. However, as a simplification, since JSword
636      * constructs the whole doc for the fragment, osisCorpus can be ignored.
637      * <p>
638      * The osisText element contains a div element that is either a container or
639      * a milestone. Again, JSword is providing the div element and it will be
640      * provided as a container. It is this div that "contains" the actual
641      * fragment.
642      * </p>
643      * <p>
644      * A verse element may either be a container or a milestone. Sword OSIS
645      * books differ in whether they provide the verse element. Most do not. The
646      * few that do are using the container model, but it has been proposed that
647      * milestones are the best practice.
648      * </p>
649      * 
650      * <p>
651      * The fragment may contain elements that are not a part of the original
652      * text. These are things such as notes.
653      * </p>
654      * 
655      * <p>
656      * Milestones require special handling. Beginning milestones elements have
657      * an sID attribute, while ending milestones have an eID with the same value
658      * as the opening. So everything between the start and the corresponding end
659      * is the content of the element. Also, for a given element, say div, they
660      * have to be properly nested as if they were container elements.
661      * </p>
662      * 
663      * @param root
664      *            the whole osis document.
665      * @return The canonical text without markup
666      */
667     public static String getCanonicalText(Element root) {
668         // if someone passes a root element which has text in, we need to check whether it's worth processing.
669         // For example. where you have a non-canonical title being passed in, we deal with this here.
670         if (!isCanonical(root)) {
671             //no point in continuing...
672             return "";
673         }
674 
675         StringBuilder buffer = new StringBuilder();
676 
677         // Dig past osis, osisText, if present, to get to the real content.
678         List<Content> frag = OSISUtil.getFragment(root);
679 
680         Iterator<Content> dit = frag.iterator();
681         String sID = null;
682         Content data = null;
683         Element ele = null;
684         while (dit.hasNext()) {
685             data = dit.next();
686             if (data instanceof Element) {
687                 ele = (Element) data;
688                 if (!isCanonical(ele)) {
689                     continue;
690                 }
691 
692                 if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
693                     sID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
694                 }
695 
696                 if (sID != null) {
697                     getCanonicalContent(ele, sID, dit, buffer);
698                 } else {
699                     getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
700                 }
701             } else if (data instanceof Text) {
702                 // make sure that adjacent text elements are separated by
703                 // whitespace
704                 // TODO(dms): verify that the xml parser does not split words
705                 // containing entities.
706                 int lastIndex = buffer.length() - 1;
707                 String text = ((Text) data).getText();
708                 // Ignore empty text nodes and do not add 
709                 if (text.length() != 0) {
710                     //do not add spaces when within a OSIS seg
711                     if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0))) {
712                         buffer.append(' ');
713                     }
714                     buffer.append(text);
715                 }
716             }
717         }
718 
719         return buffer.toString().trim();
720     }
721 
722     /**
723      * A simplified plain text version of the data in this Element with all the
724      * markup stripped out.
725      * 
726      * @param root
727      *            the whole osis document.
728      * @return The Bible text without markup
729      */
730     public static String getPlainText(Element root) {
731         // Dig past osis, osisText, if present, to get to the real content.
732         return getTextContent(OSISUtil.getFragment(root));
733     }
734 
735     /**
736      * A space separate string containing Strong's numbers.
737      * 
738      * @param root
739      *            the whole osis document.
740      * @return The Strong's numbers in the text
741      */
742     public static String getStrongsNumbers(Element root) {
743         return getLexicalInformation(root, false);
744     }
745 
746     /**
747      * A '@' separated list of morphologies and strong numbers
748      * 
749      * @param root the osis element in question
750      * @return the string
751      */
752     public static String getMorphologiesWithStrong(Element root) {
753         return getLexicalInformation(root, true);
754     }
755 
756     /**
757      * concatenates strong and morphology information together
758      * 
759      * @param root the osis element in question
760      * @param includeMorphology whether to include morphology
761      * @return root of the element
762      */
763     public static String getLexicalInformation(Element root, boolean includeMorphology) {
764         StringBuilder buffer = new StringBuilder();
765 
766         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_W)) {
767             Element ele = (Element) content;
768             String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
769             if (attr != null) {
770                 Matcher matcher = strongsNumberPattern.matcher(attr);
771                 while (matcher.find()) {
772                     String strongsNum = matcher.group(1);
773                     if (buffer.length() > 0) {
774                         buffer.append(' ');
775                     }
776 
777                     if (includeMorphology) {
778                         //if including morphology, we want 1 big field, separated with '@'
779                         strongsNum = strongsNum.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR);
780                     }
781                     buffer.append(strongsNum);
782 
783                     if (includeMorphology) {
784                         //also include morphology if available
785                         String morph = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_MORPH);
786                         if (morph != null && morph.length() != 0) {
787                             buffer.append(MORPH_INFO_SEPARATOR);
788                             buffer.append(morph.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR));
789                         }
790                     }
791                 }
792             }
793         }
794 
795         return buffer.toString().trim();
796     }
797 
798     /**
799      * A space separate string containing osisID from the reference element.
800      * We pass book and key because the xref may not be valid and it needs to be reported.
801      *
802      * @param book the book to which the references refer
803      * @param key the verse containing the cross references
804      * @param v11n the versification
805      * @param root the osis element in question
806      * @return The references in the text
807      */
808     public static String getReferences(Book book, Key key, Versification v11n, Element root) {
809         PassageKeyFactory keyf = PassageKeyFactory.instance();
810         Key collector = keyf.createEmptyKeyList(v11n);
811 
812         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE)) {
813             Element ele = (Element) content;
814             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
815             if (attr != null) {
816                 try {
817                     collector.addAll(keyf.getKey(v11n, attr));
818                 } catch (NoSuchKeyException e) {
819                     DataPolice.report(book, key, "Unable to parse: " + attr + " - No such reference:" + e.getMessage());
820                 }
821             }
822         }
823 
824         return collector.getOsisID();
825     }
826 
827     /**
828      * The text of non-reference notes.
829      * 
830      * @param root the whole OSIS document
831      * @return The references in the text
832      */
833     public static String getNotes(Element root) {
834         StringBuilder buffer = new StringBuilder();
835 
836         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_NOTE)) {
837             Element ele = (Element) content;
838             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_TYPE);
839             if (attr == null || !attr.equals(NOTETYPE_REFERENCE)) {
840                 if (buffer.length() > 0) {
841                     buffer.append(' ');
842                 }
843                 buffer.append(OSISUtil.getTextContent(ele.getContent()));
844             }
845         }
846 
847         return buffer.toString();
848     }
849 
850     /**
851      * The text of non-reference notes.
852      * 
853      * @param root the whole OSIS document
854      * @return The references in the text
855      */
856     public static String getHeadings(Element root) {
857         StringBuilder buffer = new StringBuilder();
858 
859         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_TITLE)) {
860             Element ele = (Element) content;
861 
862             if (buffer.length() > 0) {
863                 buffer.append(' ');
864             }
865             buffer.append(OSISUtil.getTextContent(ele.getContent()));
866         }
867 
868         return buffer.toString();
869     }
870 
871     private static void getCanonicalContent(Element parent, String sID, Iterator<Content> iter, StringBuilder buffer) {
872         if (!isCanonical(parent)) {
873             return;
874         }
875 
876         Content data = null;
877         Element ele = null;
878         String eleName = null;
879         String eID = null;
880         while (iter.hasNext()) {
881             data = iter.next();
882             if (data instanceof Element) {
883                 ele = (Element) data;
884                 // If the milestoned element is done then quit.
885                 // This should be a eID=, that matches sID, from the same
886                 // element.
887                 eleName = ele.getName();
888                 eID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
889                 if (eID != null && eID.equals(sID) && eleName.equals(parent.getName())) {
890                     break;
891                 }
892                 OSISUtil.getCanonicalContent(ele, sID, ele.getContent().iterator(), buffer);
893             } else if (data instanceof Text) {
894                 // make sure that adjacent text elements are separated by
895                 // whitespace
896                 // Empty elements also produce whitespace.
897                 // TODO(dms): verify that the xml parser does not split words
898                 // containing entities.
899                 int lastIndex = buffer.length() - 1;
900                 String text = ((Text) data).getText();
901                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))  && !OSIS_ELEMENT_SEG.equals(parent.getName())) {
902                     buffer.append(' ');
903                 }
904                 buffer.append(text);
905             }
906         }
907     }
908 
909     private static boolean isCanonical(Content content) {
910         boolean result = true;
911         if (content instanceof Element) {
912             Element element = (Element) content;
913 
914             // Ignore extra-biblical text
915             if (EXTRA_BIBLICAL_ELEMENTS.contains(element.getName())) {
916                 String canonical = element.getAttributeValue(OSISUtil.OSIS_ATTR_CANONICAL);
917                 result = Boolean.valueOf(canonical).booleanValue();
918             }
919         }
920 
921         return result;
922     }
923 
924     private static String getTextContent(List<Content> fragment) {
925         StringBuilder buffer = new StringBuilder();
926 
927         for (Content next : fragment) {
928             recurseElement(next, buffer);
929         }
930 
931         return buffer.toString();
932     }
933 
934     /**
935      * Find all the instances of elements of type <code>find</code> under the
936      * element <code>div</code>.
937      * 
938      * @param div the element to trawl
939      * @param name the element name to search
940      * @return the collection of matching content
941      */
942     public static Collection<Content> getDeepContent(Element div, String name) {
943         List<Content> reply = new ArrayList<Content>();
944         recurseDeepContent(div, name, reply);
945         return reply;
946     }
947 
948     /**
949      * Walk up the tree from the W to find out what verse we are in.
950      * 
951      * @param v11n the versification
952      * @param ele
953      *            The start point for our verse hunt.
954      * @return The verse we are in
955      * @throws BookException 
956      */
957     public static Verse getVerse(Versification v11n, Element ele) throws BookException {
958         if (ele.getName().equals(OSIS_ELEMENT_VERSE)) {
959             // If the element is an OSIS Verse then this is fairly easy
960             String osisid = ele.getAttributeValue(OSIS_ATTR_OSISID);
961 
962             try {
963                 return VerseFactory.fromString(v11n, osisid);
964             } catch (NoSuchVerseException ex) {
965                 throw new BookException(JSOtherMsg.lookupText("OsisID not valid: {0}", osisid), ex);
966             }
967         }
968 
969         // So we just walk up the tree trying to find a verse
970         Parent parent = ele.getParent();
971         if (parent instanceof Element) {
972             return getVerse(v11n, (Element) parent);
973         }
974 
975         throw new BookException(JSOtherMsg.lookupText("Verse element could not be found"));
976     }
977 
978     /**
979      * Helper method to create the boilerplate headers in an OSIS document from
980      * the current metadata object
981      * 
982      * @param bmd the book's meta data
983      * @return the root of an OSIS document
984      */
985     public static Element createOsisFramework(BookMetaData bmd) {
986         Element osis = factory().createOsis();
987         String osisid = bmd.getInitials();
988 
989         Element work = factory().createWork();
990         work.setAttribute(ATTRIBUTE_WORK_OSISWORK, osisid);
991 
992         Element header = factory().createHeader();
993         header.addContent(work);
994 
995         Element text = factory().createOsisText();
996         text.setAttribute(ATTRIBUTE_TEXT_OSISIDWORK, OSISID_PREFIX_BIBLE + osisid);
997         text.addContent(header);
998 
999         osis.addContent(text);
1000
1001        return osis;
1002    }
1003
1004    /**
1005     * Convert a Difference list into a pretty HTML report.
1006     * 
1007     * @param diffs
1008     *            List of Difference objects
1009     * @return HTML representation
1010     */
1011    public static List<Content> diffToOsis(List<Difference> diffs) {
1012        Element div = factory().createDiv();
1013
1014        for (int x = 0; x < diffs.size(); x++) {
1015            Difference diff = diffs.get(x);
1016            EditType editType = diff.getEditType(); // Mode (delete, equal,
1017                                                    // insert)
1018            Text text = factory.createText(diff.getText()); // Text of change.
1019
1020            if (EditType.DELETE.equals(editType)) {
1021                Element hi = factory().createHI();
1022                hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_LINETHROUGH);
1023                hi.addContent(text);
1024                div.addContent(hi);
1025            } else if (EditType.INSERT.equals(editType)) {
1026                Element hi = factory().createHI();
1027                hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_UNDERLINE);
1028                hi.addContent(text);
1029                div.addContent(hi);
1030            } else {
1031                div.addContent(text);
1032            }
1033        }
1034        return div.cloneContent();
1035    }
1036
1037    public static List<Content> rtfToOsis(String rtf) {
1038        Element div = factory().createDiv();
1039        Stack<Content> stack = new Stack<Content>();
1040        stack.push(div);
1041
1042        int strlen = rtf.length();
1043
1044        StringBuilder text = new StringBuilder(strlen);
1045
1046        int i = 0;
1047        for (i = 0; i < strlen; i++) {
1048            char curChar = rtf.charAt(i);
1049            if (curChar != '\\') {
1050                text.append(curChar);
1051                continue;
1052            }
1053
1054            // The following are ordered from most to least common
1055            // and when one is a prefix of another, it follows.
1056
1057            // Used to end all open attributes. Only \qc in our implementation.
1058            if (rtf.startsWith("\\pard", i)) {
1059                Element currentElement = (Element) stack.pop();
1060                currentElement.addContent(text.toString());
1061                text.delete(0, text.length());
1062                stack.clear();
1063                stack.push(div);
1064                i += (i + 5 < strlen && rtf.charAt(i + 5) == ' ') ? 5 : 4;
1065                continue;
1066            }
1067
1068            // Simulate a paragraph break.
1069            if (rtf.startsWith("\\par", i)) {
1070                Element currentElement = (Element) stack.peek();
1071                currentElement.addContent(text.toString());
1072                text.delete(0, text.length());
1073                currentElement.addContent(OSISUtil.factory.createLB());
1074                i += (i + 4 < strlen && rtf.charAt(i + 4) == ' ') ? 4 : 3;
1075                continue;
1076            }
1077
1078            // OSIS does not have the notion of centered text.
1079            // So we define our own
1080            if (rtf.startsWith("\\qc", i)) {
1081                Element centerDiv = OSISUtil.factory.createDiv();
1082                centerDiv.setAttribute(OSIS_ATTR_TYPE, "x-center");
1083                Element currentElement = (Element) stack.peek();
1084                currentElement.addContent(text.toString());
1085                text.delete(0, text.length());
1086                currentElement.addContent(centerDiv);
1087                stack.push(centerDiv);
1088                // skip following space, if any
1089                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1090                continue;
1091            }
1092
1093            // convert Unicode representations to Unicode
1094            if (rtf.startsWith("\\u", i)) {
1095                StringBuilder buf = new StringBuilder();
1096                i += 2;
1097                while (i < strlen) {
1098                    char curDigit = rtf.charAt(i);
1099                    if (curDigit != '-' && !Character.isDigit(curDigit)) {
1100                        break;
1101                    }
1102                    buf.append(curDigit);
1103                    i++;
1104                }
1105                // At this point:
1106                // buf contains the numeric representation of the number, 16-bit
1107                // signed
1108                // charAt(i) is the substitution character if Unicode is not
1109                // supported
1110                int value = Integer.parseInt(buf.toString());
1111                if (value < 0) {
1112                    value += 65536;
1113                }
1114                text.append((char) value);
1115                // don't advance since i is on the substitute character.
1116                continue;
1117            }
1118
1119            // close italic and bold
1120            if (rtf.startsWith("\\i0", i) || rtf.startsWith("\\b0", i)) {
1121                Element currentElement = (Element) stack.pop();
1122                currentElement.addContent(text.toString());
1123                text.delete(0, text.length());
1124                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1125                continue;
1126            }
1127
1128            // Skip escaped whitespace
1129            if (rtf.startsWith(" ", i) || rtf.startsWith("\n", i)) {
1130                i += 1;
1131                continue;
1132            }
1133
1134            // start italic
1135            if (rtf.startsWith("\\i", i)) {
1136                Element hiElement = OSISUtil.factory.createHI();
1137                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_ITALIC);
1138                Element currentElement = (Element) stack.peek();
1139                currentElement.addContent(text.toString());
1140                text.delete(0, text.length());
1141                currentElement.addContent(hiElement);
1142                stack.push(hiElement);
1143                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1144                continue;
1145            }
1146
1147            // start bold
1148            if (rtf.startsWith("\\b", i)) {
1149                Element hiElement = OSISUtil.factory.createHI();
1150                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_BOLD);
1151                Element currentElement = (Element) stack.peek();
1152                currentElement.addContent(text.toString());
1153                text.delete(0, text.length());
1154                currentElement.addContent(hiElement);
1155                stack.push(hiElement);
1156                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1157                continue;
1158            }
1159
1160        }
1161
1162        // If there is any text that has not been consumed
1163        if (text.length() > 0) {
1164            div.addContent(text.toString());
1165        }
1166        // div.addContent(text.toString());
1167        // // If the fragment is already in a document, then use that.
1168        // Document doc = div.getDocument();
1169        // if (doc == null)
1170        // {
1171        // doc = new Document(div);
1172        // }
1173        // SAXEventProvider ep = new JDOMSAXEventProvider(doc);
1174        // ContentHandler osis = new
1175        // PrettySerializingContentHandler(FormatType.CLASSIC_INDENT);
1176        // try
1177        // {
1178        // ep.provideSAXEvents(osis);
1179        // }
1180        // catch (SAXException e)
1181        // {
1182        // e.printStackTrace();
1183        // }
1184        // System.err.println(osis.toString());
1185        return div.cloneContent();
1186    }
1187
1188    /**
1189     * Find all the instances of elements of type <code>find</code> under the
1190     * element <code>div</code>. For internal use only.
1191     * 
1192     * @param start the node under which searches occur
1193     * @param name element name to search
1194     * @param reply the list to modify with matching content
1195     */
1196    private static void recurseDeepContent(Element start, String name, List<Content> reply) {
1197        if (start.getName().equals(name)) {
1198            reply.add(start);
1199        }
1200
1201//        Content data = null;
1202        Element ele = null;
1203        for (Content data : start.getContent()) {
1204            if (data instanceof Element) {
1205                ele = (Element) data;
1206                recurseDeepContent(ele, name, reply);
1207            }
1208        }
1209    }
1210
1211    /**
1212     * If we have a String just add it to the buffer, but if we have an Element
1213     * then try to dig the strings out of it.
1214     * 
1215     * @param sub a sub element or text node
1216     * @param buffer the buffer to build on match
1217     */
1218    private static void recurseElement(Object sub, StringBuilder buffer) {
1219        if (sub instanceof Text) {
1220            buffer.append(((Text) sub).getText());
1221        } else if (sub instanceof Element) {
1222            recurseChildren((Element) sub, buffer);
1223        } else {
1224            log.error("unknown type: {}", sub.getClass().getName());
1225        }
1226    }
1227
1228    /**
1229     * Helper to extract the Strings from a nest of JDOM elements
1230     * 
1231     * @param ele
1232     *            The JDOM Element to dig into
1233     * @param buffer
1234     *            The place we accumulate strings.
1235     */
1236    private static void recurseChildren(Element ele, StringBuilder buffer) {
1237        // ele is a JDOM Element that might have a getContent() method
1238        for (Content sub : ele.getContent()) {
1239            recurseElement(sub, buffer);
1240        }
1241    }
1242
    /**
     * Regular expression text for a reference to a Strong's number: the
     * literal prefix "strong:" followed by a captured group made of one of
     * G, g, H or h, one or more digits, an optional '!' and any number of
     * ASCII letters.
     */
    private static String strongsNumber = "strong:([GgHh][0-9]+!?[A-Za-z]*)";
    /**
     * The compiled form of {@link #strongsNumber}.
     */
    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
1245}
1246