1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *       http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * Copyright: 2005-2013
18   *     The copyright to this program is held by its authors.
19   *
20   */
21  package org.crosswire.jsword.book;
22  
23  import java.util.ArrayList;
24  import java.util.Arrays;
25  import java.util.Collection;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Set;
30  import java.util.Stack;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  
34  import org.crosswire.common.diff.Difference;
35  import org.crosswire.common.diff.EditType;
36  import org.crosswire.jsword.JSOtherMsg;
37  import org.crosswire.jsword.passage.Key;
38  import org.crosswire.jsword.passage.NoSuchKeyException;
39  import org.crosswire.jsword.passage.NoSuchVerseException;
40  import org.crosswire.jsword.passage.PassageKeyFactory;
41  import org.crosswire.jsword.passage.Verse;
42  import org.crosswire.jsword.passage.VerseFactory;
43  import org.crosswire.jsword.versification.Versification;
44  import org.jdom2.Content;
45  import org.jdom2.Element;
46  import org.jdom2.Parent;
47  import org.jdom2.Text;
48  import org.slf4j.Logger;
49  import org.slf4j.LoggerFactory;
50  
51  /**
52   * Some simple utilities to help working with OSIS classes.
53   * 
54   * @see gnu.lgpl.License for license details.<br>
55   *      The copyright to this program is held by its authors.
56   * @author Joe Walker [joe at eireneh dot com]
57   */
58  public final class OSISUtil {
59      private static final char SPACE_SEPARATOR = ' ';
60      private static final char MORPH_INFO_SEPARATOR = '@';
61  
62      /**
63       * The following are values for the type attribute on the hi element.
64       */
65      /**
66       * Constant for acrostic highlighting
67       */
68      public static final String HI_ACROSTIC = "acrostic";
69  
70      /**
71       * Constant for rendering bold text
72       */
73      public static final String HI_BOLD = "bold";
74  
75      /**
76       * Constant for rendering emphatic text
77       */
78      public static final String HI_EMPHASIS = "emphasis";
79  
80      /**
81       * Constant for rendering illuminated text.
82       */
83      public static final String HI_ILLUMINATED = "illuminated";
84  
85      /**
86       * Constant for rendering italic text.
87       */
88      public static final String HI_ITALIC = "italic";
89  
90      /**
91       * Constant for rendering strike-through text
92       */
93      public static final String HI_LINETHROUGH = "line-through";
94  
95      /**
96       * Constant for rendering normal text.
97       */
98      public static final String HI_NORMAL = "normal";
99  
100     /**
101      * Constant for rendering small caps
102      */
103     public static final String HI_SMALL_CAPS = "small-caps";
104 
105     /**
106      * Constant for rendering subscripts
107      */
108     public static final String HI_SUB = "sub";
109 
110     /**
111      * Constant for rendering superscripts
112      */
113     public static final String HI_SUPER = "super";
114 
115     /**
116      * Constant for rendering underlined text
117      */
118     public static final String HI_UNDERLINE = "underline";
119 
120     /**
121      * Constant for rendering upper case text
122      */
123     public static final String HI_X_CAPS = "x-caps";
124 
125     /**
126      * Constant for rendering big text
127      */
128     public static final String HI_X_BIG = "x-big";
129 
130     /**
131      * Constant for rendering small text
132      */
133     public static final String HI_X_SMALL = "x-small";
134 
135     /**
136      * Constant for rendering tt text
137      */
138     public static final String HI_X_TT = "x-tt";
139 
140     /**
141      * Constant to help narrow down what we use seg for. In this case the
142      * justify right tag
143      */
144     public static final String SEG_JUSTIFYRIGHT = "text-align: right;";
145 
146     /**
147      * Constant to help narrow down what we use seg for. In this case the
148      * justify right tag
149      */
150     public static final String SEG_JUSTIFYLEFT = "text-align: left;";
151 
152     /**
153      * Constant to help narrow down what we use seg for. In this case the thml
154      * center tag
155      */
156     public static final String SEG_CENTER = "text-align: center;";
157 
158     /**
159      * Constant to help narrow down what we use div for. In this case the thml
160      * pre tag
161      */
162     public static final String DIV_PRE = "x-pre";
163 
164     /**
165      * Constant to help narrow down what we use seg for. In this case the color
166      * tag
167      */
168     public static final String SEG_COLORPREFIX = "color: ";
169 
170     /**
171      * Constant to help narrow down what we use seg for. In this case the
172      * font-size tag
173      */
174     public static final String SEG_SIZEPREFIX = "font-size: ";
175 
176     /**
177      * Constant for x- types
178      */
179     public static final String TYPE_X_PREFIX = "x-";
180 
181     /**
182      * Constant for the study note type
183      */
184     public static final String NOTETYPE_STUDY = "x-StudyNote";
185 
186     /**
187      * Constant for the cross reference note type
188      */
189     public static final String NOTETYPE_REFERENCE = "crossReference";
190 
191     /**
192      * Constant for the variant type segment
193      */
194     public static final String VARIANT_TYPE = "x-variant";
195     public static final String VARIANT_CLASS = "x-";
196 
197     /**
198      * Constant for JSword generated content. Used for type or subType.
199      */
200     public static final String GENERATED_CONTENT = "x-gen";
201 
202     /**
203      * Constant for the pos (part of speech) type.
204      */
205     public static final String POS_TYPE = "x-pos";
206 
207     /**
208      * Constant for the def (dictionary definition) type
209      */
210     public static final String DEF_TYPE = "x-def";
211 
212     /**
213      * Constant for a Strong's numbering lemma
214      */
215     public static final String LEMMA_STRONGS = "strong:";
216     public static final String MORPH_ROBINSONS = "robinson:";
217 
218     /**
219      * Constant for Strong's numbering morphology
220      */
221     public static final String MORPH_STRONGS = "x-StrongsMorph:T";
222 
223     /**
224      * Constant to help narrow down what we use "q" for. In this case:
225      * blockquote
226      */
227     public static final String Q_BLOCK = "blockquote";
228 
229     /**
230      * Constant to help narrow down what we use "q" for. In this case: citation
231      */
232     public static final String Q_CITATION = "citation";
233 
234     /**
235      * Constant to help narrow down what we use "q" for. In this case: embedded
236      */
237     public static final String Q_EMBEDDED = "embedded";
238 
239     /**
240      * Constant to help narrow down what we use "list" for.
241      */
242     public static final String LIST_ORDERED = "x-ordered";
243     public static final String LIST_UNORDERED = "x-unordered";
244 
245     /**
246      * Table roles (on table, row and cell elements) can be "data", the default,
247      * or label.
248      */
249     public static final String TABLE_ROLE_LABEL = "label";
250 
251     /**
252      * Possible cell alignments
253      */
254     public static final String CELL_ALIGN_LEFT = "left";
255     public static final String CELL_ALIGN_RIGHT = "right";
256     public static final String CELL_ALIGN_CENTER = "center";
257     public static final String CELL_ALIGN_JUSTIFY = "justify";
258     public static final String CELL_ALIGN_START = "start";
259     public static final String CELL_ALIGN_END = "end";
260 
261     public static final String OSIS_ELEMENT_ABBR = "abbr";
262     public static final String OSIS_ELEMENT_TITLE = "title";
263     public static final String OSIS_ELEMENT_TABLE = "table";
264     public static final String OSIS_ELEMENT_SPEECH = "speech";
265     public static final String OSIS_ELEMENT_SPEAKER = "speaker";
266     public static final String OSIS_ELEMENT_ROW = "row";
267     public static final String OSIS_ELEMENT_REFERENCE = "reference";
268     public static final String OSIS_ELEMENT_NOTE = "note";
269     public static final String OSIS_ELEMENT_NAME = "name";
270     public static final String OSIS_ELEMENT_Q = "q";
271     public static final String OSIS_ELEMENT_LIST = "list";
272     public static final String OSIS_ELEMENT_P = "p";
273     public static final String OSIS_ELEMENT_ITEM = "item";
274     public static final String OSIS_ELEMENT_FIGURE = "figure";
275     public static final String OSIS_ELEMENT_FOREIGN = "foreign";
276     public static final String OSIS_ELEMENT_W = "w";
277     public static final String OSIS_ELEMENT_CHAPTER = "chapter";
278     public static final String OSIS_ELEMENT_VERSE = "verse";
279     public static final String OSIS_ELEMENT_CELL = "cell";
280     public static final String OSIS_ELEMENT_DIV = "div";
281     public static final String OSIS_ELEMENT_OSIS = "osis";
282     public static final String OSIS_ELEMENT_WORK = "work";
283     public static final String OSIS_ELEMENT_HEADER = "header";
284     public static final String OSIS_ELEMENT_OSISTEXT = "osisText";
285     public static final String OSIS_ELEMENT_SEG = "seg";
286     public static final String OSIS_ELEMENT_LG = "lg";
287     public static final String OSIS_ELEMENT_L = "l";
288     public static final String OSIS_ELEMENT_LB = "lb";
289     public static final String OSIS_ELEMENT_HI = "hi";
290 
291     public static final String ATTRIBUTE_TEXT_OSISIDWORK = "osisIDWork";
292     public static final String ATTRIBUTE_WORK_OSISWORK = "osisWork";
293     public static final String OSIS_ATTR_OSISID = "osisID";
294     public static final String OSIS_ATTR_SID = "sID";
295     public static final String OSIS_ATTR_EID = "eID";
296     public static final String ATTRIBUTE_W_LEMMA = "lemma";
297     public static final String ATTRIBUTE_FIGURE_SRC = "src";
298     public static final String ATTRIBUTE_TABLE_ROLE = "role";
299     public static final String ATTRIBUTE_CELL_ALIGN = "align";
300     public static final String OSIS_ATTR_TYPE = "type";
301     public static final String OSIS_ATTR_CANONICAL = "canonical";
302     public static final String OSIS_ATTR_SUBTYPE = "subType";
303     public static final String OSIS_ATTR_REF = "osisRef";
304     public static final String OSIS_ATTR_LEVEL = "level";
305     public static final String ATTRIBUTE_SPEAKER_WHO = "who";
306     public static final String ATTRIBUTE_Q_WHO = "who";
307     public static final String ATTRIBUTE_W_MORPH = "morph";
308     public static final String ATTRIBUTE_OSISTEXT_OSISIDWORK = "osisIDWork";
309     // OSIS defines the lang attribute as the one from the xml namespace
310     // Typical usage element.setAttribute(OSISUtil.OSIS_ATTR_LANG, lang,
311     // Namespace.XML_NAMESPACE);
312     public static final String OSIS_ATTR_LANG = "lang";
313     public static final String ATTRIBUTE_DIV_BOOK = "book";
314 
315     /**
316      * Prefix for OSIS IDs that refer to Bibles
317      */
318     private static final String OSISID_PREFIX_BIBLE = "Bible.";
319 
320     private static final Set<String> EXTRA_BIBLICAL_ELEMENTS = new HashSet<String>(Arrays.asList(new String[] {
321             OSIS_ELEMENT_NOTE, OSIS_ELEMENT_TITLE, OSIS_ELEMENT_REFERENCE
322     }));
323 
324     /**
325      * The log stream
326      */
327     private static final Logger log = LoggerFactory.getLogger(OSISUtil.class);
328 
329 
/**
 * Private on purpose: this class only offers static members, so nobody
 * should be able to create an instance of it.
 */
private OSISUtil() {
}
335 
336     private static OSISFactory factory = new OSISFactory();
337 
338     /**
339      * An accessor for the OSISFactory that creates OSIS objects
340      */
341     public static OSISFactory factory() {
342         return factory;
343     }
344 
345     /**
346      * A generic way of creating empty Elements of various types
347      */
348     public static class OSISFactory {
349         /**
350         *
351         */
352         public Element createAbbr() {
353             return new Element(OSIS_ELEMENT_ABBR);
354         }
355 
356         /**
357        *
358        */
359         public Element createSeg() {
360             return new Element(OSIS_ELEMENT_SEG);
361         }
362 
363         /**
364          *
365          */
366         public Element createOsisText() {
367             return new Element(OSIS_ELEMENT_OSISTEXT);
368         }
369 
370         /**
371          *
372          */
373         public Element createHeader() {
374             return new Element(OSIS_ELEMENT_HEADER);
375         }
376 
377         /**
378          *
379          */
380         public Element createWork() {
381             return new Element(OSIS_ELEMENT_WORK);
382         }
383 
384         /**
385          *
386          */
387         public Element createOsis() {
388             return new Element(OSIS_ELEMENT_OSIS);
389         }
390 
391         /**
392          *
393          */
394         public Element createDiv() {
395             return new Element(OSIS_ELEMENT_DIV);
396         }
397 
398         /**
399          *
400          */
401         public Element createCell() {
402             return new Element(OSIS_ELEMENT_CELL);
403         }
404 
405         /**
406          *
407          */
408         public Element createHeaderCell() {
409             Element ele = new Element(OSIS_ELEMENT_CELL);
410             ele.setAttribute(ATTRIBUTE_TABLE_ROLE, TABLE_ROLE_LABEL);
411             ele.setAttribute(ATTRIBUTE_CELL_ALIGN, CELL_ALIGN_CENTER);
412             return ele;
413         }
414 
415         /**
416          *
417          */
418         public Element createVerse() {
419             return new Element(OSIS_ELEMENT_VERSE);
420         }
421 
422         /**
423          *
424          */
425         public Element createW() {
426             return new Element(OSIS_ELEMENT_W);
427         }
428 
429         /**
430          *
431          */
432         public Element createFigure() {
433             return new Element(OSIS_ELEMENT_FIGURE);
434         }
435 
436         /**
437          *
438          */
439         public Element createForeign() {
440             return new Element(OSIS_ELEMENT_FOREIGN);
441         }
442 
443         /**
444          *
445          */
446         public Element createItem() {
447             return new Element(OSIS_ELEMENT_ITEM);
448         }
449 
450         /**
451          *
452          */
453         public Element createP() {
454             return new Element(OSIS_ELEMENT_P);
455         }
456 
457         /**
458          *
459          */
460         public Element createList() {
461             return new Element(OSIS_ELEMENT_LIST);
462         }
463 
464         /**
465          *
466          */
467         public Element createQ() {
468             return new Element(OSIS_ELEMENT_Q);
469         }
470 
471         /**
472          *
473          */
474         public Element createName() {
475             return new Element(OSIS_ELEMENT_NAME);
476         }
477 
478         /**
479          *
480          */
481         public Element createNote() {
482             return new Element(OSIS_ELEMENT_NOTE);
483         }
484 
485         /**
486          *
487          */
488         public Element createReference() {
489             return new Element(OSIS_ELEMENT_REFERENCE);
490         }
491 
492         /**
493          *
494          */
495         public Element createRow() {
496             return new Element(OSIS_ELEMENT_ROW);
497         }
498 
499         /**
500          *
501          */
502         public Element createSpeaker() {
503             return new Element(OSIS_ELEMENT_SPEAKER);
504         }
505 
506         /**
507          *
508          */
509         public Element createSpeech() {
510             return new Element(OSIS_ELEMENT_SPEECH);
511         }
512 
513         /**
514          *
515          */
516         public Element createTable() {
517             return new Element(OSIS_ELEMENT_TABLE);
518         }
519 
520         /**
521         *
522         */
523        public Element createTitle() {
524            return new Element(OSIS_ELEMENT_TITLE);
525        }
526 
527         /**
528          * Create a title marked as generated.
529          */
530         public Element createGeneratedTitle() {
531             Element title = new Element(OSIS_ELEMENT_TITLE);
532             title.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.GENERATED_CONTENT);
533             return title;
534         }
535 
536         /**
537          * Line Group
538          */
539         public Element createLG() {
540             return new Element(OSIS_ELEMENT_LG);
541         }
542 
543         /**
544          * Line
545          */
546         public Element createL() {
547             return new Element(OSIS_ELEMENT_L);
548         }
549 
550         /**
551          * Line Break
552          */
553         public Element createLB() {
554             return new Element(OSIS_ELEMENT_LB);
555         }
556 
557         /**
558          * Highlight
559          */
560         public Element createHI() {
561             return new Element(OSIS_ELEMENT_HI);
562         }
563 
564         /**
565          * Text
566          */
567         public Text createText(String text) {
568             return new Text(text);
569         }
570     }
571 
572     /**
573      * Dig past the osis and osisText element, if present, to get the meaningful
574      * content of the document.
575      * 
576      * @return a fragment
577      */
578     public static List<Content> getFragment(Element root) {
579         if (root != null) {
580             Element content = root;
581             if (OSISUtil.OSIS_ELEMENT_OSIS.equals(root.getName())) {
582                 content = root.getChild(OSISUtil.OSIS_ELEMENT_OSISTEXT);
583             }
584 
585             if (OSISUtil.OSIS_ELEMENT_OSISTEXT.equals(root.getName())) {
586                 content = root.getChild(OSISUtil.OSIS_ELEMENT_DIV);
587             }
588 
589             // At this point we are at something interesting, possibly null.
590             // If this was a semantically valid OSIS document then it is a div.
591             // As long as this node has one child dig deeper.
592             if (content != null && content.getContentSize() == 1) {
593                 Content firstChild = content.getContent(0);
594                 if (firstChild instanceof Element && OSISUtil.OSIS_ELEMENT_DIV.equals(((Element) firstChild).getName())) {
595                     content = (Element) firstChild;
596                 }
597             }
598 
599             if (content != null) {
600                 return content.getContent();
601             }
602         }
603         return new ArrayList<Content>();
604     }
605 
606     /**
607      * Get the canonical text from an osis document consisting of a single
608      * fragment. The document is assumed to be valid OSIS2.0 XML. While xml
609      * valid is rigidly defined as meaning that an xml parser can validate the
610      * document, it does not mean that the document is valid OSIS. This is a
611      * semantic problem that is not validated. This method assumes that the root
612      * element is also semantically valid.
613      * 
614      * <p>
615      * This means that the top level element's tagname is osis. This can contain
616      * either a osisText or an osisCorpus. If it is an osisCorpus, then it
617      * contains an osisText. However, as a simplification, since JSword
618      * constructs the whole doc for the fragment, osisCorpus can be ignored.
619      * <p>
620      * The osisText element contains a div element that is either a container or
621      * a milestone. Again, JSword is providing the div element and it will be
622      * provided as a container. It is this div that "contains" the actual
623      * fragment.
624      * </p>
625      * <p>
626      * A verse element may either be a container or a milestone. Sword OSIS
627      * books differ in whether they provide the verse element. Most do not. The
628      * few that do are using the container model, but it has been proposed that
629      * milestones are the best practice.
630      * </p>
631      * 
632      * <p>
633      * The fragment may contain elements that are not a part of the original
634      * text. These are things such as notes.
635      * </p>
636      * 
637      * <p>
638      * Milestones require special handling. Beginning milestones elements have
639      * an sID attribute, while ending milestones have an eID with the same value
640      * as the opening. So everything between the start and the corresponding end
641      * is the content of the element. Also, for a given element, say div, they
642      * have to be properly nested as if they were container elements.
643      * </p>
644      * 
645      * @param root
646      *            the whole osis document.
647      * @return The canonical text without markup
648      */
649     public static String getCanonicalText(Element root) {
650         StringBuilder buffer = new StringBuilder();
651 
652         // Dig past osis, osisText, if present, to get to the real content.
653         List<Content> frag = OSISUtil.getFragment(root);
654 
655         Iterator<Content> dit = frag.iterator();
656         String sID = null;
657         Content data = null;
658         Element ele = null;
659         while (dit.hasNext()) {
660             data = dit.next();
661             if (data instanceof Element) {
662                 ele = (Element) data;
663                 if (!isCanonical(ele)) {
664                     continue;
665                 }
666 
667                 if (ele.getName().equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
668                     sID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
669                 }
670 
671                 if (sID != null) {
672                     getCanonicalContent(ele, sID, dit, buffer);
673                 } else {
674                     getCanonicalContent(ele, null, ele.getContent().iterator(), buffer);
675                 }
676             } else if (data instanceof Text) {
677                 // make sure that adjacent text elements are separated by
678                 // whitespace
679                 // TODO(dms): verify that the xml parser does not split words
680                 // containing entities.
681                 int lastIndex = buffer.length() - 1;
682                 String text = ((Text) data).getText();
683                 // Ignore empty text nodes and do not add 
684                 if (text.length() != 0) {
685                     //do not add spaces when within a OSIS seg
686                     if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && !Character.isWhitespace(text.charAt(0))) {
687                         buffer.append(' ');
688                     }
689                     buffer.append(text);
690                 }
691             }
692         }
693 
694         return buffer.toString().trim();
695     }
696 
697     /**
698      * A simplified plain text version of the data in this Element with all the
699      * markup stripped out.
700      * 
701      * @return The Bible text without markup
702      */
703     public static String getPlainText(Element root) {
704         // Dig past osis, osisText, if present, to get to the real content.
705         return getTextContent(OSISUtil.getFragment(root));
706     }
707 
708     /**
709      * A space separate string containing Strong's numbers.
710      * 
711      * @return The Strong's numbers in the text
712      */
713     public static String getStrongsNumbers(Element root) {
714         return getLexicalInformation(root, false);
715     }
716 
717     /**
718      * A '@' separated list of morphologies and strong numbers
719      * @param root the osis element in question
720      * @return the string
721      */
722     public static String getMorphologiesWithStrong(Element root) {
723         return getLexicalInformation(root, true);
724     }
725 
726     /**
727      * concatenates strong and morphology information together
728      * @param root
729      * @param includeMorphology
730      * @return root of the element
731      */
732     public static String getLexicalInformation(Element root, boolean includeMorphology) {
733         StringBuilder buffer = new StringBuilder();
734 
735         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_W)) {
736             Element ele = (Element) content;
737             String attr = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_LEMMA);
738             if (attr != null) {
739                 Matcher matcher = strongsNumberPattern.matcher(attr);
740                 while (matcher.find()) {
741                     String strongsNum = matcher.group(1);
742                     if (buffer.length() > 0) {
743                         buffer.append(' ');
744                     }
745 
746                     if (includeMorphology) {
747                         //if including morphology, we want 1 big field, separated with '@'
748                         strongsNum = strongsNum.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR);
749                     }
750                     buffer.append(strongsNum);
751 
752                     if (includeMorphology) {
753                         //also include morphology if available
754                         String morph = ele.getAttributeValue(OSISUtil.ATTRIBUTE_W_MORPH);
755                         if (morph != null && morph.length() != 0) {
756                             buffer.append(MORPH_INFO_SEPARATOR);
757                             buffer.append(morph.replace(SPACE_SEPARATOR, MORPH_INFO_SEPARATOR));
758                         }
759                     }
760                 }
761             }
762         }
763 
764         return buffer.toString().trim();
765     }
766 
767     /**
768      * A space separate string containing osisID from the reference element.
769      * 
770      * @return The references in the text
771      */
772     public static String getReferences(Versification v11n, Element root) {
773         PassageKeyFactory keyf = PassageKeyFactory.instance();
774         Key collector = keyf.createEmptyKeyList(v11n);
775 
776         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_REFERENCE)) {
777             Element ele = (Element) content;
778             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_REF);
779             if (attr != null) {
780                 try {
781                     Key key = keyf.getKey(v11n, attr);
782                     collector.addAll(key);
783                 } catch (NoSuchKeyException e) {
784                     log.warn("Unable to parse: {}", attr, e);
785                 }
786             }
787         }
788 
789         return collector.getOsisID();
790     }
791 
792     /**
793      * The text of non-reference notes.
794      * 
795      * @return The references in the text
796      */
797     public static String getNotes(Element root) {
798         StringBuilder buffer = new StringBuilder();
799 
800         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_NOTE)) {
801             Element ele = (Element) content;
802             String attr = ele.getAttributeValue(OSISUtil.OSIS_ATTR_TYPE);
803             if (attr == null || !attr.equals(NOTETYPE_REFERENCE)) {
804                 if (buffer.length() > 0) {
805                     buffer.append(' ');
806                 }
807                 buffer.append(OSISUtil.getTextContent(ele.getContent()));
808             }
809         }
810 
811         return buffer.toString();
812     }
813 
814     /**
815      * The text of non-reference notes.
816      * 
817      * @return The references in the text
818      */
819     public static String getHeadings(Element root) {
820         StringBuilder buffer = new StringBuilder();
821 
822         for (Content content : getDeepContent(root, OSISUtil.OSIS_ELEMENT_TITLE)) {
823             Element ele = (Element) content;
824 
825             if (buffer.length() > 0) {
826                 buffer.append(' ');
827             }
828             buffer.append(OSISUtil.getTextContent(ele.getContent()));
829         }
830 
831         return buffer.toString();
832     }
833 
834     private static void getCanonicalContent(Element parent, String sID, Iterator<Content> iter, StringBuilder buffer) {
835         if (!isCanonical(parent)) {
836             return;
837         }
838 
839         Content data = null;
840         Element ele = null;
841         String eleName = null;
842         String eID = null;
843         while (iter.hasNext()) {
844             data = iter.next();
845             if (data instanceof Element) {
846                 ele = (Element) data;
847                 // If the milestoned element is done then quit.
848                 // This should be a eID=, that matches sID, from the same
849                 // element.
850                 eleName = ele.getName();
851                 eID = ele.getAttributeValue(OSISUtil.OSIS_ATTR_SID);
852                 if (eID != null && eID.equals(sID) && eleName.equals(parent.getName())) {
853                     break;
854                 }
855                 OSISUtil.getCanonicalContent(ele, sID, ele.getContent().iterator(), buffer);
856             } else if (data instanceof Text) {
857                 // make sure that adjacent text elements are separated by
858                 // whitespace
859                 // Empty elements also produce whitespace.
860                 // TODO(dms): verify that the xml parser does not split words
861                 // containing entities.
862                 int lastIndex = buffer.length() - 1;
863                 String text = ((Text) data).getText();
864                 if (lastIndex >= 0 && !Character.isWhitespace(buffer.charAt(lastIndex)) && (text.length() == 0 || !Character.isWhitespace(text.charAt(0)))  && !OSIS_ELEMENT_SEG.equals(parent.getName())) {
865                     buffer.append(' ');
866                 }
867                 buffer.append(text);
868             }
869         }
870     }
871 
872     private static boolean isCanonical(Content content) {
873         boolean result = true;
874         if (content instanceof Element) {
875             Element element = (Element) content;
876 
877             // Ignore extra-biblical text
878             if (EXTRA_BIBLICAL_ELEMENTS.contains(element.getName())) {
879                 String canonical = element.getAttributeValue(OSISUtil.OSIS_ATTR_CANONICAL);
880                 result = Boolean.valueOf(canonical).booleanValue();
881             }
882         }
883 
884         return result;
885     }
886 
887     private static String getTextContent(List<Content> fragment) {
888         StringBuilder buffer = new StringBuilder();
889 
890         for (Content next : fragment) {
891             recurseElement(next, buffer);
892         }
893 
894         return buffer.toString();
895     }
896 
897     /**
898      * Find all the instances of elements of type <code>find</code> under the
899      * element <code>div</code>.
900      */
901     public static Collection<Content> getDeepContent(Element div, String name) {
902         List<Content> reply = new ArrayList<Content>();
903         recurseDeepContent(div, name, reply);
904         return reply;
905     }
906 
907     /**
908      * Walk up the tree from the W to find out what verse we are in.
909      * 
910      * @param ele
911      *            The start point for our verse hunt.
912      * @return The verse we are in
913      */
914     public static Verse getVerse(Versification v11n, Element ele) throws BookException {
915         if (ele.getName().equals(OSIS_ELEMENT_VERSE)) {
916             // If the element is an OSIS Verse then this is fairly easy
917             String osisid = ele.getAttributeValue(OSIS_ATTR_OSISID);
918 
919             try {
920                 return VerseFactory.fromString(v11n, osisid);
921             } catch (NoSuchVerseException ex) {
922                 throw new BookException(JSOtherMsg.lookupText("OsisID not valid: {0}", osisid), ex);
923             }
924         }
925 
926         // So we just walk up the tree trying to find a verse
927         Parent parent = ele.getParent();
928         if (parent instanceof Element) {
929             return getVerse(v11n, (Element) parent);
930         }
931 
932         throw new BookException(JSOtherMsg.lookupText("Verse element could not be found"));
933     }
934 
935     /**
936      * Helper method to create the boilerplate headers in an OSIS document from
937      * the current metadata object
938      */
939     public static Element createOsisFramework(BookMetaData bmd) {
940         Element osis = factory().createOsis();
941         String osisid = bmd.getInitials();
942 
943         Element work = factory().createWork();
944         work.setAttribute(ATTRIBUTE_WORK_OSISWORK, osisid);
945 
946         Element header = factory().createHeader();
947         header.addContent(work);
948 
949         Element text = factory().createOsisText();
950         text.setAttribute(ATTRIBUTE_TEXT_OSISIDWORK, OSISID_PREFIX_BIBLE + osisid);
951         text.addContent(header);
952 
953         osis.addContent(text);
954 
955         return osis;
956     }
957 
958     /**
959      * Convert a Difference list into a pretty HTML report.
960      * 
961      * @param diffs
962      *            List of Difference objects
963      * @return HTML representation
964      */
965     public static List<Content> diffToOsis(List<Difference> diffs) {
966         Element div = factory().createDiv();
967 
968         for (int x = 0; x < diffs.size(); x++) {
969             Difference diff = diffs.get(x);
970             EditType editType = diff.getEditType(); // Mode (delete, equal,
971                                                     // insert)
972             Text text = factory.createText(diff.getText()); // Text of change.
973 
974             if (EditType.DELETE.equals(editType)) {
975                 Element hi = factory().createHI();
976                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_LINETHROUGH);
977                 hi.addContent(text);
978                 div.addContent(hi);
979             } else if (EditType.INSERT.equals(editType)) {
980                 Element hi = factory().createHI();
981                 hi.setAttribute(OSISUtil.OSIS_ATTR_TYPE, OSISUtil.HI_UNDERLINE);
982                 hi.addContent(text);
983                 div.addContent(hi);
984             } else {
985                 div.addContent(text);
986             }
987         }
988         return div.cloneContent();
989     }
990 
991     public static List<Content> rtfToOsis(String rtf) {
992         Element div = factory().createDiv();
993         Stack<Content> stack = new Stack<Content>();
994         stack.push(div);
995 
996         int strlen = rtf.length();
997 
998         StringBuilder text = new StringBuilder(strlen);
999 
1000        int i = 0;
1001        for (i = 0; i < strlen; i++) {
1002            char curChar = rtf.charAt(i);
1003            if (curChar != '\\') {
1004                text.append(curChar);
1005                continue;
1006            }
1007
1008            // The following are ordered from most to least common
1009            // and when one is a prefix of another, it follows.
1010
1011            // Used to end all open attributes. Only \qc in our implementation.
1012            if (rtf.startsWith("\\pard", i)) {
1013                Element currentElement = (Element) stack.pop();
1014                currentElement.addContent(text.toString());
1015                text.delete(0, text.length());
1016                stack.clear();
1017                stack.push(div);
1018                i += (i + 5 < strlen && rtf.charAt(i + 5) == ' ') ? 5 : 4;
1019                continue;
1020            }
1021
1022            // Simulate a paragraph break.
1023            if (rtf.startsWith("\\par", i)) {
1024                Element currentElement = (Element) stack.peek();
1025                currentElement.addContent(text.toString());
1026                text.delete(0, text.length());
1027                currentElement.addContent(OSISUtil.factory.createLB());
1028                i += (i + 4 < strlen && rtf.charAt(i + 4) == ' ') ? 4 : 3;
1029                continue;
1030            }
1031
1032            // OSIS does not have the notion of centered text.
1033            // So we define our own
1034            if (rtf.startsWith("\\qc", i)) {
1035                Element centerDiv = OSISUtil.factory.createDiv();
1036                centerDiv.setAttribute(OSIS_ATTR_TYPE, "x-center");
1037                Element currentElement = (Element) stack.peek();
1038                currentElement.addContent(text.toString());
1039                text.delete(0, text.length());
1040                currentElement.addContent(centerDiv);
1041                stack.push(centerDiv);
1042                // skip following space, if any
1043                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1044                continue;
1045            }
1046
1047            // convert Unicode representations to Unicode
1048            if (rtf.startsWith("\\u", i)) {
1049                StringBuilder buf = new StringBuilder();
1050                i += 2;
1051                while (i < strlen) {
1052                    char curDigit = rtf.charAt(i);
1053                    if (curDigit != '-' && !Character.isDigit(curDigit)) {
1054                        break;
1055                    }
1056                    buf.append(curDigit);
1057                    i++;
1058                }
1059                // At this point:
1060                // buf contains the numeric representation of the number, 16-bit
1061                // signed
1062                // charAt(i) is the substitution character if Unicode is not
1063                // supported
1064                int value = Integer.parseInt(buf.toString());
1065                if (value < 0) {
1066                    value += 65536;
1067                }
1068                text.append((char) value);
1069                // don't advance since i is on the substitute character.
1070                continue;
1071            }
1072
1073            // close italic and bold
1074            if (rtf.startsWith("\\i0", i) || rtf.startsWith("\\b0", i)) {
1075                Element currentElement = (Element) stack.pop();
1076                currentElement.addContent(text.toString());
1077                text.delete(0, text.length());
1078                i += (i + 3 < strlen && rtf.charAt(i + 3) == ' ') ? 3 : 2;
1079                continue;
1080            }
1081
1082            // Skip escaped whitespace
1083            if (rtf.startsWith(" ", i) || rtf.startsWith("\n", i)) {
1084                i += 1;
1085                continue;
1086            }
1087
1088            // start italic
1089            if (rtf.startsWith("\\i", i)) {
1090                Element hiElement = OSISUtil.factory.createHI();
1091                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_ITALIC);
1092                Element currentElement = (Element) stack.peek();
1093                currentElement.addContent(text.toString());
1094                text.delete(0, text.length());
1095                currentElement.addContent(hiElement);
1096                stack.push(hiElement);
1097                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1098                continue;
1099            }
1100
1101            // start bold
1102            if (rtf.startsWith("\\b", i)) {
1103                Element hiElement = OSISUtil.factory.createHI();
1104                hiElement.setAttribute(OSIS_ATTR_TYPE, HI_BOLD);
1105                Element currentElement = (Element) stack.peek();
1106                currentElement.addContent(text.toString());
1107                text.delete(0, text.length());
1108                currentElement.addContent(hiElement);
1109                stack.push(hiElement);
1110                i += (i + 2 < strlen && rtf.charAt(i + 2) == ' ') ? 2 : 1;
1111                continue;
1112            }
1113
1114        }
1115
1116        // If there is any text that has not been consumed
1117        if (text.length() > 0) {
1118            div.addContent(text.toString());
1119        }
1120        // div.addContent(text.toString());
1121        // // If the fragment is already in a document, then use that.
1122        // Document doc = div.getDocument();
1123        // if (doc == null)
1124        // {
1125        // doc = new Document(div);
1126        // }
1127        // SAXEventProvider ep = new JDOMSAXEventProvider(doc);
1128        // ContentHandler osis = new
1129        // PrettySerializingContentHandler(FormatType.CLASSIC_INDENT);
1130        // try
1131        // {
1132        // ep.provideSAXEvents(osis);
1133        // }
1134        // catch (SAXException e)
1135        // {
1136        // e.printStackTrace();
1137        // }
1138        // System.err.println(osis.toString());
1139        return div.cloneContent();
1140    }
1141
1142    /**
1143     * Find all the instances of elements of type <code>find</code> under the
1144     * element <code>div</code>. For internal use only.
1145     */
1146    private static void recurseDeepContent(Element start, String name, List<Content> reply) {
1147        if (start.getName().equals(name)) {
1148            reply.add(start);
1149        }
1150
1151//        Content data = null;
1152        Element ele = null;
1153        for (Content data : start.getContent()) {
1154            if (data instanceof Element) {
1155                ele = (Element) data;
1156                recurseDeepContent(ele, name, reply);
1157            }
1158        }
1159    }
1160
1161    /**
1162     * If we have a String just add it to the buffer, but if we have an Element
1163     * then try to dig the strings out of it.
1164     */
1165    private static void recurseElement(Object sub, StringBuilder buffer) {
1166        if (sub instanceof Text) {
1167            buffer.append(((Text) sub).getText());
1168        } else if (sub instanceof Element) {
1169            recurseChildren((Element) sub, buffer);
1170        } else {
1171            log.error("unknown type: {}", sub.getClass().getName());
1172        }
1173    }
1174
1175    /**
1176     * Helper to extract the Strings from a nest of JDOM elements
1177     * 
1178     * @param ele
1179     *            The JDOM Element to dig into
1180     * @param buffer
1181     *            The place we accumulate strings.
1182     */
1183    private static void recurseChildren(Element ele, StringBuilder buffer) {
1184        // ele is a JDOM Element that might have a getContent() method
1185        for (Content sub : ele.getContent()) {
1186            recurseElement(sub, buffer);
1187        }
1188    }
1189
    /**
     * Regular expression for a Strong's number reference: the literal
     * "strong:" followed by one captured group made of a G or H (either
     * case), one or more digits, an optional '!' and then any number of
     * ASCII letters.
     */
    // NOTE(review): this and the three fields below are never reassigned in
    // the code visible here; they look like candidates for "final" - confirm
    // against the rest of the class.
    private static String strongsNumber = "strong:([GgHh][0-9]+!?[A-Za-z]*)";
    /** Compiled form of {@link #strongsNumber}. */
    private static Pattern strongsNumberPattern = Pattern.compile(strongsNumber);
    /**
     * Regular expression for a Robinson's code reference: the literal
     * "robinson:" followed by one captured group made of an ASCII letter and
     * then any number of ASCII letters or hyphens.
     */
    private static String robinsons = "robinson:([a-zA-Z][-a-zA-Z]*)";
    /** Compiled form of {@link #robinsons}. */
    private static Pattern robinsonsPattern = Pattern.compile(robinsons);
1194}
1195