[jsword-svn] r1053 - trunk/jsword/src/main/java/org/crosswire/jsword/examples

dmsmith at crosswire.org dmsmith at crosswire.org
Wed Mar 22 02:31:14 MST 2006


Author: dmsmith
Date: 2006-03-22 02:31:09 -0700 (Wed, 22 Mar 2006)
New Revision: 1053

Modified:
   trunk/jsword/src/main/java/org/crosswire/jsword/examples/BibleToOsis.java
Log:
KJV 2003 fixup example.

Modified: trunk/jsword/src/main/java/org/crosswire/jsword/examples/BibleToOsis.java
===================================================================
--- trunk/jsword/src/main/java/org/crosswire/jsword/examples/BibleToOsis.java	2006-03-17 20:28:06 UTC (rev 1052)
+++ trunk/jsword/src/main/java/org/crosswire/jsword/examples/BibleToOsis.java	2006-03-22 09:31:09 UTC (rev 1053)
@@ -524,10 +524,58 @@
         input = input.replaceAll("\"transChange\"", "\"x-transChange\""); //$NON-NLS-1$ //$NON-NLS-2$
         input = input.replaceAll("\"type:", "\"x-"); //$NON-NLS-1$ //$NON-NLS-2$
         input = input.replaceAll("changeType=\"", "type=\""); //$NON-NLS-1$ //$NON-NLS-2$
+        input = input.replaceAll("x-StudyNote", "study"); //$NON-NLS-1$ //$NON-NLS-2$
+        input = input.replaceAll("\\s*</q>", "</q>"); //$NON-NLS-1$ //$NON-NLS-2$
+        
+        // normalize paragraph markers and move them from the end of a verse to the beginning of the next
         input = input.replaceAll("<milestone type=\"x-p\"\\s*/>", "<milestone type=\"x-p\" marker=\"\u00B6\"/>"); //$NON-NLS-1$ //$NON-NLS-2$
         input = input.replaceAll("<p/>", "<milestone type=\"x-p\" marker=\"\u00B6\"/>"); //$NON-NLS-1$ //$NON-NLS-2$
-        input = input.replaceAll("x-StudyNote", "study"); //$NON-NLS-1$ //$NON-NLS-2$
-        input = input.replaceAll("\\s*</q>", "</q>"); //$NON-NLS-1$ //$NON-NLS-2$
+        if (input.contains("<milestone type=\"x-p\" marker=\"\u00B6\"/>")) //$NON-NLS-1$
+        {
+            input = input.replaceAll("<milestone type=\"x-p\" marker=\"\u00B6\"/>", ""); //$NON-NLS-1$ //$NON-NLS-2$
+            moveP = true;
+//            System.err.println(osisID + " remove \u00b6"); //$NON-NLS-1$
+        }
+        else if (moveP)
+        {
+            input = "<milestone type=\"x-p\" marker=\"\u00B6\"/>" + input; //$NON-NLS-1$
+            moveP = false;
+        }
+
+        // # is used in a note for a greek strong's #
+        input = input.replace('#', 'G');
+        // used in a note as a quotation mark at the beginning of a word. i.e. `not'
+        input = input.replace('`', '\'');
+        // used in notes as a space
+        input = input.replace('_', ' ');
+        // used in notes to indicate italics. These are incomplete GBF codes.
+        input = input.replaceAll("[{][Ff][iI][}]", ""); //$NON-NLS-1$ //$NON-NLS-2$
+        // found an email address in a note
+        input = input.replace("@hotmail.", "at hotmail dot "); //$NON-NLS-1$ //$NON-NLS-2$
+
+        if (osisID.equals("Exod.32.32")) //$NON-NLS-1$
+        {
+            input = input.replace("<w morph=\"strongMorph:TH8798\" lemma=\"strong:H04229\">--; ", //$NON-NLS-1$
+                                  "\u2015; <w morph=\"strongMorph:TH8798\" lemma=\"strong:H04229\">"); //$NON-NLS-1$
+        }
+
+        if (osisID.equals("Ezek.26.16")) //$NON-NLS-1$
+        {
+            input = input.replace("\\pa", ""); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+
+        if (osisID.equals("Matt.5.30")) //$NON-NLS-1$
+        {
+            input = input.replace("<w src=\"10\" lemma=\"strong:G846\" morph=\"robinson:P-ASF\">if</w>", //$NON-NLS-1$
+                                  "<w src=\"10\" lemma=\"strong:G846\" morph=\"robinson:P-ASF\">it</w>"); //$NON-NLS-1$
+        }
+
+        if (osisID.equals("Matt.16.17")) //$NON-NLS-1$
+        {
+            input = input.replace("<w src=\"10\" lemma=\"strong:G920\" morph=\"robinson:ARAM\">Bar</w><w src=\"11\" lemma=\"strong:G920\" morph=\"robinson:ARAM\">jona</w>", //$NON-NLS-1$
+                                  "<w src=\"10 11\" lemma=\"strong:G920\" morph=\"robinson:ARAM\">Bar\u2013jona</w>"); //$NON-NLS-1$
+        }
+
         if (osisID.equals("Matt.24.38")) //$NON-NLS-1$
         {
             input = input.replace("<w src=\"18\" lemma=\"strong:G3739\" morph=\"robinson:R-GSF\"><w src=\"7\" lemma=\"strong:G3588\" morph=\"robinson:T-DPF\">that</w></w>", //$NON-NLS-1$
@@ -1184,14 +1232,26 @@
             before.removeAll(after);
             System.err.println(osisID + ": Problems with w src attribute. Missing: " + before); //$NON-NLS-1$
         }
-        return analyzeApostrophe(osisID, input);
+        input = fixApostrophe(osisID, input);
+        input = fixPunctuation(osisID, input);
+        return input;
     }
 
-    private String analyzeApostrophe(String osisID, String input)
+    private String fixApostrophe(String osisID, String input)
     {
         Matcher matcher;
         boolean changed = false;
+        Verse v = null;
 
+        try
+        {
+            v = VerseFactory.fromString(osisID);
+        }
+        catch (NoSuchVerseException e)
+        {
+            return input;
+        }
+
         if (input.indexOf('\'') == -1)
         {
             return input;
@@ -1243,16 +1303,6 @@
 //            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
         }
 
-        Verse v = null;
-        try
-        {
-            v = VerseFactory.fromString(osisID);
-        }
-        catch (NoSuchVerseException e)
-        {
-            return input;
-        }
-
         // for the ot only
         if (SwordConstants.getTestament(v) == SwordConstants.TESTAMENT_OLD)
         {
@@ -1336,9 +1386,9 @@
         if (matcher.find())
         {
             String replace = matcher.group(1) + "s</w>" + matcher.group(2); //$NON-NLS-1$
-//            input = input.replace(matcher.group(), replace);
+            input = input.replace(matcher.group(), replace);
             changed = true;
-            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
         }
 
         matcher = a15Pattern.matcher(input);
@@ -1362,6 +1412,15 @@
             }
         }
 
+        matcher = a17Pattern.matcher(input);
+        if (matcher.find())
+        {
+            String replace = matcher.group(1) + 's' + matcher.group(2);
+            input = input.replace(matcher.group(), replace);
+            changed = true;
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+
 //        matcher = axPattern.matcher(input);
 //        if (matcher.find())
 //        {
@@ -1376,6 +1435,98 @@
         return input;
     }
 
+    private String fixPunctuation(String osisID, String input) // tidies whitespace and punctuation placement in one verse's markup and returns the result
+    { // osisID is referenced only by the commented-out diagnostics below
+        Matcher matcher = w1Pattern.matcher(input); // w1: a whitespace character directly before , ; : . ? or !
+        while (matcher.find()) // the matcher keeps scanning the text it was created with; replace() rewrites every occurrence in the current input
+        {
+            String replace = matcher.group(1);
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+        // w2: whitespace before a closing parenthesis; the space ends up after the parenthesis instead
+        matcher = w2Pattern.matcher(input);
+        while (matcher.find())
+        {
+            String replace = ") "; //$NON-NLS-1$
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+        // w3: every newline, carriage return and tab becomes a plain space
+        matcher = w3Pattern.matcher(input);
+        while (matcher.find())
+        {
+            String replace = " "; //$NON-NLS-1$
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+        // w4: whitespace (or the literal text {Punct}) just before </w>; the last captured piece is re-emitted after the closing tag
+        matcher = w4Pattern.matcher(input);
+        while (matcher.find())
+        {
+            String replace = "</w>" + matcher.group(1); //$NON-NLS-1$
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+        // w5: punctuation or spaces right after an opening <w ...> tag are moved in front of the tag (first match only)
+        matcher = w5Pattern.matcher(input);
+        if (matcher.find())
+        {
+            String replace = matcher.group(2) + matcher.group(1);
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+
+        // strip trailing spaces (the here > 0 guard keeps empty or all-space input from indexing position -1)
+        int length = input.length();
+        int here = length;
+        while (here > 0 && input.charAt(here - 1) == ' ')
+        {
+            here--;
+        }
+
+        if (here < length)
+        {
+            input = input.substring(0, here);
+//            if (length - here > 1)
+//            {
+//                System.err.println(osisID + " remove " + (length - here) + " trailing spaces"); //$NON-NLS-1$ //$NON-NLS-2$
+//            }
+        }
+        // w6: punctuation or spaces following an empty <w ...></w> element are moved in front of it; reset() rescans the rewritten text
+        matcher = w6Pattern.matcher(input);
+        while (matcher.find())
+        {
+            String replace = matcher.group(2) + matcher.group(1);
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+            matcher.reset(input);
+        }
+
+        // strip leading spaces (the length guard keeps an empty string from being indexed at position 0)
+        here = 0;
+        while (here < input.length() && input.charAt(here) == ' ')
+        {
+            here++;
+        }
+
+        if (here > 0)
+        {
+            input = input.substring(here);
+//            System.err.println(osisID + " remove " + here + " leading spaces"); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+        // wn: a run of two or more whitespace characters becomes one space; NOTE(review): only the text of the first run found is replaced, so runs of a different size may remain
+        matcher = wnPattern.matcher(input);
+        if (matcher.find())
+        {
+            String replace = " "; //$NON-NLS-1$
+            input = input.replace(matcher.group(), replace);
+//            System.err.println(osisID + " replace |" + matcher.group() + "| with |" + replace + '|'); //$NON-NLS-1$ //$NON-NLS-2$
+        }
+
+        return input;
+    }
+
     private static FieldPosition pos = new FieldPosition(0);
 
     private static String preVerseStart = "<title subtype=\"x-preverse\" type=\"section\">"; //$NON-NLS-1$
@@ -1414,8 +1565,20 @@
     private static Pattern a14Pattern = Pattern.compile("(\\w+[^Ss]')</w>(.)"); //$NON-NLS-1$
     private static Pattern a15Pattern = Pattern.compile("(husband') "); //$NON-NLS-1$
     private static Pattern a16Pattern = Pattern.compile("(cockatrice')s"); //$NON-NLS-1$
-    private static Pattern axPattern = Pattern.compile(".....s'[^ < ].........."); //$NON-NLS-1$
+    private static Pattern a17Pattern = Pattern.compile("(ass')([^s])"); //$NON-NLS-1$
+//    private static Pattern axPattern = Pattern.compile(".....[sS]'[^sS< \\.].........."); //$NON-NLS-1$
 
+    private static Pattern w1Pattern = Pattern.compile("\\s([,;:.?!])"); //$NON-NLS-1$ one whitespace character followed by , ; : . ? or ! (group 1 is the punctuation mark)
+    private static Pattern w2Pattern = Pattern.compile("\\s\\)"); //$NON-NLS-1$ one whitespace character followed by a closing parenthesis
+    private static Pattern w3Pattern = Pattern.compile("[\n\r\t]"); //$NON-NLS-1$ a single newline, carriage return or tab
+    private static Pattern w4Pattern = Pattern.compile("(\\{Punct}|\\s)+</w>"); //$NON-NLS-1$ NOTE(review): as written this matches the literal text {Punct}, not the \p{Punct} class - confirm intent before changing; group 1 keeps only the last repetition
+    private static Pattern wxxPattern = Pattern.compile("([!\"#$%&'()*+,-./:;=?@^_`{|}~])"); //$NON-NLS-1$ a single punctuation character; not referenced in the hunks shown in this change
+    private static Pattern w5Pattern = Pattern.compile("(<w\\s[^>]*>)([!\"#$%&'()*+,-./:;=?@^_`{|}~ ]+)"); //$NON-NLS-1$ group 1: an opening <w ...> tag; group 2: the punctuation or spaces right after it
+    private static Pattern w6Pattern = Pattern.compile("(<w\\s[^>]*></w>)([!\"#$%&'()*+,-./:;=?@^_`{|}~ ]+)"); //$NON-NLS-1$ group 1: an empty <w ...></w> element; group 2: the punctuation or spaces right after it
+    private static Pattern wnPattern = Pattern.compile("\\s\\s+"); //$NON-NLS-1$ a run of two or more whitespace characters
+
+    private boolean moveP = false;
+
     private Writer writer;
     private String filename;
 }



More information about the jsword-svn mailing list