[sword-cvs] icu-sword/source/data/brkitr char.txt,1.1,1.2 line.txt,1.1,1.2 line_th.txt,1.1,1.2 sent.txt,1.1,1.2 title.txt,1.1,1.2 word.txt,1.1,1.2 word_th.txt,1.1,1.2

sword@www.crosswire.org sword@www.crosswire.org
Tue, 6 Apr 2004 03:11:08 -0700


Update of /cvs/core/icu-sword/source/data/brkitr
In directory www:/tmp/cvs-serv8911/source/data/brkitr

Modified Files:
	char.txt line.txt line_th.txt sent.txt title.txt word.txt 
	word_th.txt 
Log Message:
ICU 2.8 sync

Index: char.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/char.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- char.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ char.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -17,7 +17,7 @@
 $LF = \n;
 $Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
 
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+$Extend     = [[:Grapheme_Extend = TRUE:]];
 
 #
 # Korean Syllable Definitions
@@ -31,15 +31,31 @@
 
 $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
 
-#
-#  Forward Break Rules
-#
+## -------------------------------------------------
+
+!!forward;
+
 $CR $LF;
 ([^$Control] | $HangulSyllable) $Extend*;
-.;
 
+## -------------------------------------------------
 
-#
-#  Reverse Rule, back up to the beginning of some preceding grapheme cluster.
-#
-! ($Extend | $V | $T )*   ($LF $CR | ($LV | $LVT)?$L* | .);
+!!reverse;
+
+$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
+$BackOneCluster = ($LF $CR) | ($Extend* ([^$Control] | $BackHangulSyllable));
+$BackOneCluster;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# rule 6, 7, 8
+$V+ $L;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 6, 7, 8
+$V+ $T;

Index: line.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/line.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- line.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ line.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -12,6 +12,10 @@
 #  Character Classes defined by TR 14.
 #
 
+!!chain;
+!!LBCMNoChain;
+!!lookAheadHardBreak;
+
 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
 $BA = [:LineBreak =  Break_After:];
@@ -29,6 +33,7 @@
 $IN = [:LineBreak =  Inseperable:];
 $IS = [:LineBreak =  Infix_Numeric:];
 $LF = [:LineBreak =  Line_Feed:];
+$NL = [:LineBreak =  Next_Line:];
 $NS = [:LineBreak =  Nonstarter:];
 $NU = [:LineBreak =  Numeric:];
 $OP = [:LineBreak =  Open_Punctuation:];
@@ -39,105 +44,343 @@
 $SG = [:LineBreak =  Surrogate:];
 $SP = [:LineBreak =  Space:];
 $SY = [:LineBreak =  Break_Symbols:];
+$WJ = [:LineBreak =  Word_Joiner:];
 $XX = [:LineBreak =  Unknown:];
 $ZW = [:LineBreak =  ZWSpace:];
 
 
 #
-#  Character classes from TR 29.  Needed for finding characters.
-#
+# Korean Syllable Definitions
 #
-$Extend  = [:Grapheme_Extend = TRUE:];
+$L   = [:Hangul_Syllable_Type = L:];
+$V   = [:Hangul_Syllable_Type = V:];
+$T   = [:Hangul_Syllable_Type = T:];
+
+$LV  = [:Hangul_Syllable_Type = LV:];
+$LVT = [:Hangul_Syllable_Type = LVT:];
 
+$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
 
 #
-#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width) and
-#                               SA  (South East Asian: Thai, Lao, Khmer) as $AL  (Alphabetic)
+#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
+#                               SA  (South East Asian: Thai, Lao, Khmer)
+#                               XX  (Unknown, unassigned)
+#                         as $AL  (Alphabetic)
 #
-$ALPlus = $AL | $AI | $SA;
+$ALPlus = $AL | $AI | $SA | $XX;
 
 #
 #  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
 #
 $ALcm = $ALPlus $CM*;
-$IDcm = ($ID $CM* | $SP $CM+);
-$NUcm = $NU $Extend*;
-$HYcm = $HY $Extend*;
-$QUcm = $QU $Extend*;
-$POcm = $PO $Extend*;
-$OPcm = $OP $Extend*;
-$BAcm = $BA $Extend*;
-$BBcm = $BB $Extend*;
-$NScm = $NS $Extend*;
-$GLcm = $GL $Extend*;
-$B2cm = $B2 $Extend*;
-$INcm = $IN $Extend*;
-
+$BAcm = $BA $CM*;
+$BBcm = $BB $CM*;
+$B2cm = $B2 $CM*;
+$CLcm = $CL $CM*;
+$EXcm = $EX $CM*;
+$GLcm = $GL $CM*;
+$HYcm = $HY $CM*;
+$IDcm = ($ID | $HangulSyllable) $CM*;
+$INcm = $IN $CM*;
+$IScm = $IS $CM*;
+$NScm = $NS $CM*;
+$NUcm = $NU $CM*;
+$OPcm = $OP $CM*;
+$POcm = $PO $CM*;
+$PRcm = $PR $CM*;
+$QUcm = $QU $CM*;
+$SPcm = $SP $CM*;
+$SYcm = $SY $CM*;
+$WJcm = $WJ $CM*;
 
-#  New Lines.  Always break after, never break before.
-#              Rule LB 3
 #
-#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
-#              Because we never break before these things, $Endings
-#              appears at the end of line break rule.
+#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
 #
-$NLF = $BK | $CR | $LF | $CR $LF;
-$Endings = $SP* $ZW* $NLF?;
+$ALPlus $CM+;
+$BA $CM+;
+$BB $CM+;
+$B2 $CM+;
+$CL $CM+;
+$EX $CM+;
+$GL $CM+;
+$HY $CM+;
+$ID $CM+;
+$IN $CM+;
+$IS $CM+;
+$NS $CM+;
+$NU $CM+;
+$OP $CM+;
+$PO $CM+;
+$PR $CM+;
+$QU $CM+;
+$SP $CM+;
+$SY $CM+;
+$WJ $CM+;
 
+## -------------------------------------------------
 
-#
-#  Openings  Sequences that can precede Words, and that should not be separated from them.
-#            Rules LB 9, 10
-#
-$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
+!!forward;
 
 #
-#  Closings  Seqences that follow words, and that should not be separated from them,
-#            Rule LB 8, 11, 15
-$Closings =  ($SP*( ($CL ($SP* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm)*;
+#  Rule LB 3
+$LB3Breaks = [$BK $CR $LF $NL];
+$LB3NonBreaks = [^$BK $CR $LF $NL];
+$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
 
-#
-#  Words.  Includes mixed Alpha-numerics.
-#          Rules 11a, 16, 17, 19, more or less.
-#
-$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;  
-$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number     18 
-$Word   = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?))  ;           # Alpha-numeric.   16, 17 
-$Dashes = (($B2cm $SP*)*);                                            # Dashes           11a   
-        
-        
+$LB3NonBreaks?     $LB3Breaks {100};
+$LB5NonBreaks $CM* $LB3Breaks {100};
+$CR $LF {100};
 
+# LB 4         x SP
+#              x ZW
+$ZW [$SP $ZW];
+$LB5NonBreaks $CM* [$SP $ZW];
 
- 
- 
-        
-$Word15 = ($BBcm* ($Word | $Number | $Dashes)? ($BAcm | $HYcm | $NScm)*) |  # Rule 15. Stuff sticks around words.
-          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |                 # Allow characters that don't meet the
-          [^$BK $CR $LF $ZW $SP $GL ];                                   #  more elaborate definitions for WORD
-                                                                    #  to be glued.
-        
-$GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
-                                                                    # Rules 13, 14
+# LB 5         Break after zero width space
+$LB5Breaks = [$LB3Breaks $ZW];
 
+# LB 6
 #
-#  The actual rule, a combination of everything defined above.
+# Korean Syllable Definitions
 #
-$Openings $GluedWord  $Closings $Endings;
-# $GluedWord;
 
+($HangulSyllable) $CM*;
 
+# LB 7     Combining marks.      $SP $CM needs to behave like $ID.
+#                                X   $CM needs to behave like X, where X is not $SP.   
+#                                $CM not covered by the above needs to behave like $AL   
+#                                
+$LB5NonBreaks $CM+;    #  Stick together any combining sequences that don't match other rules.
 
+# LB 8
+$LB5NonBreaks $CM* $CL;
+$LB5NonBreaks $CM* $EX;
+$LB5NonBreaks $CM* $IS;
+$LB5NonBreaks $CM* $SY;
+
+# LB 9
+$OPcm $SP* .?;
+$OPcm $SP* $LB5NonBreaks $CM*;
+
+# LB 10
+$QUcm $SP* $OPcm;
+
+# LB 11
+$CLcm $SP* $NScm;
+
+# LB 11a
+($B2cm)+;
+
+# LB 11b
+$LB5NonBreaks $CM* ($GLcm | $WJcm);
+($GLcm | $WJcm) .?;
+
+# LB 12
+$LB12NonBreaks = [$LB5NonBreaks - $SP];
+
+# LB 14
+$LB12NonBreaks $CM* $QUcm+ .?;
+$LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*;
+$SP $CM+            $QUcm+ .?;                      # LB7a  SP CM+ behaves as ID
+$SP $CM+            $QUcm+ $LB5NonBreaks $CM*;
 
+$QUcm $LB3NonBreaks?;
+$QUcm $LB5NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
+
+# LB 14a
+$LB14NonBreaks = [$LB12NonBreaks - $CB];
+$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
+
+# LB 15
+$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
+$BBcm [^$CB];
+$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
+
+# LB 16
+$ALcm    $INcm;
+$CM+     $INcm;     #  by rule 7c, any otherwise unattached CM behaves as AL
+$IDcm    $INcm;
+$SP $CM+ $INcm;     # by rule 7a, $SP $CM behaves like ID
+$INcm    $INcm;
+$NUcm    $INcm;
+
+
+# $LB 17
+($IDcm | $SP $CM+) $POcm;
+$ALcm+ $NUcm;       # includes $LB19
+$CM+   $NUcm;       # Rule 7c
+$NUcm $ALcm+;
+
+# LB 18
+$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
+
+# LB 19
+$CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL
 
 #
 #  Reverse Rules.
 #
-#     Back up to a hard break or a space that will cause a boundary.
-#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
-#     containing a space that may inhibit a break from occuring.
-#
+## -------------------------------------------------
 
-$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
-$ClumpingChars = [^$SP $BK $CR $LF];
+!!reverse;
 
-!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HY;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $SP;
+$CM+ $SY;
+$CM+ $WJ;
+
+# LB 3
+
+$LB3Breaks $LB3NonBreaks;
+$LB3Breaks $CM* $LB5NonBreaks;
+$LF $CR;
+
+# LB 4         x SP
+#              x ZW
+[$SP $ZW] $LB3NonBreaks;
+[$SP $ZW] $CM* $LB5NonBreaks;
+
+# LB 5 Break after zero width space
+
+# LB 6 Jamo is treated like an alphabet
+
+$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
+$CM* $BackHangulSyllable;
+
+# LB 7 Combining marks.
+#    $SP $CM needs to behave like $ID.
+#    X   $CM needs to behave like X, where X is not $SP.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $LB5NonBreaks;
+
+# LB 8
+$CL $CM* $LB5NonBreaks;
+$EX $CM* $LB5NonBreaks;
+$IS $CM* $LB5NonBreaks;
+$SY $CM* $LB5NonBreaks;
+
+# LB 9
+$LB5NonBreaks $SP* $CM* $OP;
+
+# LB 10
+$CM* $OP $SP* $CM* $QU;
+
+# LB 11
+$CM* $NS $SP* $CM* $CL;
+
+# LB 11a
+($CM* $B2)+;
+
+# LB 11b
+$CM* ($GL | $WJ) $CM* $LB5NonBreaks;
+$CM* $LB5NonBreaks $CM* ($GL | $WJ);
+. $CM* ($GL | $WJ);
+
+# LB 12
+
+# LB 14
+$CM* $QU $CM* $LB12NonBreaks;
+$CM* $QU $CM+ $SP;
+$CM* $LB5NonBreaks $CM* $QU;
+
+# LB 14a
+$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
+
+# LB 15
+$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
+($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks;
+[$CR $LF $BK $NL $ZW] $CM* $BB;
+$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
+
+# LB 16
+$CM* $IN $CM* $ALPlus;
+# by rule 7c, any otherwise unattached CM behaves as AL
+$CM* $IN $CM+ / $LB5Breaks;
+
+$CM* $IN $CM* ($ID | $CM $SP);
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;
+
+# $LB 17
+$CM* $PO $CM* ($ID | $CM $SP);
+$CM* $NU ($CM* $ALPlus)+; # includes $LB19
+$CM* $NU $CM+ / $LB5Breaks;        # Rule 7c
+
+$CM* $ALPlus $CM* $NU;
+
+# LB 18
+($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?;
+
+# LB 19
+$CM* $ALPlus $CM* $ALPlus;
+# The $CM* is from rule 7C, and unattached CM is treated as AL
+$CM* $ALPlus $CM+ / $LB5Breaks;
+
+## Problem: the state table can't handle lookahead when it is at the
+## start of the string; this is currently handled in the rbbi code.
+## TODO: fix this.
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# LB 6
+$V+ $L;
+
+# LB 7
+$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
+$CM+ $SP / .;
+
+# LB 9
+$SP+ $CM* $OP;
+
+# LB 10
+$SP+ $CM* $QU;
+
+# LB 11
+$SP+ $CM* $CL;
+
+# LB 18
+($CM* $IS)+ $CM* $NU;
+$CL $CM* ($NU | $IS);
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# LB 6
+$V+ $T;
+
+# LB 7
+[^$BK $CR $LF $NL $ZW $SP] $CM+;
+$SP $CM+ / [^$CM];
+
+# LB 9
+$OP $CM* $SP+;
+
+# LB 10
+$QU $CM* $SP+;
+
+# LB 11
+$CL $CM* $SP+;
+
+# LB 18
+$HY $CM* $NU;
+$IS $CM* $CL;

Index: line_th.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/line_th.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- line_th.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ line_th.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -1,4 +1,4 @@
-# Copyright (c) 2002, International Business Machines Corporation and
+# Copyright (c) 2002-2003, International Business Machines Corporation and
 # others. All Rights Reserved.
 #
 #  file:  line.txt
@@ -266,29 +266,29 @@
 #
 $Extend     =   # From UNIDATA/DerivedCoreProperties.txt
 	[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
-	\u05BB-\u05BD \u05BF   \u05C1-\u05C2 \u05C4   \u064B-\u0655 \u0670   \u06D6-\u06DC
-	\u06DE   \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711   \u0730-\u074A
-	\u07A6-\u07B0 \u0901-\u0902 \u0903   \u093C   \u093E-\u0940 \u0941-\u0948
-	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981   \u0982-\u0983 \u09BC
-	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7   \u09E2-\u09E3
-	\u0A02   \u0A3C   \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
-	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83   \u0ABC   \u0ABE-\u0AC0 \u0AC1-\u0AC5
-	\u0AC7-\u0AC8 \u0AC9   \u0ACB-\u0ACC \u0B01   \u0B02-\u0B03 \u0B3C   \u0B3E
-	\u0B3F   \u0B40   \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56   \u0B57
-	\u0B82   \u0BBE-\u0BBF \u0BC0   \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
+	\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
+	\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
+	\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
+	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
+	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
+	\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
+	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
+	\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
+	\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
+	\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
 	\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
-	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE   \u0CBF   \u0CC0-\u0CC4 \u0CC6
-	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC   \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
-	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57   \u0D82-\u0D83 \u0DCF-\u0DD1
-	\u0DD2-\u0DD4 \u0DD6   \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31   \u0E34-\u0E39
-	\u0E47-\u0E4E \u0EB1   \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
-	\u0F35   \u0F37   \u0F39   \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F   \u0F80-\u0F84
-	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6   \u102C   \u102D-\u1030 \u1031
-	\u1032   \u1036-\u1037 \u1038   \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
+	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
+	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
+	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
+	\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
+	\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
+	\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
+	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
+	\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
 	\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
-	\u17BE-\u17C5 \u17C6   \u17C7-\u17C8 \u17C9-\u17D1 \u17D3   \u180B-\u180D
-	\u18A9   \u20D0-\u20DC \u20DD-\u20E0 \u20E1   \u20E2-\u20E4 \u20E5-\u20EA
-	\u302A-\u302F \u3099-\u309A \uFB1E   \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
+	\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
+	\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
+	\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
 	\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 
 	\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
 
@@ -347,21 +347,21 @@
 #  Words.  Includes mixed Alpha-numerics.
 #          Rules 11a, 16, 17, 19, more or less.
 #
-$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;  
-$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number     18 
-$Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));       # Alpha-numeric.   16, 17 
-$Dashes         = (($B2cm $SPcm*)*);                                    # Dashes           11a   
+$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
+$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number     18
+$Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));       # Alpha-numeric.   16, 17
+$Dashes         = (($B2cm $SPcm*)*);                                    # Dashes           11a
 $ThaiRange      = $dictionary+ | $thai_etc;
 $WordLikeThing  = $Number | $Word | $Dashes | $ThaiRange;
-        
 
 
-        
+
+
 $Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) |     # Rule 15. Stuff sticks around words.
-          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |                 # Allow characters that don't meet the
-          [^$BK $CR $LF $ZW $SP $GL ];                                  #  more elaborate definitions for WORD
-                                                                    #  to be glued.
-        
+          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |             # Allow characters that don't meet the
+          [^$BK $CR $LF $ZW $SP $GL ];                              #  more elaborate definitions for WORD to be glued.
+
+
 $GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                     # Rules 13, 14
 
@@ -371,10 +371,10 @@
 $Openings $GluedWord  $Closings $paiyannoi? $EndingsMandatory;
 $Openings $GluedWord  $Closings  $Endings;
 
-$Openings $GluedWord  $Closings $paiyannoi   /  
+$Openings $GluedWord  $Closings $paiyannoi   / 
                ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
-     
-     
+
+
  #"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
  #                       + "\u0e25[^$paiyannoi$_ignore_]);"
 

Index: sent.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/sent.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- sent.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ sent.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -2,13 +2,13 @@
 #   Copyright (C) 2002-2003, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
-#   file:  sent.txt   
+#   file:  sent.txt
 #
 #   ICU Sentence Break Rules
 #      See Unicode Standard Annex #29.
 #      These rules are based on TR 29 version 4.0.0
 #
-    
+
 
 #
 # Character categories as defined in TR 29
@@ -21,21 +21,21 @@
 $OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
 $Numeric = [:LineBreak = Numeric:];
 
-$ATerm = [.];  
+$ATerm = [.];
 
 $Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
           \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 
           \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
-          
+
 $Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
            [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
-           
-           
+
+
 
 # Define extended forms of the character classes,
 #   incorporate grapheme cluster + format chars.
 
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
+$Extend     = [[:Grapheme_Extend = TRUE:]];
 $ATermEx    = $ATerm   $Extend* $Format*;
 $NumericEx  = $Numeric $Extend* $Format*;
 $UpperEx    = $Upper   $Extend* $Format*;
@@ -49,6 +49,9 @@
 # $InteriorChars are those that never trigger a following break.
 $InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars
 
+## -------------------------------------------------
+
+!!forward;
 
 # Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
 $NumberFollows = $InteriorChars* $ATermEx $NumericEx;
@@ -64,24 +67,86 @@
 # Rules 3, 9, 10, 11
 #                       Matches a simple sentence, or the trailing part of a complex sentence,
 #                       where a simple sentence contains no interior "."s.
-$EndSequence       = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? |
-                     $InteriorChars* $SepSeq?;
+$TermEndSequence   = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
+$EndSequence       = $InteriorChars* $SepSeq?;
 
+# Put them all together.
+($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $TermEndSequence{0};   # status = UBRK_SENTENCE_TERM
+($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence{100};     # status = UBRK_SENTENCE_SEP
 
+## -------------------------------------------------
 
-# Put them all together.  
-($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence;
+!!reverse;
 
-     
-#
-#  Reverse Rules
-#
-$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
-$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
-$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
-$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
-$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
+# rule 6
 
-! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
-#! .*;
- 
+$RULE6 = $Numeric $Format* $Extend* $ATerm;
+
+# rule 7
+
+$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;
+
+# rule 8
+
+$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])* 
+             ($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
+             $Format* $Extend* $ATerm;
+
+# rule 9, 10, 11
+
+# $CR $LF
+$End = $Sep | \u000a\u000d
+       | $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* 
+		 $Extend* ($Term | $ATerm)
+	   | $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* 
+		 $Extend* ($Term | $ATerm);
+	
+# rule 12
+
+$RULE12 = [^$Sep $Term $ATerm];
+
+$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;
+
+$End;
+
+$End? $Join [$RULE12 - $Sp - $Close];
+
+# forces a break at the beginning of text "$Sp blah blah blah"
+# remember the break iterator takes the longest match
+$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
+
+# forces a break at the beginning of text "$Close blah blah blah"
+$End? $Join $Close / [^$Term $ATerm $Close];
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# rule 4
+$Extend+ [^$Extend];
+
+# rule 7
+$Extend* $ATerm $Format* $Extend* $Upper;
+
+# rule 8
+($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;
+
+# rule 11
+($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
+($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 7
+
+$ATerm $Extend* $Format* $Upper;
+
+# rule 8
+
+$Lower .;
+
+# rule 11
+
+($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;
\ No newline at end of file

Index: title.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/title.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- title.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ title.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -11,7 +11,7 @@
 #
 #  If the iterator was not stopped on a cased character, advance it to the first cased char
 #
-($NotCased | $CaseIgnorable)*;
+$NotCased+;
 
 #
 #  If the iterator starts on a cased item, advance through all adjacent cased items plus
@@ -22,5 +22,11 @@
 #
 #  Reverse Rules
 #
-!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;
 
+! $NotCased+;
+
+#
+#  If the iterator starts on a cased item, advance through all adjacent cased items plus
+#    any non-cased stuff, to reach the start of the next word.
+#
+! $NotCased* ($Cased | $CaseIgnorable)* $Cased;
\ No newline at end of file

Index: word.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/word.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- word.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ word.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -1,132 +1,234 @@
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
-#       All Rights Reserved.
+# Copyright (C) 2002-2003,
+# International Business Machines Corporation and others.
+# All Rights Reserved.
 #
-#   file:  word.txt   
+# file:  word.txt
 #
-#   ICU Word Break Rules
+# ICU Word Break Rules
 #      See Unicode Standard Annex #29.
 #      These rules are based on Version 4.0.0, dated 2003-04-17
 #
 
-
-
-####################################################################################
+##############################################################################
 #
 #  Character class definitions from TR 29
 #
-####################################################################################
-$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
-                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
-                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+##############################################################################
 
+!!chain;
+!!LBCMNoChain;
+
+$Katakana  = [[:Script = KATAKANA:]
+			  [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
+			  [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
+
+
+$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
+						   - [:Ideographic:]
+						   - $Katakana
+						   - [:Script = Thai:]
+						   - [:Script = Lao:]
+						   - [:Script = Hiragana:]];
+
+$ABaseLetter = [$ALetter - [:Grapheme_Extend = TRUE:]];
+$ACMLetter   = [$ALetter & [:Grapheme_Extend = TRUE:]];
+
+$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
+			  [:name = HEBREW PUNCTUATION GERSHAYIM:]
+			  [:name = RIGHT SINGLE QUOTATION MARK:]
+			  [:name = HYPHENATION POINT:]];
 
-$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
-                           - [:Ideographic:]
-                           - $Katakana
-                           - [:Script = Thai:]
-                           - [:Script = Lao:]
-                           - [:Script = Hiragana:]];
-                           
-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
-              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
-              
 $MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
 
 $MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
 $Numeric   = [:LineBreak = Numeric:];
 
-
 #
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
-$CR         = \u000d;
-$LF         = \u000a;
-$Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
 
+$CR      = \u000d;
+$LF      = \u000a;
+$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
+$Extend  = [[:Grapheme_Extend = TRUE:]];
+$Format  = [[:Cf:]];
+$Hiragana = [:Hiragana:];
+$Ideographic = [:IDEOGRAPHIC:];
 
+## -------------------------------------------------
 
+!!forward;
 
-####################################################################################
-#
-#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
-#
-####################################################################################
+$CR $LF;
 
-$Format    = [[:Cf:]];
+# rule 3 and 4
 
+$ALetterEx     = $ALetter     $Extend*;
+$ABaseLetterEx = $ABaseLetter $Extend*;
+$ACMLetterEx   = $ACMLetter   $Extend*;
+$NumericEx     = $Numeric     $Extend*;
+$MidNumEx      = $MidNum      $Extend*;
+$MidNumLetEx   = $MidNumLet   $Extend*;
+$MidLetterEx   = $MidLetter   $Extend*;
+$KatakanaEx    = $Katakana    $Extend*;
 
+# see character breaks
 
-# Rule 3:  Treat a grapheme cluster as if it were a single character.
-#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
-#          because we don't need to find the boundaries between adjacent syllables -
-#          they won't be word boundaries.
-#
+[^$Control] $Extend*;
 
+# rule 5
 
-#
-#  "Extended"  definitions.  Grapheme Cluster + Format Chars, treated like the base char.
-#
-$ALetterEx    = $ALetter   $Extend*; 
-$NumericEx    = $Numeric   $Extend*;
-$MidNumEx     = $MidNum    $Extend*;
-$MidNumLetEx  = $MidNumLet $Extend*;
-$MidLetterEx  = $MidLetter $Extend*;
-$KatakanaEx   = $Katakana  $Extend*;
-$FormatEx     = $Format    $Extend*;
+$ALetterEx ($Format* $ALetterEx)* {200};
 
+# rule 6 and 7
 
-#
-#  Numbers.  Rules 8, 11, 12 form the TR.
-#
-$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*;
-$NumberSequence {100};
+$MidALetterEx = ($ABaseLetterEx | $Format $ACMLetterEx);
 
-#
-#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
-#     - must include at least one letter. 
-#     - may include both letters and numbers.
-#     - may include  MideLetter, MidNumber punctuation.
-#
-$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*;     # rules #6, #7
-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
+$ALetterSeq =
+$ALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
 
-#
-#  Do not break between Katakana.   Rule #13.
-#
-$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
-[:Hiragana:] $Extend* {300};
+$MidALetterSeq =
+$MidALetterEx
+(
+    $Format* ($MidLetterEx | $MidNumLetEx) $Format* $MidALetterEx
+)*;
 
-#
-#  Ideographic Characters.  Stand by themselves as words.
-#                           Separated from the "Everything Else" rule, below, only so that they
-#                           can be tagged with a return value.   TODO:  is this what we want?
-#
-[:IDEOGRAPHIC:] $Extend* {400};
+# rule 8
 
-#
-#  Everything Else, with no tag.
-#                   Non-Control chars combine with $Extend (combining) chars.
-#                   Controls are do not.
-#
-[^$Control [:Ideographic:]] $Extend*;
-$CR $LF;
+$NumericEx ($Format* $NumericEx)* {100};
 
-#
-#  Reverse Rules.   Back up over any of the chars that can group together.
-#                   (Reverse rules do not need to be exact; they can back up  too far,
-#                   but must back up at least enough, and must stop on a boundary.)
-#
+# rule 9
 
-# NonStarters are the set of all characters that can appear at the 2nd - nth position of
-#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
-#    reaches something that can only be the start (and probably only) char in a "word".
-#    A space or punctuation meets the test.
-#
-$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
+$ALetterSeq ($Format* ($NumericEx | $MidALetterSeq))* {200};
 
-#!.*;
-! ($NonStarters* | \n \r) .;
+# rule 10
+
+$NumericEx ($Format* $MidALetterSeq)+ ($Format* $NumericEx)* {200};
+
+# rule 11 and 12 
+
+$NumericEx ($Format* ($MidNumEx | $MidNumLetEx) $Format* $NumericEx)+ {100};
+
+# rule 13
+
+$KatakanaEx ($Format* $KatakanaEx)* {300};
+$Hiragana $Extend* {300};
+$Ideographic $Extend* {400};
+
+## -------------------------------------------------
+
+!!reverse;
+
+$BackALetterEx     = $Extend* $ALetter;
+$BackABaseLetterEx = $Extend* $ABaseLetter;
+$BackACMLetterEx   = $Extend* $ACMLetter;
+$BackNumericEx     = $Extend* $Numeric;
+$BackMidNumEx      = $Extend* $MidNum;
+$BackMidNumLetEx   = $Extend* $MidNumLet;
+$BackMidLetterEx   = $Extend* $MidLetter;
+$BackKatakanaEx    = $Extend* $Katakana;
+
+$LF $CR;
+
+# see character breaks
+
+$Extend* [^$Control];
+
+# rule 5
+
+($BackALetterEx $Format*)* $BackABaseLetterEx;
+($BackALetterEx $Format*)* $BackACMLetterEx / $Control;
+
+# rule 6 and 7
+
+$BackMidALetterEx = ($BackABaseLetterEx | $BackACMLetterEx $Format);
+
+$BackALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackABaseLetterEx;
+
+$BackMidALetterSeq =
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackMidALetterEx;
+
+# rule 8
+
+$BackNumericEx $Format* $BackNumericEx;
+
+# rule 9
+
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* $BackALetterSeq;
+
+# to handle letter sequences ending with a combining mark
+(($BackNumericEx | $BackMidALetterSeq) $Format*)* 
+(
+    $BackMidALetterEx $Format* ($BackMidLetterEx | $BackMidNumLetEx) $Format*
+)*
+$BackACMLetterEx / $Control;
+
+# rule 10
+
+($BackNumericEx $Format*)* ($BackMidALetterSeq $Format*)* $BackNumericEx;
+
+# rule 11 and 12
+
+$BackNumericEx $Format* ($BackMidNumEx | $BackMidNumLetEx) $Format* $BackNumericEx;
+
+# rule 13
+
+$BackKatakanaEx $Format* $BackKatakanaEx;
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+# rule 3
+$Extend+ [^$Extend];
+
+# rule 4
+$Format+ $BackABaseLetterEx;
+$Format+ $BackACMLetterEx / $Control;
+$Format+ $BackNumericEx;
+$Format+ $BackMidLetterEx;
+$Format+ $BackMidNumLetEx;
+$Format+ $BackMidNumEx;
+$Format+ $BackKatakanaEx;
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
+($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Control;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $BackNumericEx;
+
+## -------------------------------------------------
+
+!!safe_forward;
+
+# rule 3
+$Extend+;
+
+# rule 4
+$Format+ $ALetterEx;
+$Format+ $NumericEx;
+$Format+ $MidLetterEx;
+$Format+ $MidNumLetEx;
+$Format+ $MidNumEx;
+$Format+ $KatakanaEx;
+
+# rule 6
+($MidLetter | $MidNumLet) $Format* $ALetterEx;
+
+# rule 11
+($MidNum | $MidNumLet) $Format* $NumericEx;

Index: word_th.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/data/brkitr/word_th.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- word_th.txt	10 Sep 2003 02:42:05 -0000	1.1
+++ word_th.txt	6 Apr 2004 10:08:10 -0000	1.2
@@ -1,4 +1,4 @@
-# Copyright (c) 2002, International Business Machines Corporation and
+# Copyright (c) 2002-2003, International Business Machines Corporation and
 # others. All Rights Reserved.
 #
 #  word.txt    Word Breaking Rules for ICU Rules Based Break Iterator.
@@ -11,7 +11,7 @@
 #
 #  Definition of $Ideographic is from TR14, Line Breaking.
 #
-$Ideographic = 
+$Ideographic =
       [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
         \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
         \u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
@@ -31,35 +31,34 @@
 # These definitions are from the character break rules.
 #
 $CGJ = [\u034f];   #Combining Grapheme Joiner
-$Link       = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2]; 
-$NotControl = [^[:Zl:] [:Zp:] [:Cc:]];  #Line Separator,
-                                        #Paragraph Separtor,
-                                        # General Category == Control
+$Link       = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
+$NotControl = [^[:Zl:] [:Zp:] [:Cc:]];  #Line Separator, Paragraph Separator, General Category == Control
+
 $Extend     =   # From UNIDATA/DerivedCoreProperties.txt
 	[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
-	\u05BB-\u05BD \u05BF   \u05C1-\u05C2 \u05C4   \u064B-\u0655 \u0670   \u06D6-\u06DC
-	\u06DE   \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711   \u0730-\u074A
-	\u07A6-\u07B0 \u0901-\u0902 \u0903   \u093C   \u093E-\u0940 \u0941-\u0948
-	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981   \u0982-\u0983 \u09BC
-	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7   \u09E2-\u09E3
-	\u0A02   \u0A3C   \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
-	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83   \u0ABC   \u0ABE-\u0AC0 \u0AC1-\u0AC5
-	\u0AC7-\u0AC8 \u0AC9   \u0ACB-\u0ACC \u0B01   \u0B02-\u0B03 \u0B3C   \u0B3E
-	\u0B3F   \u0B40   \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56   \u0B57
-	\u0B82   \u0BBE-\u0BBF \u0BC0   \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
+	\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
+	\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
+	\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
+	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
+	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
+	\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
+	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
+	\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
+	\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
+	\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
 	\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
-	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE   \u0CBF   \u0CC0-\u0CC4 \u0CC6
-	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC   \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
-	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57   \u0D82-\u0D83 \u0DCF-\u0DD1
-	\u0DD2-\u0DD4 \u0DD6   \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31   \u0E34-\u0E39
-	\u0E47-\u0E4E \u0EB1   \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
-	\u0F35   \u0F37   \u0F39   \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F   \u0F80-\u0F84
-	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6   \u102C   \u102D-\u1030 \u1031
-	\u1032   \u1036-\u1037 \u1038   \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
+	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
+	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
+	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
+	\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
+	\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
+	\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
+	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
+	\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
 	\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
-	\u17BE-\u17C5 \u17C6   \u17C7-\u17C8 \u17C9-\u17D1 \u17D3   \u180B-\u180D
-	\u18A9   \u20D0-\u20DC \u20DD-\u20E0 \u20E1   \u20E2-\u20E4 \u20E5-\u20EA
-	\u302A-\u302F \u3099-\u309A \uFB1E   \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
+	\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
+	\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
+	\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
 	\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 
 	\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
 
@@ -72,30 +71,30 @@
 $L  = [\u1100-\u115f];
 $V  = [\u1160-\u11a2];
 $T  = [\u11a8-\u11f9];
-$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4 
-		\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64 
-		\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124 
-		\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4 
-		\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4 
-		\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664 
-		\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824 
-		\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4 
-		\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4 
-		\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64 
-		\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24 
-		\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4 
-		\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4 
-		\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464 
-		\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624 
-		\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4 
-		\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4 
-		\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64 
-		\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24 
-		\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4 
-		\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4 
-		\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264 
-		\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424 
-		\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4 
+$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
+		\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
+		\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
+		\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
+		\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
+		\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
+		\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
+		\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
+		\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
+		\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
+		\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
+		\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
+		\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
+		\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
+		\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
+		\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
+		\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
+		\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
+		\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
+		\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
+		\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
+		\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
+		\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
+		\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
 		\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
 $LVT = [[\uac00-\ud7a3] - $LV];
 $Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
@@ -129,7 +128,7 @@
 $CGJSequence     = $CGJ+ ($Base | $Hangul_Sequence);
 $Join_Control    = [\u200d-\u200e];        # Zero Width Joiner, Zero Width Non-Joiner
 $LinkSequence    = $Link+ $Extend* $Join_Control? $LetterBase;
-$LetterEx        = ($Letter | $Hangul_Sequence) $Extend*  ((($LinkSequence | $CGJSequence) $Extend*)*); 
+$LetterEx        = ($Letter | $Hangul_Sequence) $Extend*  ((($LinkSequence | $CGJSequence) $Extend*)*);
 
 
 
@@ -142,8 +141,8 @@
 $InfixNumeric    = [\u002c \u002e \u003a \u003b \u0589];
 $PostfixNumeric  = [\%     \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
                     \u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
-$PrefixNumeric   = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]]; 
-              
+$PrefixNumeric   = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
+
 $NumericPrefix   = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
 $NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;