[sword-cvs] icu-sword/source/data/brkitr char.txt,NONE,1.1 line.txt,NONE,1.1 line_th.txt,NONE,1.1 sent.txt,NONE,1.1 thaidict.brk,NONE,1.1 title.txt,NONE,1.1 word.txt,NONE,1.1 word_th.txt,NONE,1.1

sword@www.crosswire.org sword@www.crosswire.org
Tue, 9 Sep 2003 19:42:08 -0700


Update of /usr/local/cvsroot/icu-sword/source/data/brkitr
In directory www:/tmp/cvs-serv19862/source/data/brkitr

Added Files:
	char.txt line.txt line_th.txt sent.txt thaidict.brk title.txt 
	word.txt word_th.txt 
Log Message:
ICU 2.6 commit

--- NEW FILE: char.txt ---
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  char.txt 
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on TR29 Version 4.0.0
#

#
#  Character Class Definitions.
#    The names are those from TR29.
#
$CR = \r;
$LF = \n;
$Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];

$Extend     = [[:Grapheme_Extend = TRUE:]]; 

#
# Korean Syllable Definitions
#
# One class per Hangul_Syllable_Type property value.
# L, V and T are the leading-consonant, vowel and trailing-consonant jamo.
$L   = [:Hangul_Syllable_Type = L:];
$V   = [:Hangul_Syllable_Type = V:];
$T   = [:Hangul_Syllable_Type = T:];

# LV and LVT are the precomposed syllable types.
$LV  = [:Hangul_Syllable_Type = LV:];
$LVT = [:Hangul_Syllable_Type = LVT:];

# A syllable sequence is one of:
#   - one or more L;
#   - zero or more L, then a core (an optional LV followed by one or more V,
#     or a single LV, or a single LVT), then zero or more T;
#   - one or more T.
$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;

#
#  Forward Break Rules
#
# CR immediately followed by LF is kept together as one cluster.
$CR $LF;
# A Hangul syllable sequence, or any one character outside $Control,
# followed by zero or more $Extend characters.
([^$Control] | $HangulSyllable) $Extend*;
# Fallback: any single character on its own.
.;


#
#  Reverse Rule, back up to the beginning of some preceding grapheme cluster.
#
# The leading '!' marks the reverse rule.  It matches any run of $Extend, $V
# and $T characters, followed by one of: the pair LF CR; an optional LV or LVT
# followed by zero or more L; or any single character.
# NOTE(review): the pattern is presumably written in the order characters are
# met while moving backward through the text, which would explain why a CR LF
# pair appears here as $LF $CR - confirm.
! ($Extend | $V | $T )*   ($LF $CR | ($LV | $LVT)?$L* | .);

--- NEW FILE: line.txt ---
# Copyright (c) 2002-2003  International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#         Line Breaking Rules
#         Implement default line breaking as defined by Unicode TR 14.
#


#
#  Character Classes defined by TR 14.
#

$AI = [:LineBreak =  Ambiguous:];
$AL = [:LineBreak =  Alphabetic:];
$BA = [:LineBreak =  Break_After:];
$BB = [:LineBreak =  Break_Before:];
$BK = [:LineBreak =  Mandatory_Break:];
$B2 = [:LineBreak =  Break_Both:];
$CB = [:LineBreak =  Contingent_Break:];
$CL = [:LineBreak =  Close_Punctuation:];
$CM = [:LineBreak =  Combining_Mark:];
$CR = [:LineBreak =  Carriage_Return:];
$EX = [:LineBreak =  Exclamation:];
$GL = [:LineBreak =  Glue:];
$HY = [:LineBreak =  Hyphen:];
$ID = [:LineBreak =  Ideographic:];
# NOTE(review): "Inseperable" (sic) is kept exactly as written because it is the
# property value name handed to the property lookup.  Confirm that the targeted
# ICU/Unicode version accepts another spelling before "correcting" it.
$IN = [:LineBreak =  Inseperable:];
$IS = [:LineBreak =  Infix_Numeric:];
$LF = [:LineBreak =  Line_Feed:];
$NS = [:LineBreak =  Nonstarter:];
$NU = [:LineBreak =  Numeric:];
$OP = [:LineBreak =  Open_Punctuation:];
$PO = [:LineBreak =  Postfix_Numeric:];
$PR = [:LineBreak =  Prefix_Numeric:];
$QU = [:LineBreak =  Quotation:];
$SA = [:LineBreak =  Complex_Context:];
$SG = [:LineBreak =  Surrogate:];
$SP = [:LineBreak =  Space:];
$SY = [:LineBreak =  Break_Symbols:];
$XX = [:LineBreak =  Unknown:];
$ZW = [:LineBreak =  ZWSpace:];


#
#  Character classes from TR 29.  Needed for finding characters.
#
#
$Extend  = [:Grapheme_Extend = TRUE:];


#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width) and
#                               SA  (South East Asian: Thai, Lao, Khmer) as $AL  (Alphabetic)
#
$ALPlus = $AL | $AI | $SA;

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
#
# Only $ALcm and $IDcm absorb trailing $CM characters; every other class in
# this group absorbs trailing $Extend characters instead.
$ALcm = $ALPlus $CM*;
# Besides an ideograph with its marks, $IDcm also accepts a space followed by
# at least one $CM.
$IDcm = ($ID $CM* | $SP $CM+);
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;


#  New Lines.  Always break after, never break before.
#              Rule LB 3
#
#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
#              Because we never break before these things, $Endings
#              appears at the end of line break rule.
#
# $NLF: one hard line ending - BK, CR, LF, or the two-character sequence CR LF.
$NLF = $BK | $CR | $LF | $CR $LF;
# $Endings: any spaces, then any zero-width spaces, then at most one $NLF.
# Every part is optional, so $Endings can also match nothing at all.
$Endings = $SP* $ZW* $NLF?;


#
#  Openings  Sequences that can precede Words, and that should not be separated from them.
#            Rules LB 9, 10
#
$Openings = (($QUcm $SP*)? $OPcm $SP*)*;

#
#  Closings  Sequences that follow words, and that should not be separated from them,
#            Rule LB 8, 11, 15
$Closings =  ($SP*( ($CL ($SP* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm)*;

#
#  Words.  Includes mixed Alpha-numerics.
#          Rules 11a, 16, 17, 19, more or less.
#
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;  
$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number     18 
$Word   = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?))  ;           # Alpha-numeric.   16, 17 
$Dashes = (($B2cm $SP*)*);                                            # Dashes           11a   
        
        


 
 
        
$Word15 = ($BBcm* ($Word | $Number | $Dashes)? ($BAcm | $HYcm | $NScm)*) |  # Rule 15. Stuff sticks around words.
          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |                 # Allow characters that don't meet the
          [^$BK $CR $LF $ZW $SP $GL ];                                   #  more elaborate definitions for WORD
                                                                    #  to be glued.
        
$GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                    # Rules 13, 14

#
#  The actual rule, a combination of everything defined above.
#
$Openings $GluedWord  $Closings $Endings;
# $GluedWord;





#
#  Reverse Rules.
#
#     Back up to a hard break or a space that will cause a boundary.
#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
#     containing a space that may inhibit a break from occurring.
#

# $SpaceGlue: a space that does not end a line, in one of two shapes:
#   - one of ZW, CL, IS, NS or OP, then any $Extend characters and one space;
#   - one or more groups of (any $Extend characters plus a space), then OP.
# NOTE(review): presumably written in backward (reverse-text) order, like the
# reverse rule that uses it - confirm.
$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
# Any character that is neither a space nor a hard line ending (BK, CR, LF).
$ClumpingChars = [^$SP $BK $CR $LF];

# Reverse rule (leading '!'): two arbitrary characters, a run of clumping
# characters, any number of glued-space groups each followed by more clumping
# characters, then either any one character or the pair LF CR.
!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);

--- NEW FILE: line_th.txt ---
# Copyright (c) 2002, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line_th.txt
#
#         Line Breaking Rules for ICU rules based break iteration.
#         Implement default line breaking as defined by Unicode TR 14.
#


#
#  Character Classes defined by Unicode TR 14.
#  These are generated by a script from the Unicode LineBreak derived
#  properties file.
#

############  Start of Script-Generated Definitions   #######################

$LF = [ \u000A];

$IN = [ \u2024-\u2026];

$SY = [ \u002F];

$EX = [ \u0021 \u003F \u2762-\u2763 \uFE56-\uFE57 \uFF01 \uFF1F];

$BA = [ \u0009 \u007C \u00AD \u058A \u0F0B \u1361 \u1680 \u17D5 \u2000-\u2006
        \u2008-\u200A \u2010 \u2012-\u2013 \u2027 \u205F];

$IS = [ \u002C \u002E \u003A-\u003B \u0589];

$BB = [ \u00B4 \u02C8 \u02CC \u1806];

$SA = [ \u0E01-\u0E30 \u0E32-\u0E33 \u0E40-\u0E46 \u0E81-\u0E82 \u0E84 \u0E87-\u0E88
        \u0E8A \u0E8D \u0E94-\u0E97 \u0E99-\u0E9F \u0EA1-\u0EA3 \u0EA5
        \u0EA7 \u0EAA-\u0EAB \u0EAD-\u0EB0 \u0EB2-\u0EB3 \u0EBD \u0EC0-\u0EC4
        \u0EC6 \u0EDC-\u0EDD \u1000-\u1021 \u1023-\u1027 \u1029-\u102A
        \u1050-\u1055 \u1780-\u17B3];

$CB = [ \uFFFC];

$XX = [ \uE000-\uF8FF \U000F0000-\U000FFFFD \U00100000-\U0010FFFD];

$HY = [ \u002D];

$AI = [ \u00A1 \u00A7-\u00A8 \u00AA \u00B2-\u00B3 \u00B6-\u00BA \u00BC-\u00BF
        \u00C6 \u00D0 \u00D7-\u00D8 \u00DE-\u00E1 \u00E6 \u00E8-\u00EA
        \u00EC-\u00ED \u00F0 \u00F2-\u00F3 \u00F7-\u00FA \u00FC \u00FE
        \u0101 \u0111 \u0113 \u011B \u0126-\u0127 \u012B \u0131-\u0133
        \u0138 \u013F-\u0142 \u0144 \u0148-\u014A \u014D \u0152-\u0153
        \u0166-\u0167 \u016B \u01CE \u01D0 \u01D2 \u01D4 \u01D6 \u01D8
        \u01DA \u01DC \u0251 \u0261 \u02C7 \u02C9-\u02CB \u02CD \u02D0
        \u02D8-\u02DB \u02DD \u0391-\u03A1 \u03A3-\u03A9 \u03B1-\u03C1
        \u03C3-\u03C9 \u0401 \u0410-\u044F \u0451 \u2015-\u2016 \u2020-\u2021
        \u203B \u2074 \u207F \u2081-\u2084 \u2105 \u2113 \u2121-\u2122
        \u212B \u2140 \u2154-\u2155 \u215B \u215E \u2160-\u216B \u2170-\u2179
        \u2190-\u2199 \u21D2 \u21D4 \u2200 \u2202-\u2203 \u2207-\u2208
        \u220B \u220F \u2211 \u2215 \u221A \u221D-\u2220 \u2223 \u2225
        \u2227-\u222C \u222E \u2234-\u2237 \u223C-\u223D \u2248 \u224C
        \u2252 \u2260-\u2261 \u2264-\u2267 \u226A-\u226B \u226E-\u226F
        \u2282-\u2283 \u2286-\u2287 \u2295 \u2299 \u22A5 \u22BF \u2312
        \u2460-\u24BF \u24D0-\u24E9 \u24EB-\u24FE \u2500-\u254B \u2550-\u2574
        \u2580-\u258F \u2592-\u2595 \u25A0-\u25A1 \u25A3-\u25A9 \u25B2-\u25B3
        \u25B6-\u25B7 \u25BC-\u25BD \u25C0-\u25C1 \u25C6-\u25C8 \u25CB
        \u25CE-\u25D1 \u25E2-\u25E5 \u25EF \u2605-\u2606 \u2609 \u260E-\u260F
        \u2616-\u2617 \u261C \u261E \u2640 \u2642 \u2660-\u2661 \u2663-\u2665
        \u2667-\u266A \u266C-\u266D \u266F \uFFFD];

$ZW = [ \u200B];

$SG = [ \uD800-\uDFFF];

$AL = [ \u0023 \u0026 \u002A \u003C-\u003E \u0040-\u005A \u005E-\u007A \u007E
        \u00A6 \u00A9 \u00AC \u00AE-\u00AF \u00B5 \u00C0-\u00C5 \u00C7-\u00CF
        \u00D1-\u00D6 \u00D9-\u00DD \u00E2-\u00E5 \u00E7 \u00EB \u00EE-\u00EF
        \u00F1 \u00F4-\u00F6 \u00FB \u00FD \u00FF-\u0100 \u0102-\u0110
        \u0112 \u0114-\u011A \u011C-\u0125 \u0128-\u012A \u012C-\u0130
        \u0134-\u0137 \u0139-\u013E \u0143 \u0145-\u0147 \u014B-\u014C
        \u014E-\u0151 \u0154-\u0165 \u0168-\u016A \u016C-\u01CD \u01CF
        \u01D1 \u01D3 \u01D5 \u01D7 \u01D9 \u01DB \u01DD-\u0220 \u0222-\u0233
        \u0250 \u0252-\u0260 \u0262-\u02AD \u02B0-\u02C6 \u02CE-\u02CF
        \u02D1-\u02D7 \u02DC \u02DE-\u02EE \u0374-\u0375 \u037A \u037E
        \u0384-\u038A \u038C \u038E-\u0390 \u03AA-\u03B0 \u03C2 \u03CA-\u03CE
        \u03D0-\u03F6 \u0400 \u0402-\u040F \u0450 \u0452-\u0482 \u048A-\u04CE
        \u04D0-\u04F5 \u04F8-\u04F9 \u0500-\u050F \u0531-\u0556 \u0559-\u055F
        \u0561-\u0587 \u05BE \u05C0 \u05C3 \u05D0-\u05EA \u05F0-\u05F4
        \u060C \u061B \u061F \u0621-\u063A \u0640-\u064A \u066A-\u066F
        \u0671-\u06D5 \u06E5-\u06E6 \u06E9 \u06FA-\u06FE \u0700-\u070D
        \u0710 \u0712-\u072C \u0780-\u07A5 \u07B1 \u0905-\u0939 \u093D
        \u0950 \u0958-\u0961 \u0964-\u0965 \u0970 \u0985-\u098C \u098F-\u0990
        \u0993-\u09A8 \u09AA-\u09B0 \u09B2 \u09B6-\u09B9 \u09DC-\u09DD
        \u09DF-\u09E1 \u09F0-\u09F1 \u09F4-\u09FA \u0A05-\u0A0A \u0A0F-\u0A10
        \u0A13-\u0A28 \u0A2A-\u0A30 \u0A32-\u0A33 \u0A35-\u0A36 \u0A38-\u0A39
        \u0A59-\u0A5C \u0A5E \u0A72-\u0A74 \u0A85-\u0A8B \u0A8D \u0A8F-\u0A91
        \u0A93-\u0AA8 \u0AAA-\u0AB0 \u0AB2-\u0AB3 \u0AB5-\u0AB9 \u0ABD
        \u0AD0 \u0AE0 \u0B05-\u0B0C \u0B0F-\u0B10 \u0B13-\u0B28 \u0B2A-\u0B30
        \u0B32-\u0B33 \u0B36-\u0B39 \u0B3D \u0B5C-\u0B5D \u0B5F-\u0B61
        \u0B70 \u0B83 \u0B85-\u0B8A \u0B8E-\u0B90 \u0B92-\u0B95 \u0B99-\u0B9A
        \u0B9C \u0B9E-\u0B9F \u0BA3-\u0BA4 \u0BA8-\u0BAA \u0BAE-\u0BB5
        \u0BB7-\u0BB9 \u0BF0-\u0BF2 \u0C05-\u0C0C \u0C0E-\u0C10 \u0C12-\u0C28
        \u0C2A-\u0C33 \u0C35-\u0C39 \u0C60-\u0C61 \u0C85-\u0C8C \u0C8E-\u0C90
        \u0C92-\u0CA8 \u0CAA-\u0CB3 \u0CB5-\u0CB9 \u0CDE \u0CE0-\u0CE1
        \u0D05-\u0D0C \u0D0E-\u0D10 \u0D12-\u0D28 \u0D2A-\u0D39 \u0D60-\u0D61
        \u0D85-\u0D96 \u0D9A-\u0DB1 \u0DB3-\u0DBB \u0DBD \u0DC0-\u0DC6
        \u0DF4 \u0E4F \u0F00-\u0F0A \u0F0D-\u0F17 \u0F1A-\u0F1F \u0F2A-\u0F34
        \u0F36 \u0F38 \u0F40-\u0F47 \u0F49-\u0F6A \u0F85 \u0F88-\u0F8B
        \u0FBE-\u0FC5 \u0FC7-\u0FCC \u0FCF \u104A-\u104F \u10A0-\u10C5
        \u10D0-\u10F8 \u10FB \u1200-\u1206 \u1208-\u1246 \u1248 \u124A-\u124D
        \u1250-\u1256 \u1258 \u125A-\u125D \u1260-\u1286 \u1288 \u128A-\u128D
        \u1290-\u12AE \u12B0 \u12B2-\u12B5 \u12B8-\u12BE \u12C0 \u12C2-\u12C5
        \u12C8-\u12CE \u12D0-\u12D6 \u12D8-\u12EE \u12F0-\u130E \u1310
        \u1312-\u1315 \u1318-\u131E \u1320-\u1346 \u1348-\u135A \u1362-\u1368
        \u1372-\u137C \u13A0-\u13F4 \u1401-\u1676 \u1681-\u169A \u16A0-\u16F0
        \u1700-\u170C \u170E-\u1711 \u1720-\u1731 \u1735-\u1736 \u1740-\u1751
        \u1760-\u176C \u176E-\u1770 \u17DC \u1800-\u1805 \u1807-\u180A
        \u1820-\u1877 \u1880-\u18A8 \u1E00-\u1E9B \u1EA0-\u1EF9 \u1F00-\u1F15
        \u1F18-\u1F1D \u1F20-\u1F45 \u1F48-\u1F4D \u1F50-\u1F57 \u1F59
        \u1F5B \u1F5D \u1F5F-\u1F7D \u1F80-\u1FB4 \u1FB6-\u1FC4 \u1FC6-\u1FD3
        \u1FD6-\u1FDB \u1FDD-\u1FEF \u1FF2-\u1FF4 \u1FF6-\u1FFE \u2017
        \u2022-\u2023 \u2038 \u203D-\u2043 \u2047-\u2052 \u2057 \u2061-\u2063
        \u2070-\u2071 \u2075-\u207C \u2080 \u2085-\u208C \u2100-\u2102
        \u2104 \u2106-\u2108 \u210A-\u2112 \u2114-\u2115 \u2117-\u2120
        \u2123-\u2125 \u2127-\u212A \u212C-\u213A \u213D-\u213F \u2141-\u214B
        \u2153 \u2156-\u215A \u215C-\u215D \u215F \u216C-\u216F \u217A-\u2183
        \u219A-\u21D1 \u21D3 \u21D5-\u21FF \u2201 \u2204-\u2206 \u2209-\u220A
        \u220C-\u220E \u2210 \u2214 \u2216-\u2219 \u221B-\u221C \u2221-\u2222
        \u2224 \u2226 \u222D \u222F-\u2233 \u2238-\u223B \u223E-\u2247
        \u2249-\u224B \u224D-\u2251 \u2253-\u225F \u2262-\u2263 \u2268-\u2269
        \u226C-\u226D \u2270-\u2281 \u2284-\u2285 \u2288-\u2294 \u2296-\u2298
        \u229A-\u22A4 \u22A6-\u22BE \u22C0-\u2311 \u2313-\u2328 \u232B-\u23B3
        \u23B7-\u23CE \u2400-\u2426 \u2440-\u244A \u24C0-\u24CF \u24EA
        \u254C-\u254F \u2575-\u257F \u2590-\u2591 \u2596-\u259F \u25A2
        \u25AA-\u25B1 \u25B4-\u25B5 \u25B8-\u25BB \u25BE-\u25BF \u25C2-\u25C5
        \u25C9-\u25CA \u25CC-\u25CD \u25D2-\u25E1 \u25E6-\u25EE \u25F0-\u2604
        \u2607-\u2608 \u260A-\u260D \u2610-\u2613 \u2619-\u261B \u261D
        \u261F-\u263F \u2641 \u2643-\u265F \u2662 \u2666 \u266B \u266E
        \u2670-\u267D \u2680-\u2689 \u2701-\u2704 \u2706-\u2709 \u270C-\u2727
        \u2729-\u274B \u274D \u274F-\u2752 \u2756 \u2758-\u275A \u2761
        \u2764-\u2767 \u2776-\u2794 \u2798-\u27AF \u27B1-\u27BE \u27D0-\u27E5
        \u27F0-\u2982 \u2999-\u29D7 \u29DC-\u29FB \u29FE-\u2AFF \uFB00-\uFB06
        \uFB13-\uFB17 \uFB1D \uFB1F-\uFB36 \uFB38-\uFB3C \uFB3E \uFB40-\uFB41
        \uFB43-\uFB44 \uFB46-\uFBB1 \uFBD3-\uFD3D \uFD50-\uFD8F \uFD92-\uFDC7
        \uFDF0-\uFDFB \uFE70-\uFE74 \uFE76-\uFEFC \uFF66 \uFF71-\uFF9D
        \uFFA0-\uFFBE \uFFC2-\uFFC7 \uFFCA-\uFFCF \uFFD2-\uFFD7 \uFFDA-\uFFDC
        \uFFE8-\uFFEE \U00010300-\U0001031E \U00010320-\U00010323 \U00010330-\U0001034A
        \U00010400-\U00010425 \U00010428-\U0001044D \U0001D000-\U0001D0F5
        \U0001D100-\U0001D126 \U0001D12A-\U0001D164 \U0001D16A-\U0001D16C
        \U0001D183-\U0001D184 \U0001D18C-\U0001D1A9 \U0001D1AE-\U0001D1DD
        \U0001D400-\U0001D454 \U0001D456-\U0001D49C \U0001D49E-\U0001D49F
        \U0001D4A2 \U0001D4A5-\U0001D4A6 \U0001D4A9-\U0001D4AC \U0001D4AE-\U0001D4B9
        \U0001D4BB \U0001D4BD-\U0001D4C0 \U0001D4C2-\U0001D4C3 \U0001D4C5-\U0001D505
        \U0001D507-\U0001D50A \U0001D50D-\U0001D514 \U0001D516-\U0001D51C
        \U0001D51E-\U0001D539 \U0001D53B-\U0001D53E \U0001D540-\U0001D544
        \U0001D546 \U0001D54A-\U0001D550 \U0001D552-\U0001D6A3 \U0001D6A8-\U0001D7C9];

$OP = [ \u0028 \u005B \u007B \u0F3A \u0F3C \u169B \u201A \u201E \u2045 \u207D
        \u208D \u2329 \u23B4 \u2768 \u276A \u276C \u276E \u2770 \u2772
        \u2774 \u27E6 \u27E8 \u27EA \u2983 \u2985 \u2987 \u2989 \u298B
        \u298D \u298F \u2991 \u2993 \u2995 \u2997 \u29D8 \u29DA \u29FC
        \u3008 \u300A \u300C \u300E \u3010 \u3014 \u3016 \u3018 \u301A
        \u301D \uFD3E \uFE35 \uFE37 \uFE39 \uFE3B \uFE3D \uFE3F \uFE41
        \uFE43 \uFE59 \uFE5B \uFE5D \uFF08 \uFF3B \uFF5B \uFF5F \uFF62];

$BK = [ \u000C \u2028-\u2029];

$PO = [ \u0025 \u00A2 \u00B0 \u2030-\u2037 \u20A7 \u2103 \u2109 \u2126 \uFDFC
        \uFE6A \uFF05 \uFFE0];

$NS = [ \u0E5A-\u0E5B \u17D4 \u17D6-\u17DA \u203C \u2044 \u3005 \u301C \u303B-\u303C
        \u3041 \u3043 \u3045 \u3047 \u3049 \u3063 \u3083 \u3085 \u3087
        \u308E \u3095-\u3096 \u309B-\u309E \u30A0-\u30A1 \u30A3 \u30A5
        \u30A7 \u30A9 \u30C3 \u30E3 \u30E5 \u30E7 \u30EE \u30F5-\u30F6
        \u30FB \u30FD \u31F0-\u31FF \uFE54-\uFE55 \uFF1A-\uFF1B \uFF65
        \uFF67-\uFF70 \uFF9E-\uFF9F];

$CL = [ \u0029 \u005D \u007D \u0F3B \u0F3D \u169C \u2046 \u207E \u208E \u232A
        \u23B5 \u2769 \u276B \u276D \u276F \u2771 \u2773 \u2775 \u27E7
        \u27E9 \u27EB \u2984 \u2986 \u2988 \u298A \u298C \u298E \u2990
        \u2992 \u2994 \u2996 \u2998 \u29D9 \u29DB \u29FD \u3001-\u3002
        \u3009 \u300B \u300D \u300F \u3011 \u3015 \u3017 \u3019 \u301B
        \u301E-\u301F \uFD3F \uFE36 \uFE38 \uFE3A \uFE3C \uFE3E \uFE40
        \uFE42 \uFE44 \uFE50 \uFE52 \uFE5A \uFE5C \uFE5E \uFF09 \uFF0C
        \uFF0E \uFF3D \uFF5D \uFF60-\uFF61 \uFF63-\uFF64];

$NU = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
        \u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
        \u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
        \u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];

$CM = [ \u0000-\u0008 \u000B \u000E-\u001F \u007F-\u009F \u0300-\u034F \u0360-\u036F
        \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD
        \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06E4
        \u06E7-\u06E8 \u06EA-\u06ED \u070F \u0711 \u0730-\u074A \u07A6-\u07B0
        \u0901-\u0903 \u093C \u093E-\u094D \u0951-\u0954 \u0962-\u0963
        \u0981-\u0983 \u09BC \u09BE-\u09C4 \u09C7-\u09C8 \u09CB-\u09CD
        \u09D7 \u09E2-\u09E3 \u0A02 \u0A3C \u0A3E-\u0A42 \u0A47-\u0A48
        \u0A4B-\u0A4D \u0A70-\u0A71 \u0A81-\u0A83 \u0ABC \u0ABE-\u0AC5
        \u0AC7-\u0AC9 \u0ACB-\u0ACD \u0B01-\u0B03 \u0B3C \u0B3E-\u0B43
        \u0B47-\u0B48 \u0B4B-\u0B4D \u0B56-\u0B57 \u0B82 \u0BBE-\u0BC2
        \u0BC6-\u0BC8 \u0BCA-\u0BCD \u0BD7 \u0C01-\u0C03 \u0C3E-\u0C44
        \u0C46-\u0C48 \u0C4A-\u0C4D \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE-\u0CC4
        \u0CC6-\u0CC8 \u0CCA-\u0CCD \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D43
        \u0D46-\u0D48 \u0D4A-\u0D4D \u0D57 \u0D82-\u0D83 \u0DCA \u0DCF-\u0DD4
        \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E3A \u0E47-\u0E4E
        \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
        \u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F84 \u0F86-\u0F87
        \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C-\u1032 \u1036-\u1039
        \u1056-\u1059 \u1160-\u11A2 \u11A8-\u11F9 \u1712-\u1714 \u1732-\u1734
        \u1752-\u1753 \u1772-\u1773 \u17B4-\u17D3 \u180B-\u180E \u18A9
        \u200C-\u200F \u202A-\u202E \u206A-\u206F \u20D0-\u20EA \u302A-\u302F
        \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFFF9-\uFFFB
        \U0001D165-\U0001D169 \U0001D16D-\U0001D182 \U0001D185-\U0001D18B
        \U0001D1AA-\U0001D1AD \U000E0001 \U000E0020-\U000E007F];

$PR = [ \u0024 \u002B \u005C \u00A3-\u00A5 \u00B1 \u09F2-\u09F3 \u0E3F \u17DB
        \u20A0-\u20A6 \u20A8-\u20B1 \u2116 \u2212-\u2213 \uFE69 \uFF04
        \uFFE1 \uFFE5-\uFFE6];

$B2 = [ \u2014];

$ID = [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
        \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
        \u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
        \u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
        \u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
        \u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
        \u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
        \u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
        \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
        \uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
        \uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
        \uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
        \uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
        \U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];

$SP = [ \u0020];

$QU = [ \u0022 \u0027 \u00AB \u00BB \u2018-\u2019 \u201B-\u201D \u201F \u2039-\u203A
        \u23B6 \u275B-\u275E];

$CR = [ \u000D];

$GL = [ \u00A0 \u0F0C \u2007 \u2011 \u202F \u2060 \uFEFF];

############  End of Script-Generated Definitions   #######################



#
#  Thai Dictionary related definitions and rules
#

# $dictionary covers the listed Thai ranges.  The ranges deliberately leave out
# U+0E2F and U+0E46, which get their own single-character classes below.
# NOTE(review): presumably these are the characters handled through the Thai
# dictionary (thaidict.brk) - confirm.
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e];  # this rule breaks the iterator with mixed Thai and English
$paiyannoi  = [\u0e2f];
$maiyamok   = [\u0e46];
# $thai_etc: the three-character sequence paiyannoi, U+0E25, paiyannoi.
$thai_etc   = $paiyannoi \u0e25 $paiyannoi;




#
#  Character classes from TR 29.  Needed for finding characters.
#
#  $Extend is all combining characters, and none of the other cruft that
#          TR14 puts into $CM, which is its concept of combining marks.
#
$Extend     =   # From UNIDATA/DerivedCoreProperties.txt
	[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
	\u05BB-\u05BD \u05BF   \u05C1-\u05C2 \u05C4   \u064B-\u0655 \u0670   \u06D6-\u06DC
	\u06DE   \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711   \u0730-\u074A
	\u07A6-\u07B0 \u0901-\u0902 \u0903   \u093C   \u093E-\u0940 \u0941-\u0948
	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981   \u0982-\u0983 \u09BC
	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7   \u09E2-\u09E3
	\u0A02   \u0A3C   \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83   \u0ABC   \u0ABE-\u0AC0 \u0AC1-\u0AC5
	\u0AC7-\u0AC8 \u0AC9   \u0ACB-\u0ACC \u0B01   \u0B02-\u0B03 \u0B3C   \u0B3E
	\u0B3F   \u0B40   \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56   \u0B57
	\u0B82   \u0BBE-\u0BBF \u0BC0   \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
	\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE   \u0CBF   \u0CC0-\u0CC4 \u0CC6
	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC   \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57   \u0D82-\u0D83 \u0DCF-\u0DD1
	\u0DD2-\u0DD4 \u0DD6   \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31   \u0E34-\u0E39
	\u0E47-\u0E4E \u0EB1   \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
	\u0F35   \u0F37   \u0F39   \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F   \u0F80-\u0F84
	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6   \u102C   \u102D-\u1030 \u1031
	\u1032   \u1036-\u1037 \u1038   \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
	\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
	\u17BE-\u17C5 \u17C6   \u17C7-\u17C8 \u17C9-\u17D1 \u17D3   \u180B-\u180D
	\u18A9   \u20D0-\u20DC \u20DD-\u20E0 \u20E1   \u20E2-\u20E4 \u20E5-\u20EA
	\u302A-\u302F \u3099-\u309A \uFB1E   \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
	\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 
	\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];


#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width) and
#                               SA  (South East Asian: Thai, Lao, Khmer) as $AL  (Alphabetic)
#
# Only the part of $SA that is NOT in $dictionary is folded into the
# alphabetic class; $dictionary characters are matched separately.
$ALPlus = $AL | $AI | [$SA - $dictionary];

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
#                     TODO:  This is going to produce some odd results, because of the non-combining
#                            chars that are included in $CM.  Use $Extend instead, where possible.
#
$ALcm = $ALPlus $CM*;
$IDcm = $ID $CM*;
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$SPcm = $SP $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;


#  New Lines.  Always break after, never break before.
#              Rule LB 3
#
#  Endings.    NewLine or Zero Width Space, or both.  Rules 4, 5
#              Because we never break before these things, $Endings
#              appears at the end of line break rule.
#
# $NLF: one hard line ending - BK, CR, LF, or the two-character sequence CR LF.
$NLF = $BK | $CR | $LF | $CR $LF;
# $Endings: every part is optional, so it can also match nothing at all.
$Endings = $SPcm* $ZW* $NLF?;
# $EndingsMandatory: like $Endings but never empty - it requires either an $NLF
# (after optional spaces) or a $ZW (after optional spaces, with an optional $NLF).
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;


#
#  Openings  Sequences that can precede Words, and that should not be separated from them.
#            Rules LB 9, 10
#
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;

#
#  Closings  Sequences that follow words, and that should not be separated from them,
#            Rule LB 8, 11, 15
$Closings =  ($SPcm*( ($CL ($SPcm* $NScm)?  |  $EX  | $IS  | $SY) $Extend*) | $BAcm | $HYcm  | $NScm | $maiyamok)*;

#
#  Words.  Includes mixed Alpha-numerics.
#          Rules 11a, 16, 17, 19, more or less.
#
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;  
$Number         =  $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?;   # Fancy Number     18 
$Word           = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?));       # Alpha-numeric.   16, 17 
$Dashes         = (($B2cm $SPcm*)*);                                    # Dashes           11a   
# A Thai range is a run of $dictionary characters, or the $thai_etc sequence.
$ThaiRange      = $dictionary+ | $thai_etc;
# Any one of the word-like units that $Word15 may wrap.
$WordLikeThing  = $Number | $Word | $Dashes | $ThaiRange;
        


        
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) |     # Rule 15. Stuff sticks around words.
          [^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend*  |                 # Allow characters that don't meet the
          [^$BK $CR $LF $ZW $SP $GL ];                                  #  more elaborate definitions for WORD
                                                                    #  to be glued.
        
$GluedWord  = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*;  # "Glue" will stick anything below it together.
                                                                    # Rules 13, 14

#
#  The actual rules, a combination of everything defined above.
#
# Rule 1: a unit that ends in a mandatory (non-empty) ending; an optional
# paiyannoi may sit between the closings and that ending.
$Openings $GluedWord  $Closings $paiyannoi? $EndingsMandatory;
# Rule 2: the general case, with no paiyannoi and a possibly empty ending.
$Openings $GluedWord  $Closings  $Endings;

# Rule 3: a unit that ends in paiyannoi, provided the following text is either
# one character that is neither U+0E25 nor $Extend, or U+0E25 followed by a
# character that is neither paiyannoi nor $Extend.
# NOTE(review): '/' presumably marks the break position, with the text after
# it used only as look-ahead context - confirm against the rule syntax docs.
$Openings $GluedWord  $Closings $paiyannoi   /  
               ([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
     
     
 #"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
 #                       + "\u0e25[^$paiyannoi$_ignore_]);"


#
#  Reverse Rules.
#
#     Back up to a hard break or a space that will cause a boundary.
#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
#     containing a space that may inhibit a break from occurring.
#
# $SpaceGlue: a space that does not end a line, in one of two shapes:
#   - one of ZW, CL, IS, NS or OP, then any $Extend characters and one space;
#   - one or more groups of (any $Extend characters plus a space), then OP.
# NOTE(review): presumably written in backward (reverse-text) order, like the
# reverse rule that uses it - confirm.
$SpaceGlue  = ([$ZW $CL $IS $NS $OP]  ($Extend* $SP)) | (($Extend* $SP)+ $OP);
# Any character that is neither a space nor a hard line ending (BK, CR, LF).
$ClumpingChars = [^$SP $BK $CR $LF];

# Reverse rule (leading '!'): two arbitrary characters, a run of clumping
# characters, any number of glued-space groups each followed by more clumping
# characters, then either any one character or the pair LF CR.
!. . $ClumpingChars*  ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);


--- NEW FILE: sent.txt ---
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt   
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on TR 29 version 4.0.0
#
    

#
# Character categories as defined in TR 29
#
# Separators: LF, CR, U+0085, U+2028 and U+2029.
$Sep     = [\u000a \u000d \u0085 \u2028 \u2029];
$Format  = [[:Format:]];
# Whitespace that is not one of the separators above.
$Sp      = [[:Whitespace:] - $Sep];
$Lower   = [[:Lowercase:]];
# Upper includes title-case letters as well as upper-case characters.
$Upper   = [[:TitleCase_Letter:] [:Uppercase:]];
# Other letters: alphabetic characters plus HEBREW PUNCTUATION GERESH, with
# everything already in $Lower or $Upper removed.
$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
$Numeric = [:LineBreak = Numeric:];

$ATerm = [.];  

$Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
          \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 
          \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
          
# Close: opening punctuation, closing punctuation and LineBreak=Quotation
# characters, minus HEBREW PUNCTUATION GERESH and anything in $ATerm or $Term.
$Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
           [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
           
           

# Define extended forms of the character classes,
#   incorporate grapheme cluster + format chars.

$Extend     = [[:Grapheme_Extend = TRUE:]]; 
# Each extended form is the base character followed by any $Extend characters
# and then any $Format characters, in that order.
$ATermEx    = $ATerm   $Extend* $Format*;
$NumericEx  = $Numeric $Extend* $Format*;
$UpperEx    = $Upper   $Extend* $Format*;
$TermEx     = $Term    $Extend* $Format*;

#
#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
#
$SepSeq  = $Sep | \u000d\u000a;

# $InteriorChars are those that never trigger a following break.
$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars


# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;


# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;

# Rule 8   Matches a sentence fragment containing "." that should not cause a sentence break,
#          because a lower case word follows the period.
$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;

# Rules 3, 9, 10, 11
#                       Matches a simple sentence, or the trailing part of a complex sentence,
#                       where a simple sentence contains no interior "."s.
# First alternative: interior characters, a terminator ($TermEx or $ATermEx),
# then any closing punctuation, any spaces and an optional separator sequence.
# Second alternative: no terminator at all - interior characters followed by
# an optional separator sequence.
$EndSequence       = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? |
                     $InteriorChars* $SepSeq?;



# Put them all together.  
# The single forward rule: any number of non-breaking fragments (a period
# followed by a number, surrounded by uppers, or followed by a lower-case
# word), then one end sequence.
($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence;

     
#
#  Reverse Rules
#
# $EndGorp: any one character that can appear at the end of a sentence -
# terminators, separators, closing punctuation, extend/format chars, spaces.
$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
# The three $Reverse... definitions list the pieces of the corresponding
# forward fragments in the opposite order.
# NOTE(review): presumably because reverse rules are matched while moving
# backward through the text - confirm.
$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;

# Reverse rule (leading '!'): one reversed end sequence, any number of
# reversed non-breaking fragments, then at most one further character.
! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
#! .*;
 

--- NEW FILE: thaidict.brk ---
(This appears to be a binary file; contents omitted.)

--- NEW FILE: title.txt ---
# Copyright (c) 2002-2003, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  Title Casing Break Rules
#

# Case-ignorable: general categories Mn, Me, Cf, Lm and Sk, plus the three
# explicit characters U+0027, U+00AD and U+2019.
$CaseIgnorable   = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
# Cased: upper-case, lower-case and title-case (Lt) characters that are not
# case-ignorable.
$Cased           = [[:Upper_Case:][:Lower_Case:][:Lt:]  - $CaseIgnorable];
# Everything outside $Cased.  Because $Cased excludes the case-ignorable
# characters, $NotCased contains all of $CaseIgnorable.
$NotCased        = [^ $Cased];

#
#  If the iterator was not stopped on a cased character, advance it to the first cased char
#
($NotCased | $CaseIgnorable)*;

#
#  If the iterator starts on a cased item, advance through all adjacent cased items plus
#    any non-cased stuff, to reach the start of the next word.
#
$Cased ($Cased | $CaseIgnorable)* $NotCased*;

#
#  Reverse Rules
#
# Reverse rule (leading '!'): any not-cased characters, then any run of cased
# or case-ignorable characters, then at most one more not-cased character.
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased?;


--- NEW FILE: word.txt ---
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  word.txt   
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on Version 4.0.0, dated 2003-04-17
#



####################################################################################
#
#  Character class definitions from TR 29
#
####################################################################################
# Katakana script characters plus the four sound marks listed by name.
$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] 
                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];


# Alphabetic characters plus HEBREW PUNCTUATION GERESH, with ideographs, $Katakana,
#   and the Thai, Lao and Hiragana scripts removed.
$ALetter   = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] 
                           - [:Ideographic:]
                           - $Katakana
                           - [:Script = Thai:]
                           - [:Script = Lao:]
                           - [:Script = Hiragana:]];
                           
# Punctuation allowed between letters (used by $LetterSequence).
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]  [:name = HEBREW PUNCTUATION GERSHAYIM:]
              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];  
              
# Punctuation allowed both between letters and between digits.
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];

# Line-break class Infix_Numeric, minus the characters already in $MidNumLet.
$MidNum    = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
# Line-break class Numeric.
$Numeric   = [:LineBreak = Numeric:];


#
#  Character Class Definitions.
#    The names are those from TR29.
#
#    $CR, $LF  - U+000D and U+000A; kept together by the "$CR $LF" rule below.
#    $Control  - general categories Zl, Zp, Cc and Cf.
#    $Extend   - characters with the Grapheme_Extend property.
#
$CR         = \u000d;
$LF         = \u000a;
$Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend     = [[:Grapheme_Extend = TRUE:]]; 




####################################################################################
#
#  Word Break Rules.    Definitions and Rules specific to word break begin Here. 
#
####################################################################################

# Format characters: general category Cf.
$Format    = [[:Cf:]];



# Rule 3:  Treat a grapheme cluster as if it were a single character.
#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
#          because we don't need to find the boundaries between adjacent syllables -
#          they won't be word boundaries.
#


#
#  "Extended"  definitions.  Each base class followed by any number of $Extend characters,
#                            so that the whole sequence is treated like the base char.
#                            Format chars are handled separately, through $FormatEx in the rules.
#
$ALetterEx    = $ALetter   $Extend*; 
$NumericEx    = $Numeric   $Extend*;
$MidNumEx     = $MidNum    $Extend*;
$MidNumLetEx  = $MidNumLet $Extend*;
$MidLetterEx  = $MidLetter $Extend*;
$KatakanaEx   = $Katakana  $Extend*;
$FormatEx     = $Format    $Extend*;


#
#  Numbers.  Rules 8, 11, 12 from the TR.
#     One or more $NumericEx items; between two of them there may be at most one
#     $MidNumEx or $MidNumLetEx, with any number of $FormatEx items on either side of it.
#     The rule is tagged with the value 100.
#
$NumberSequence = $NumericEx ($FormatEx* ($MidNumEx | $MidNumLetEx)? $FormatEx* $NumericEx)*;
$NumberSequence {100};

#
#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
#     - must include at least one letter. 
#     - may include both letters and numbers.
#     - may include  MidLetter, MidNumber punctuation.
#     The word rule is tagged with the value 200.
#
$LetterSequence = $ALetterEx ($FormatEx* ($MidLetterEx | $MidNumLetEx)? $FormatEx* $ALetterEx)*;     # rules #6, #7
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};

#
#  Do not break between Katakana.   Rule #13.
#     A run of $KatakanaEx items, optionally separated by $FormatEx items, is one unit.
#     A Hiragana character matches only together with its trailing $Extend characters.
#     Both rules are tagged with the value 300.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
#  Ideographic Characters.  Stand by themselves as words.
#                           Separated from the "Everything Else" rule, below, only so that they
#                           can be tagged with a return value.   TODO:  is this what we want?
#                           The tag value used here is 400.
#
[:IDEOGRAPHIC:] $Extend* {400};

#
#  Everything Else, with no tag.
#                   Non-Control chars combine with $Extend (combining) chars.
#                   Controls do not.
#                   The set below excludes both $Control and ideographs.
#                   A CR followed by an LF is kept together as one unit.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up too far,
#                   but must back up at least enough, and must stop on a boundary.)
#

# NonStarters are the set of all characters that can appear at the 2nd - nth position of
#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
#    reaches something that can only be the start (and probably only) char in a "word".
#    A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];

# The active reverse rule: any run of $NonStarters, or an LF CR pair (written in the order
#    met while moving backwards), followed by one more character of any kind.
#    The commented-out "!.*;" line is an unused alternative.
#!.*;
! ($NonStarters* | \n \r) .;

--- NEW FILE: word_th.txt ---
# Copyright (c) 2002, International Business Machines Corporation and
# others. All Rights Reserved.
#
#  word_th.txt    Word Breaking Rules for ICU Rules Based Break Iterator.
#


# Letters (general category L) that belong to the Hiragana / Katakana scripts.
$Hiragana = [[:L:] & [:Hira:]];
$Katakana = [[:L:] & [:Kana:]];

#
#  Definition of $Ideographic is from TR14, Line Breaking.
#  The set is written out as explicit code point ranges; the last line uses
#  \U escapes for ranges outside the Basic Multilingual Plane.
#
$Ideographic = 
      [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
        \u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
        \u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
        \u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
        \u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
        \u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
        \u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
        \u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
        \u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
        \uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
        \uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
        \uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
        \uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
        \U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];

#
# These definitions are from the character break rules.
#
#   $CGJ        - the single character U+034F.
#   $Link       - an explicit list of code points, used by $LinkSequence below.
#   $NotControl - everything outside general categories Zl, Zp and Cc.
#
$CGJ = [\u034f];   #Combining Grapheme Joiner
$Link       = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2]; 
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]];  #Line Separator,
                                        #Paragraph Separator,
                                        # General Category == Control
# Combining / extending characters, written out as explicit code point ranges.
# NOTE(review): this is a hard-coded snapshot; it may not match the Grapheme_Extend
#               property of the Unicode version in use - confirm before relying on it.
$Extend     =   # From UNIDATA/DerivedCoreProperties.txt
	[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
	\u05BB-\u05BD \u05BF   \u05C1-\u05C2 \u05C4   \u064B-\u0655 \u0670   \u06D6-\u06DC
	\u06DE   \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711   \u0730-\u074A
	\u07A6-\u07B0 \u0901-\u0902 \u0903   \u093C   \u093E-\u0940 \u0941-\u0948
	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981   \u0982-\u0983 \u09BC
	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7   \u09E2-\u09E3
	\u0A02   \u0A3C   \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83   \u0ABC   \u0ABE-\u0AC0 \u0AC1-\u0AC5
	\u0AC7-\u0AC8 \u0AC9   \u0ACB-\u0ACC \u0B01   \u0B02-\u0B03 \u0B3C   \u0B3E
	\u0B3F   \u0B40   \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56   \u0B57
	\u0B82   \u0BBE-\u0BBF \u0BC0   \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
	\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE   \u0CBF   \u0CC0-\u0CC4 \u0CC6
	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC   \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57   \u0D82-\u0D83 \u0DCF-\u0DD1
	\u0DD2-\u0DD4 \u0DD6   \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31   \u0E34-\u0E39
	\u0E47-\u0E4E \u0EB1   \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
	\u0F35   \u0F37   \u0F39   \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F   \u0F80-\u0F84
	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6   \u102C   \u102D-\u1030 \u1031
	\u1032   \u1036-\u1037 \u1038   \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
	\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
	\u17BE-\u17C5 \u17C6   \u17C7-\u17C8 \u17C9-\u17D1 \u17D3   \u180B-\u180D
	\u18A9   \u20D0-\u20DC \u20DD-\u20E0 \u20E1   \u20E2-\u20E4 \u20E5-\u20EA
	\u302A-\u302F \u3099-\u309A \uFB1E   \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
	\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 
	\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];

#
#  Korean, also taken from character break rules.
#
#
# Korean Syllable Sequences
#
#   $L, $V, $T  - jamo ranges, given as explicit code point ranges.
#   $LV         - the precomposed syllables of Hangul_Syllable_Type LV, i.e. every 28th
#                 code point from U+AC00 through U+D788 (399 characters).  Expressed with
#                 the Unicode property rather than an enumerated list of code points.
#   $LVT        - every other precomposed syllable in U+AC00..U+D7A3.
#
$L  = [\u1100-\u115f];
$V  = [\u1160-\u11a2];
$T  = [\u11a8-\u11f9];
$LV = [:Hangul_Syllable_Type = LV:];
$LVT = [[\uac00-\ud7a3] - $LV];

# A Hangul sequence is either
#   - leading jamo and/or one LV syllable, followed by any $V and then any $T, or
#   - any leading jamo, one LVT syllable, then any $T.
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);


#
#  Thai Dictionary Related Rules
#
#    $dictionary - the Thai code point ranges listed below; U+0E2F and U+0E46 are left
#                  out of it and defined separately as $paiyannoi and $maiyamok.
#    $thai_etc   - the three-character sequence U+0E2F U+0E25 U+0E2F.
#
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
$paiyannoi  = [\u0e2f];
$maiyamok   = [\u0e46];
$thai_etc   = $paiyannoi \u0e25 $paiyannoi;


# A run of dictionary characters, optionally followed by $maiyamok (itself optionally
#   preceded by $paiyannoi).
$dictionary+ ($paiyannoi? $maiyamok)?;
# A run of dictionary characters followed by $paiyannoi.  The part after '/' is context
#   that must follow: it rules out the cases where $paiyannoi would start $thai_etc or
#   be followed by $maiyamok or an $Extend character.
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
# The $thai_etc sequence on its own.
$thai_etc;


#
#  Definitions for building up Letters, so that breaks will not occur
#    within a single letter (Grapheme Cluster).  See the character break rules.
#
#    $LineBreak  - union of $Ideographic, $Hiragana and $Katakana.
#    $Letter     - general categories L and Sk, excluding $LineBreak and $dictionary.
#    $MidLetter  - characters allowed between letters in The Big Rule.  The commented-out
#                  line is the same set without U+003A.
#                  NOTE(review): U+0029 is RIGHT PARENTHESIS - confirm it is intended here.
#
$LineBreak       = [$Ideographic $Hiragana $Katakana];
$Letter          = [[[:L:] [:Sk:]] & [^$LineBreak $dictionary]];
#$MidLetter      = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
$MidLetter       = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];

# Anything that is not a control, format, surrogate, private-use, unassigned or separator
#   (Zl, Zp) character, and not in $Extend, $Link or $CGJ.
$Base            = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
$LetterBase      = [:L:];
# One or more $CGJ followed by a base character or a Hangul sequence.
$CGJSequence     = $CGJ+ ($Base | $Hangul_Sequence);
# The two Join_Control characters.  U+200E (LEFT-TO-RIGHT MARK) is not a join control
#   and must not be part of this range.
$Join_Control    = [\u200c-\u200d];        # Zero Width Non-Joiner (U+200C), Zero Width Joiner (U+200D)
# One or more $Link, optional $Extend, an optional join control, then a letter.
$LinkSequence    = $Link+ $Extend* $Join_Control? $LetterBase;
# A letter or Hangul sequence with its $Extend characters, continued by any number of
#   link or CGJ sequences (each with its own $Extend characters).
$LetterEx        = ($Letter | $Hangul_Sequence) $Extend*  ((($LinkSequence | $CGJSequence) $Extend*)*); 



#
#  Numeric Definitions
#  TODO:  More complete handling of $Extend combining chars.
#
#    $Numeric         - general category Nd.
#    $InfixNumeric    - U+002C, U+002E, U+003A, U+003B and U+0589.
#    $PostfixNumeric  - the explicit list below.
#    $PrefixNumeric   - general category Sc plus the listed characters, minus $PostfixNumeric.
#    $NumericPrefix   - a prefix character, a digit, and optionally one infix plus one more digit.
#    $NumericInterior - digits, with at most one infix character between two of them.
#
$Numeric         = [:Nd:];    #TODO  remove FULL WIDTH
$NumericEx       = $Numeric $Extend*;
$InfixNumeric    = [\u002c \u002e \u003a \u003b \u0589];
$PostfixNumeric  = [\%     \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
                    \u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
$PrefixNumeric   = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]]; 
              
$NumericPrefix   = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;


#
#  The Big Rule.  Gloms everything together.
#     - an optional $NumericPrefix,
#     - then any number of groups, each an optional run of $LetterEx items joined by
#       single $MidLetter characters, followed by an optional $NumericInterior,
#     - then an optional $NumericInterior followed by one $PostfixNumeric character.
#
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;

#
#  Lesser rules
#     Runs of Hiragana, runs of Katakana (each character with its $Extend characters),
#     a single non-control character with its $Extend characters, a CR LF pair,
#     and finally any single character.
#
($Hiragana $Extend*)*;
($Katakana $Extend*)*;
$NotControl $Extend*;
\r\n;
.;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up a bit too far,
#                   but must back up at least enough.)
#
# Any run of the characters that can take part in The Big Rule or in a Hangul sequence.
! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control |
   $CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend |
   $T | $V | $L | $LV | $LVT)*;
# Runs of Hiragana or Katakana, mixed with $Extend characters.
! ($Hiragana | $Extend)*;
! ($Katakana | $Extend)*;
# Any $Extend characters and then one more character of any kind.
! $Extend* .;
# An LF CR pair, written in the order met while moving backwards.
! \n\r;
#!.*;

# Any run of Thai characters handled by the Thai dictionary rules, including U+0E25.
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;